[llvm-branch-commits] [llvm] [AMDGPU] DPP wave reduction for long types - 2 (PR #189225)

via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Sun Mar 29 04:09:26 PDT 2026


https://github.com/easyonaadit created https://github.com/llvm/llvm-project/pull/189225

Supported Ops: `add`, `sub`

>From 0f973c85618ef5acba7ffd84a3c22cc37e38c347 Mon Sep 17 00:00:00 2001
From: Aaditya <Aaditya.AlokDeshpande at amd.com>
Date: Thu, 26 Mar 2026 16:23:41 +0530
Subject: [PATCH] [AMDGPU] DPP wave reduction for long types - 2

Supported Ops: `add`, `sub`
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |   93 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll  | 1221 ++++++++++++++--
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll  | 1259 +++++++++++++++--
 3 files changed, 2265 insertions(+), 308 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index dfec8aaf56767..4eb6bad007d59 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5736,6 +5736,8 @@ getDPPOpcForWaveReduction(unsigned Opc, const GCNSubtarget &ST) {
   case AMDGPU::V_CMP_LT_I64_e64: // min.i64
   case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
   case AMDGPU::V_CMP_GT_I64_e64: // max.i64
+  case AMDGPU::S_ADD_U64_PSEUDO:
+  case AMDGPU::S_SUB_U64_PSEUDO:
     DPPOpc = AMDGPU::V_MOV_B64_DPP_PSEUDO;
     break;
   default:
@@ -5745,7 +5747,10 @@ getDPPOpcForWaveReduction(unsigned Opc, const GCNSubtarget &ST) {
   if (!ST.getInstrInfo()->isVALU(Opc)) {
     if (Opc == AMDGPU::S_SUB_I32)
       ClampOpc = AMDGPU::S_ADD_I32;
-    ClampOpc = ST.getInstrInfo()->getVALUOp(ClampOpc);
+    if (Opc == AMDGPU::S_ADD_U64_PSEUDO || Opc == AMDGPU::S_SUB_U64_PSEUDO)
+      ClampOpc = AMDGPU::V_ADD_CO_U32_e64;
+    else
+      ClampOpc = ST.getInstrInfo()->getVALUOp(ClampOpc);
   }
   return {DPPOpc, ClampOpc};
 }
@@ -6301,40 +6306,80 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
           DPPInstr.addImm(SISrcMods::NONE); // src1 modifier
         if (!NeedsMovDPP)
           DPPInstr.addReg(Src); // src1
+        if (AMDGPU::getNamedOperandIdx(DPPOpc, AMDGPU::OpName::clamp) >= 0)
+          DPPInstr.addImm(0); // clamp
         DPPInstr
             .addImm(DPPCtrl) // dpp-ctrl
             .addImm(0xf)     // row-mask
             .addImm(0xf)     // bank-mask
             .addImm(0);      // bound-control
       };
-      auto BuildClampInstr = [&](Register Dst, Register Src0, Register Src1) {
-        auto ClampInstr = BuildMI(*CurrBB, MI, DL, TII->get(ClampOpc), Dst);
+      auto BuildClampInstr = [&](Register Dst, Register Src0, Register Src1,
+                                 bool isAddSub = false,
+                                 bool needsCarryIn = false,
+                                 Register CarryIn = Register()) {
+        unsigned InstrOpc = ClampOpc;
+        Register CarryOutReg = MRI.createVirtualRegister(WaveMaskRegClass);
+        if (needsCarryIn)
+          InstrOpc = AMDGPU::V_ADDC_U32_e64;
+        auto ClampInstr = BuildMI(*CurrBB, MI, DL, TII->get(InstrOpc), Dst);
         if (isFPOp)
           ClampInstr.addImm(SISrcMods::NONE); // src0 mod
+        if (isAddSub) {
+          if (needsCarryIn)
+            ClampInstr.addReg(CarryOutReg,
+                              RegState::Define |
+                                  RegState::Dead); // dead carry-out def
+          else
+            ClampInstr.addReg(CarryOutReg, RegState::Define); // carry-out reg
+        }
         ClampInstr.addReg(Src0);              // src0
         if (isFPOp)
           ClampInstr.addImm(SISrcMods::NONE); // src1 mod
         ClampInstr.addReg(Src1);              // src1
-        if (TII->hasIntClamp(*ClampInstr) || TII->hasFPClamp(*ClampInstr))
+        if (needsCarryIn)
+          ClampInstr.addReg(CarryIn, RegState::Kill); // carry-in reg
+        if (AMDGPU::getNamedOperandIdx(InstrOpc, AMDGPU::OpName::clamp) >= 0)
           ClampInstr.addImm(0); // clamp
         if (isFPOp)
           ClampInstr.addImm(0); // omod
         LastBcastInstr = ClampInstr;
+        return CarryOutReg;
       };
       auto BuildPostDPPInstr = [&](Register Src0, Register Src1) {
-        Register CmpMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
-        Register MinMaxResultReg = MRI.createVirtualRegister(SrcRegClass);
-        BuildMI(*CurrBB, MI, DL, TII->get(Opc), CmpMaskReg)
-            .addReg(Src0)  // src0
-            .addReg(Src1); // src1
-        LastBcastInstr =
-            BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B64_PSEUDO),
-                    MinMaxResultReg)
-                .addReg(Src1)        // src0
-                .addReg(Src0)        // src1
-                .addReg(CmpMaskReg); // src2
-        CurrBB = Expand64BitV_CND_MASK(*LastBcastInstr, CurrBB);
-        return MinMaxResultReg;
+        bool isAddSubOpc =
+            Opc == AMDGPU::S_ADD_U64_PSEUDO || Opc == AMDGPU::S_SUB_U64_PSEUDO;
+        Register ReturnReg = MRI.createVirtualRegister(SrcRegClass);
+        if (isAddSubOpc) {
+          Register ResLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+          Register ResHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+          MachineOperand Src0Operand =
+              MachineOperand::CreateReg(Src0, /*isDef=*/false);
+          MachineOperand Src1Operand =
+              MachineOperand::CreateReg(Src1, /*isDef=*/false);
+          auto [Src0Lo, Src0Hi] = ExtractSubRegs(MI, Src0Operand, SrcRegClass);
+          auto [Src1Lo, Src1Hi] = ExtractSubRegs(MI, Src1Operand, SrcRegClass);
+          Register CarryReg =
+              BuildClampInstr(ResLo, Src0Lo.getReg(), Src1Lo.getReg(),
+                              isAddSubOpc, /*needsCarryIn*/ false);
+          BuildClampInstr(ResHi, Src0Hi.getReg(), Src1Hi.getReg(), isAddSubOpc,
+                          /*needsCarryIn*/ isAddSubOpc ? true : false,
+                          CarryReg);
+          BuildRegSequence(*CurrBB, MI, ReturnReg, ResLo, ResHi);
+        } else {
+          Register CmpMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
+          BuildMI(*CurrBB, MI, DL, TII->get(Opc), CmpMaskReg)
+              .addReg(Src0)  // src0
+              .addReg(Src1); // src1
+          LastBcastInstr =
+              BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B64_PSEUDO),
+                      ReturnReg)
+                  .addReg(Src1)        // src0
+                  .addReg(Src0)        // src1
+                  .addReg(CmpMaskReg); // src2
+          CurrBB = Expand64BitV_CND_MASK(*LastBcastInstr, CurrBB);
+        }
+        return ReturnReg;
       };
 
       // Set inactive lanes to the identity value.
@@ -6549,14 +6594,22 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
         BuildRegSequence(*CurrBB, MI, ReducedValSGPR, LaneValueLoReg,
                          LaneValueHiReg);
       }
-      if (Opc == AMDGPU::S_SUB_I32)
+      if (Opc == AMDGPU::S_SUB_I32) {
         BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedReducedVal)
             .addImm(0)
             .addReg(ReducedValSGPR);
+      } else if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
+        auto NegatedValInstr =
+            BuildMI(*CurrBB, MI, DL, TII->get(Opc), NegatedReducedVal)
+                .addImm(0)
+                .addReg(ReducedValSGPR);
+        CurrBB = Expand64BitScalarArithmetic(*NegatedValInstr, CurrBB);
+      }
       // Mark the final result as a whole-wave-mode calculation.
       BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::STRICT_WWM), DstReg)
-          .addReg(Opc == AMDGPU::S_SUB_I32 ? NegatedReducedVal
-                                           : ReducedValSGPR);
+          .addReg(Opc == AMDGPU::S_SUB_I32 || Opc == AMDGPU::S_SUB_U64_PSEUDO
+                      ? NegatedReducedVal
+                      : ReducedValSGPR);
       RetBB = CurrBB;
     }
   }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
index d7fa9bc800634..b78b580d3ed59 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
@@ -1009,6 +1009,943 @@ entry:
   ret void
 }
 
+define void @divergent_value_dpp_i64(ptr addrspace(1) %out, i64 %in) {
+; GFX8DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX8DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
+; GFX8DAGISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
+; GFX8DAGISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
+; GFX8DAGISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
+; GFX8DAGISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
+; GFX8DAGISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
+; GFX8DAGISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8GISEL-LABEL: divergent_value_dpp_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX8GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
+; GFX8GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
+; GFX8GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
+; GFX8GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
+; GFX8GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
+; GFX8GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
+; GFX8GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8GISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX8GISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX9DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9DAGISEL-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9DAGISEL-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9DAGISEL-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9DAGISEL-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9DAGISEL-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9DAGISEL-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9GISEL-LABEL: divergent_value_dpp_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX9GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9GISEL-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9GISEL-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9GISEL-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9GISEL-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9GISEL-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9GISEL-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9GISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX9GISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1064DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX1064DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX1064DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_e32 v8, 32, v8
+; GFX1064DAGISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1064DAGISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064DAGISEL-NEXT:    v_mul_lo_u32 v8, 4, v8
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1064DAGISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1064DAGISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1064DAGISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064DAGISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1064DAGISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064DAGISEL-NEXT:    ds_permute_b32 v6, v8, v4
+; GFX1064DAGISEL-NEXT:    ds_permute_b32 v7, v8, v5
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1064DAGISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX1064DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT:    s_clause 0x4 ; 20-byte Folded Reload
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:16
+; GFX1064DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1064GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX1064GISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX1064GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX1064GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX1064GISEL-NEXT:    v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT:    v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_add_nc_u32_e32 v8, 32, v8
+; GFX1064GISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1064GISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064GISEL-NEXT:    v_mul_lo_u32 v8, 4, v8
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1064GISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1064GISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1064GISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064GISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1064GISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064GISEL-NEXT:    ds_permute_b32 v6, v8, v4
+; GFX1064GISEL-NEXT:    ds_permute_b32 v7, v8, v5
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1064GISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064GISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX1064GISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX1064GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX1064GISEL-NEXT:    s_clause 0x4 ; 20-byte Folded Reload
+; GFX1064GISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32
+; GFX1064GISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1064GISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1064GISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1064GISEL-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:16
+; GFX1064GISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1032DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX1032DAGISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX1032DAGISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s6
+; GFX1032DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s6
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1032DAGISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1032DAGISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1032DAGISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1032DAGISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX1032DAGISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1032DAGISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s4, v4, 31
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s5, v5, 31
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s6
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1032DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032DAGISEL-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX1032DAGISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
+; GFX1032DAGISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32
+; GFX1032DAGISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1032DAGISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1032DAGISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1032DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1032DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1032GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX1032GISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX1032GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s6
+; GFX1032GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s6
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1032GISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1032GISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1032GISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1032GISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX1032GISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1032GISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX1032GISEL-NEXT:    v_readlane_b32 s4, v4, 31
+; GFX1032GISEL-NEXT:    v_readlane_b32 s5, v5, 31
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s6
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1032GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032GISEL-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX1032GISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
+; GFX1032GISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32
+; GFX1032GISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1032GISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1032GISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1032GISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1032GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1164DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT:    s_clause 0x4 ; 20-byte Folded Spill
+; GFX1164DAGISEL-NEXT:    scratch_store_b32 off, v4, s32
+; GFX1164DAGISEL-NEXT:    scratch_store_b32 off, v5, s32 offset:4
+; GFX1164DAGISEL-NEXT:    scratch_store_b32 off, v6, s32 offset:8
+; GFX1164DAGISEL-NEXT:    scratch_store_b32 off, v7, s32 offset:12
+; GFX1164DAGISEL-NEXT:    scratch_store_b32 off, v8, s32 offset:16
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[0:1]
+; GFX1164DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s[0:1]
+; GFX1164DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_e32 v8, 32, v8
+; GFX1164DAGISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_mul_lo_u32 v8, 4, v8
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1164DAGISEL-NEXT:    s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1164DAGISEL-NEXT:    s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1164DAGISEL-NEXT:    s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164DAGISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1164DAGISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1164DAGISEL-NEXT:    s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164DAGISEL-NEXT:    ds_permute_b32 v6, v8, v4
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1164DAGISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164DAGISEL-NEXT:    ds_permute_b32 v7, v8, v5
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1164DAGISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1164DAGISEL-NEXT:    s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s2, v4, 63
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s3, v5, 63
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1164DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1164DAGISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT:    s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164DAGISEL-NEXT:    s_clause 0x4 ; 20-byte Folded Reload
+; GFX1164DAGISEL-NEXT:    scratch_load_b32 v4, off, s32
+; GFX1164DAGISEL-NEXT:    scratch_load_b32 v5, off, s32 offset:4
+; GFX1164DAGISEL-NEXT:    scratch_load_b32 v6, off, s32 offset:8
+; GFX1164DAGISEL-NEXT:    scratch_load_b32 v7, off, s32 offset:12
+; GFX1164DAGISEL-NEXT:    scratch_load_b32 v8, off, s32 offset:16
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1164GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT:    s_clause 0x4 ; 20-byte Folded Spill
+; GFX1164GISEL-NEXT:    scratch_store_b32 off, v4, s32
+; GFX1164GISEL-NEXT:    scratch_store_b32 off, v5, s32 offset:4
+; GFX1164GISEL-NEXT:    scratch_store_b32 off, v6, s32 offset:8
+; GFX1164GISEL-NEXT:    scratch_store_b32 off, v7, s32 offset:12
+; GFX1164GISEL-NEXT:    scratch_store_b32 off, v8, s32 offset:16
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[0:1]
+; GFX1164GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s[0:1]
+; GFX1164GISEL-NEXT:    v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT:    v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT:    v_add_nc_u32_e32 v8, 32, v8
+; GFX1164GISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_mul_lo_u32 v8, 4, v8
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1164GISEL-NEXT:    s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1164GISEL-NEXT:    s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1164GISEL-NEXT:    s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164GISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1164GISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1164GISEL-NEXT:    s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164GISEL-NEXT:    ds_permute_b32 v6, v8, v4
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1164GISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164GISEL-NEXT:    ds_permute_b32 v7, v8, v5
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1164GISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1164GISEL-NEXT:    s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s2, v4, 63
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164GISEL-NEXT:    v_readlane_b32 s3, v5, 63
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1164GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1164GISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT:    s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164GISEL-NEXT:    s_clause 0x4 ; 20-byte Folded Reload
+; GFX1164GISEL-NEXT:    scratch_load_b32 v4, off, s32
+; GFX1164GISEL-NEXT:    scratch_load_b32 v5, off, s32 offset:4
+; GFX1164GISEL-NEXT:    scratch_load_b32 v6, off, s32 offset:8
+; GFX1164GISEL-NEXT:    scratch_load_b32 v7, off, s32 offset:12
+; GFX1164GISEL-NEXT:    scratch_load_b32 v8, off, s32 offset:16
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Spill
+; GFX1132DAGISEL-NEXT:    scratch_store_b32 off, v4, s32
+; GFX1132DAGISEL-NEXT:    scratch_store_b32 off, v5, s32 offset:4
+; GFX1132DAGISEL-NEXT:    scratch_store_b32 off, v6, s32 offset:8
+; GFX1132DAGISEL-NEXT:    scratch_store_b32 off, v7, s32 offset:12
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s2
+; GFX1132DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1132DAGISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1132DAGISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX1132DAGISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1132DAGISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s0, v4, 31
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s1, v5, 31
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1132DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
+; GFX1132DAGISEL-NEXT:    scratch_load_b32 v4, off, s32
+; GFX1132DAGISEL-NEXT:    scratch_load_b32 v5, off, s32 offset:4
+; GFX1132DAGISEL-NEXT:    scratch_load_b32 v6, off, s32 offset:8
+; GFX1132DAGISEL-NEXT:    scratch_load_b32 v7, off, s32 offset:12
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1132GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Spill
+; GFX1132GISEL-NEXT:    scratch_store_b32 off, v4, s32
+; GFX1132GISEL-NEXT:    scratch_store_b32 off, v5, s32 offset:4
+; GFX1132GISEL-NEXT:    scratch_store_b32 off, v6, s32 offset:8
+; GFX1132GISEL-NEXT:    scratch_store_b32 off, v7, s32 offset:12
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s2
+; GFX1132GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s2
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1132GISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1132GISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX1132GISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1132GISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s0, v4, 31
+; GFX1132GISEL-NEXT:    v_readlane_b32 s1, v5, 31
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1132GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
+; GFX1132GISEL-NEXT:    scratch_load_b32 v4, off, s32
+; GFX1132GISEL-NEXT:    scratch_load_b32 v5, off, s32 offset:4
+; GFX1132GISEL-NEXT:    scratch_load_b32 v6, off, s32 offset:8
+; GFX1132GISEL-NEXT:    scratch_load_b32 v7, off, s32 offset:12
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX12DAGISEL:       ; %bb.0: ; %entry
+; GFX12DAGISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12DAGISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12DAGISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12DAGISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12DAGISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX12DAGISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Spill
+; GFX12DAGISEL-NEXT:    scratch_store_b32 off, v4, s32
+; GFX12DAGISEL-NEXT:    scratch_store_b32 off, v5, s32 offset:4
+; GFX12DAGISEL-NEXT:    scratch_store_b32 off, v6, s32 offset:8
+; GFX12DAGISEL-NEXT:    scratch_store_b32 off, v7, s32 offset:12
+; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX12DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s2
+; GFX12DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s2
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX12DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX12DAGISEL-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX12DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12DAGISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX12DAGISEL-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12DAGISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX12DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX12DAGISEL-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX12DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12DAGISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX12DAGISEL-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12DAGISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX12DAGISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX12DAGISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX12DAGISEL-NEXT:    s_wait_dscnt 0x1
+; GFX12DAGISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX12DAGISEL-NEXT:    s_wait_dscnt 0x0
+; GFX12DAGISEL-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12DAGISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12DAGISEL-NEXT:    v_readlane_b32 s0, v4, 31
+; GFX12DAGISEL-NEXT:    v_readlane_b32 s1, v5, 31
+; GFX12DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX12DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX12DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX12DAGISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
+; GFX12DAGISEL-NEXT:    scratch_load_b32 v4, off, s32
+; GFX12DAGISEL-NEXT:    scratch_load_b32 v5, off, s32 offset:4
+; GFX12DAGISEL-NEXT:    scratch_load_b32 v6, off, s32 offset:8
+; GFX12DAGISEL-NEXT:    scratch_load_b32 v7, off, s32 offset:12
+; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX12DAGISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %result = call i64 @llvm.amdgcn.wave.reduce.add.i64(i64 %in, i32 2)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
 define amdgpu_kernel void @default_stratergy(ptr addrspace(1) %out) {
 ; GFX8DAGISEL-LABEL: default_stratergy:
 ; GFX8DAGISEL:       ; %bb.0: ; %entry
@@ -1375,7 +2312,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8DAGISEL-NEXT:    ; implicit-def: $sgpr2
 ; GFX8DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX8DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -1383,24 +2320,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8DAGISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
-; GFX8DAGISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX8DAGISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
 ; GFX8DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX8DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX8DAGISEL-NEXT:    s_add_i32 s6, s6, s8
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
 ; GFX8DAGISEL-NEXT:  ; %bb.5:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX8DAGISEL-NEXT:  .LBB5_6: ; %endif
+; GFX8DAGISEL-NEXT:  .LBB6_6: ; %endif
 ; GFX8DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1415,7 +2352,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX8GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -1423,20 +2360,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mul_i32 s6, s6, s2
-; GFX8GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX8GISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_5
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB6_5
 ; GFX8GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX8GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX8GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX8GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX8GISEL-NEXT:    s_add_i32 s6, s6, s8
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
-; GFX8GISEL-NEXT:  .LBB5_5: ; %endif
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX8GISEL-NEXT:  .LBB6_5: ; %endif
 ; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s6
@@ -1452,7 +2389,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9DAGISEL-NEXT:    ; implicit-def: $sgpr2
 ; GFX9DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX9DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -1460,24 +2397,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9DAGISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
-; GFX9DAGISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX9DAGISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
 ; GFX9DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX9DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX9DAGISEL-NEXT:    s_add_i32 s6, s6, s8
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
 ; GFX9DAGISEL-NEXT:  ; %bb.5:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9DAGISEL-NEXT:  .LBB5_6: ; %endif
+; GFX9DAGISEL-NEXT:  .LBB6_6: ; %endif
 ; GFX9DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -1491,7 +2428,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX9GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -1499,20 +2436,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mul_i32 s6, s6, s2
-; GFX9GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX9GISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_5
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB6_5
 ; GFX9GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX9GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX9GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX9GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX9GISEL-NEXT:    s_add_i32 s6, s6, s8
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
-; GFX9GISEL-NEXT:  .LBB5_5: ; %endif
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX9GISEL-NEXT:  .LBB6_5: ; %endif
 ; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -1527,7 +2464,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064DAGISEL-NEXT:    ; implicit-def: $sgpr2
 ; GFX1064DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1064DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -1535,24 +2472,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
-; GFX1064DAGISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1064DAGISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1064DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
 ; GFX1064DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1064DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1064DAGISEL-NEXT:    s_add_i32 s6, s6, s8
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
 ; GFX1064DAGISEL-NEXT:  ; %bb.5:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1064DAGISEL-NEXT:  .LBB5_6: ; %endif
+; GFX1064DAGISEL-NEXT:  .LBB6_6: ; %endif
 ; GFX1064DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -1566,7 +2503,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -1574,20 +2511,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mul_i32 s6, s6, s2
-; GFX1064GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1064GISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB6_5
 ; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1064GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1064GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1064GISEL-NEXT:    s_add_i32 s6, s6, s8
 ; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
-; GFX1064GISEL-NEXT:  .LBB5_5: ; %endif
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1064GISEL-NEXT:  .LBB6_5: ; %endif
 ; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -1602,7 +2539,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032DAGISEL-NEXT:    ; implicit-def: $sgpr1
 ; GFX1032DAGISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032DAGISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1032DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032DAGISEL-NEXT:    s_load_dword s1, s[4:5], 0x2c
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
@@ -1610,24 +2547,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
 ; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    s_mul_i32 s1, s1, s2
-; GFX1032DAGISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1032DAGISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s0, s0
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
 ; GFX1032DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s1, 0
-; GFX1032DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032DAGISEL-NEXT:    s_add_i32 s1, s1, s6
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
 ; GFX1032DAGISEL-NEXT:  ; %bb.5:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1032DAGISEL-NEXT:  .LBB5_6: ; %endif
+; GFX1032DAGISEL-NEXT:  .LBB6_6: ; %endif
 ; GFX1032DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -1641,7 +2578,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr0
 ; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032GISEL-NEXT:    s_load_dword s0, s[4:5], 0x2c
 ; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
@@ -1649,20 +2586,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mul_i32 s0, s0, s2
-; GFX1032GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1032GISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s1, s1
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB6_5
 ; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1032GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX1032GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032GISEL-NEXT:    s_add_i32 s0, s0, s6
 ; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
-; GFX1032GISEL-NEXT:  .LBB5_5: ; %endif
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1032GISEL-NEXT:  .LBB6_5: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1032GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -1679,7 +2616,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
 ; GFX1164DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1164DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164DAGISEL-NEXT:    s_load_b32 s6, s[4:5], 0x2c
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -1688,25 +2625,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
-; GFX1164DAGISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1164DAGISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
 ; GFX1164DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1164DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1164DAGISEL-NEXT:    s_add_i32 s6, s6, s8
 ; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
 ; GFX1164DAGISEL-NEXT:  ; %bb.5:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1164DAGISEL-NEXT:  .LBB5_6: ; %endif
+; GFX1164DAGISEL-NEXT:  .LBB6_6: ; %endif
 ; GFX1164DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -1722,7 +2659,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164GISEL-NEXT:    s_load_b32 s6, s[4:5], 0x2c
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -1731,21 +2668,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mul_i32 s6, s6, s2
-; GFX1164GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1164GISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[0:1], s[0:1]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB6_5
 ; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1164GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1164GISEL-NEXT:    s_add_i32 s6, s6, s8
 ; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
-; GFX1164GISEL-NEXT:  .LBB5_5: ; %endif
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1164GISEL-NEXT:  .LBB6_5: ; %endif
 ; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -1762,7 +2699,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
 ; GFX1132DAGISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1132DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132DAGISEL-NEXT:    s_load_b32 s1, s[4:5], 0x2c
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
@@ -1771,25 +2708,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132DAGISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    s_mul_i32 s1, s1, s2
-; GFX1132DAGISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1132DAGISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s0, s0
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1132DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
 ; GFX1132DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s1, 0
-; GFX1132DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132DAGISEL-NEXT:    s_add_i32 s1, s1, s6
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
 ; GFX1132DAGISEL-NEXT:  ; %bb.5:
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1132DAGISEL-NEXT:  .LBB5_6: ; %endif
+; GFX1132DAGISEL-NEXT:  .LBB6_6: ; %endif
 ; GFX1132DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -1805,7 +2742,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1132GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x2c
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
@@ -1814,21 +2751,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132GISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mul_i32 s0, s0, s2
-; GFX1132GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1132GISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s1, s1
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB6_5
 ; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX1132GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132GISEL-NEXT:    s_add_i32 s0, s0, s6
 ; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
-; GFX1132GISEL-NEXT:  .LBB5_5: ; %endif
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1132GISEL-NEXT:  .LBB6_5: ; %endif
 ; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1132GISEL-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
@@ -1844,7 +2781,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
 ; GFX12DAGISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX12DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX12DAGISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX12DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX12DAGISEL-NEXT:    s_load_b32 s1, s[4:5], 0x2c
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
@@ -1853,15 +2790,15 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX12DAGISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
 ; GFX12DAGISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12DAGISEL-NEXT:    s_mul_i32 s1, s1, s2
-; GFX12DAGISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX12DAGISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX12DAGISEL-NEXT:    s_or_saveexec_b32 s0, s0
 ; GFX12DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
+; GFX12DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
 ; GFX12DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s1, 0
-; GFX12DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX12DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
@@ -1869,10 +2806,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX12DAGISEL-NEXT:    s_add_co_i32 s1, s1, s6
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX12DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX12DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
 ; GFX12DAGISEL-NEXT:  ; %bb.5:
 ; GFX12DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12DAGISEL-NEXT:  .LBB5_6: ; %endif
+; GFX12DAGISEL-NEXT:  .LBB6_6: ; %endif
 ; GFX12DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX12DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -2120,7 +3057,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s8, s[6:7]
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s9, v2, s8
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s10, v3, s8
@@ -2128,7 +3065,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s8
 ; GFX8DAGISEL-NEXT:    s_addc_u32 s5, s5, s10
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX8DAGISEL-NEXT:  ; %bb.2:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2141,7 +3078,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s8, s[6:7]
 ; GFX8GISEL-NEXT:    v_readlane_b32 s9, v2, s8
 ; GFX8GISEL-NEXT:    v_readlane_b32 s10, v3, s8
@@ -2149,7 +3086,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8GISEL-NEXT:    s_bitset0_b64 s[6:7], s8
 ; GFX8GISEL-NEXT:    s_addc_u32 s5, s5, s10
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX8GISEL-NEXT:  ; %bb.2:
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2162,7 +3099,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s8, s[6:7]
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s9, v2, s8
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s10, v3, s8
@@ -2170,7 +3107,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s8
 ; GFX9DAGISEL-NEXT:    s_addc_u32 s5, s5, s10
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX9DAGISEL-NEXT:  ; %bb.2:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2183,7 +3120,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s8, s[6:7]
 ; GFX9GISEL-NEXT:    v_readlane_b32 s9, v2, s8
 ; GFX9GISEL-NEXT:    v_readlane_b32 s10, v3, s8
@@ -2191,7 +3128,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9GISEL-NEXT:    s_bitset0_b64 s[6:7], s8
 ; GFX9GISEL-NEXT:    s_addc_u32 s5, s5, s10
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX9GISEL-NEXT:  ; %bb.2:
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2204,7 +3141,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s8, s[6:7]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s9, v2, s8
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s10, v3, s8
@@ -2212,7 +3149,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064DAGISEL-NEXT:    s_add_u32 s4, s4, s9
 ; GFX1064DAGISEL-NEXT:    s_addc_u32 s5, s5, s10
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1064DAGISEL-NEXT:  ; %bb.2:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2224,7 +3161,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s8, s[6:7]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s9, v2, s8
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s10, v3, s8
@@ -2232,7 +3169,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064GISEL-NEXT:    s_add_u32 s4, s4, s9
 ; GFX1064GISEL-NEXT:    s_addc_u32 s5, s5, s10
 ; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1064GISEL-NEXT:  ; %bb.2:
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2244,7 +3181,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1032DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s8, v2, s7
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s9, v3, s7
@@ -2252,7 +3189,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032DAGISEL-NEXT:    s_add_u32 s4, s4, s8
 ; GFX1032DAGISEL-NEXT:    s_addc_u32 s5, s5, s9
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1032DAGISEL-NEXT:  ; %bb.2:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2264,7 +3201,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1032GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s8, v2, s7
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s9, v3, s7
@@ -2272,7 +3209,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032GISEL-NEXT:    s_add_u32 s4, s4, s8
 ; GFX1032GISEL-NEXT:    s_addc_u32 s5, s5, s9
 ; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1032GISEL-NEXT:  ; %bb.2:
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2284,7 +3221,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s4, s[2:3]
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s5, v2, s4
@@ -2293,7 +3230,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164DAGISEL-NEXT:    s_add_u32 s0, s0, s5
 ; GFX1164DAGISEL-NEXT:    s_addc_u32 s1, s1, s6
 ; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1164DAGISEL-NEXT:  ; %bb.2:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -2305,7 +3242,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s4, s[2:3]
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s5, v2, s4
@@ -2314,7 +3251,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164GISEL-NEXT:    s_add_u32 s0, s0, s5
 ; GFX1164GISEL-NEXT:    s_addc_u32 s1, s1, s6
 ; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1164GISEL-NEXT:  ; %bb.2:
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -2326,7 +3263,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX1132DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s4, v2, s3
@@ -2335,7 +3272,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132DAGISEL-NEXT:    s_add_u32 s0, s0, s4
 ; GFX1132DAGISEL-NEXT:    s_addc_u32 s1, s1, s5
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1132DAGISEL-NEXT:  ; %bb.2:
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1132DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -2346,7 +3283,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX1132GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s4, v2, s3
@@ -2355,7 +3292,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132GISEL-NEXT:    s_add_u32 s0, s0, s4
 ; GFX1132GISEL-NEXT:    s_addc_u32 s1, s1, s5
 ; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1132GISEL-NEXT:  ; %bb.2:
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1132GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -2370,7 +3307,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX12DAGISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12DAGISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX12DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
@@ -2380,7 +3317,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX12DAGISEL-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX12DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX12DAGISEL-NEXT:  ; %bb.2:
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
@@ -2401,7 +3338,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8DAGISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX8DAGISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX8DAGISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX8DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s7, s[6:7]
@@ -2410,7 +3347,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8DAGISEL-NEXT:    s_mul_hi_u32 s2, s2, s7
 ; GFX8DAGISEL-NEXT:    s_mul_i32 s3, s3, s7
 ; GFX8DAGISEL-NEXT:    s_add_u32 s7, s2, s3
-; GFX8DAGISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX8DAGISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[8:9]
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -2439,7 +3376,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX8GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX8GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s7, s[6:7]
@@ -2448,10 +3385,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8GISEL-NEXT:    s_mul_hi_u32 s2, s2, s7
 ; GFX8GISEL-NEXT:    s_mul_i32 s3, s3, s7
 ; GFX8GISEL-NEXT:    s_add_u32 s7, s2, s3
-; GFX8GISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX8GISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB8_4
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX8GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
@@ -2461,7 +3398,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8GISEL-NEXT:    s_mul_hi_u32 s4, s4, s7
 ; GFX8GISEL-NEXT:    s_mul_i32 s5, s5, s7
 ; GFX8GISEL-NEXT:    s_add_u32 s7, s4, s5
-; GFX8GISEL-NEXT:  .LBB8_4: ; %endif
+; GFX8GISEL-NEXT:  .LBB9_4: ; %endif
 ; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -2478,7 +3415,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9DAGISEL-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX9DAGISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX9DAGISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX9DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s5, s[4:5]
@@ -2487,7 +3424,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9DAGISEL-NEXT:    s_mul_hi_u32 s2, s2, s5
 ; GFX9DAGISEL-NEXT:    s_mul_i32 s3, s3, s5
 ; GFX9DAGISEL-NEXT:    s_add_u32 s5, s2, s3
-; GFX9DAGISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX9DAGISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[8:9]
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
@@ -2515,7 +3452,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX9GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX9GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s7, s[6:7]
@@ -2524,10 +3461,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9GISEL-NEXT:    s_mul_hi_u32 s2, s2, s7
 ; GFX9GISEL-NEXT:    s_mul_i32 s3, s3, s7
 ; GFX9GISEL-NEXT:    s_add_u32 s7, s2, s3
-; GFX9GISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX9GISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB8_4
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX9GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9GISEL-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
 ; GFX9GISEL-NEXT:    s_mov_b64 s[4:5], exec
@@ -2537,7 +3474,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9GISEL-NEXT:    s_mul_hi_u32 s5, s8, s4
 ; GFX9GISEL-NEXT:    s_mul_i32 s4, s9, s4
 ; GFX9GISEL-NEXT:    s_add_u32 s7, s5, s4
-; GFX9GISEL-NEXT:  .LBB8_4: ; %endif
+; GFX9GISEL-NEXT:  .LBB9_4: ; %endif
 ; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -2554,7 +3491,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064DAGISEL-NEXT:    ; implicit-def: $sgpr8_sgpr9
 ; GFX1064DAGISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064DAGISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1064DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s8, s[8:9]
@@ -2563,7 +3500,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064DAGISEL-NEXT:    s_mul_i32 s3, s3, s8
 ; GFX1064DAGISEL-NEXT:    s_mul_i32 s8, s2, s8
 ; GFX1064DAGISEL-NEXT:    s_add_u32 s9, s9, s3
-; GFX1064DAGISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX1064DAGISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[4:5]
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s8
@@ -2591,7 +3528,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
@@ -2600,10 +3537,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064GISEL-NEXT:    s_mul_i32 s3, s3, s6
 ; GFX1064GISEL-NEXT:    s_mul_i32 s6, s2, s6
 ; GFX1064GISEL-NEXT:    s_add_u32 s7, s7, s3
-; GFX1064GISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX1064GISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB8_4
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[4:5], exec
@@ -2613,7 +3550,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064GISEL-NEXT:    s_mul_i32 s7, s7, s4
 ; GFX1064GISEL-NEXT:    s_mul_i32 s6, s6, s4
 ; GFX1064GISEL-NEXT:    s_add_u32 s7, s5, s7
-; GFX1064GISEL-NEXT:  .LBB8_4: ; %endif
+; GFX1064GISEL-NEXT:  .LBB9_4: ; %endif
 ; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -2630,7 +3567,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032DAGISEL-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX1032DAGISEL-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; GFX1032DAGISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1032DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s4, exec_lo
 ; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s4, s4
@@ -2639,7 +3576,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032DAGISEL-NEXT:    s_mul_i32 s3, s3, s4
 ; GFX1032DAGISEL-NEXT:    s_mul_i32 s4, s2, s4
 ; GFX1032DAGISEL-NEXT:    s_add_u32 s5, s5, s3
-; GFX1032DAGISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX1032DAGISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s2, s8
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
@@ -2667,7 +3604,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; GFX1032GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s6, s6
@@ -2676,10 +3613,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032GISEL-NEXT:    s_mul_i32 s3, s3, s6
 ; GFX1032GISEL-NEXT:    s_mul_i32 s6, s2, s6
 ; GFX1032GISEL-NEXT:    s_add_u32 s7, s7, s3
-; GFX1032GISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX1032GISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s2, s8
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB8_4
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1032GISEL-NEXT:    s_mov_b32 s3, exec_lo
@@ -2689,7 +3626,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032GISEL-NEXT:    s_mul_i32 s5, s7, s3
 ; GFX1032GISEL-NEXT:    s_mul_i32 s6, s6, s3
 ; GFX1032GISEL-NEXT:    s_add_u32 s7, s4, s5
-; GFX1032GISEL-NEXT:  .LBB8_4: ; %endif
+; GFX1032GISEL-NEXT:  .LBB9_4: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -2708,7 +3645,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
 ; GFX1164DAGISEL-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
-; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1164DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -2718,7 +3655,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164DAGISEL-NEXT:    s_mul_i32 s3, s3, s8
 ; GFX1164DAGISEL-NEXT:    s_mul_i32 s8, s2, s8
 ; GFX1164DAGISEL-NEXT:    s_add_u32 s9, s9, s3
-; GFX1164DAGISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX1164DAGISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[6:7]
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s8
@@ -2750,7 +3687,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -2760,10 +3697,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164GISEL-NEXT:    s_mul_i32 s3, s3, s6
 ; GFX1164GISEL-NEXT:    s_mul_i32 s6, s2, s6
 ; GFX1164GISEL-NEXT:    s_add_u32 s7, s7, s3
-; GFX1164GISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX1164GISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[2:3], s[8:9]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB8_4
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], exec
@@ -2774,7 +3711,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164GISEL-NEXT:    s_mul_i32 s5, s5, s6
 ; GFX1164GISEL-NEXT:    s_mul_i32 s6, s4, s6
 ; GFX1164GISEL-NEXT:    s_add_u32 s7, s7, s5
-; GFX1164GISEL-NEXT:  .LBB8_4: ; %endif
+; GFX1164GISEL-NEXT:  .LBB9_4: ; %endif
 ; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -2793,7 +3730,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
 ; GFX1132DAGISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1132DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -2803,7 +3740,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132DAGISEL-NEXT:    s_mul_i32 s3, s3, s6
 ; GFX1132DAGISEL-NEXT:    s_mul_i32 s6, s2, s6
 ; GFX1132DAGISEL-NEXT:    s_add_u32 s7, s7, s3
-; GFX1132DAGISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX1132DAGISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s2, s8
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
@@ -2833,7 +3770,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1132GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132GISEL-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -2843,10 +3780,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132GISEL-NEXT:    s_mul_i32 s3, s3, s6
 ; GFX1132GISEL-NEXT:    s_mul_i32 s6, s2, s6
 ; GFX1132GISEL-NEXT:    s_add_u32 s7, s7, s3
-; GFX1132GISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX1132GISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s2, s8
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB8_4
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX1132GISEL-NEXT:    s_mov_b32 s3, exec_lo
@@ -2857,7 +3794,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132GISEL-NEXT:    s_mul_i32 s5, s5, s3
 ; GFX1132GISEL-NEXT:    s_mul_i32 s6, s4, s3
 ; GFX1132GISEL-NEXT:    s_add_u32 s7, s7, s5
-; GFX1132GISEL-NEXT:  .LBB8_4: ; %endif
+; GFX1132GISEL-NEXT:  .LBB9_4: ; %endif
 ; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
@@ -2875,7 +3812,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
 ; GFX12DAGISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX12DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX12DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX12DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX12DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -2885,7 +3822,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX12DAGISEL-NEXT:    s_mul_i32 s3, s3, s6
 ; GFX12DAGISEL-NEXT:    s_mul_i32 s6, s2, s6
 ; GFX12DAGISEL-NEXT:    s_add_co_u32 s7, s7, s3
-; GFX12DAGISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX12DAGISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX12DAGISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12DAGISEL-NEXT:    s_or_saveexec_b32 s2, s8
 ; GFX12DAGISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll
index d3341eb0f8099..4d83751d4454d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll
@@ -1054,6 +1054,973 @@ entry:
   ret void
 }
 
+define void @divergent_value_dpp_i64(ptr addrspace(1) %out, i64 %in) {
+; GFX8DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX8DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
+; GFX8DAGISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
+; GFX8DAGISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
+; GFX8DAGISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
+; GFX8DAGISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
+; GFX8DAGISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
+; GFX8DAGISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT:    s_sub_u32 s4, 0, s6
+; GFX8DAGISEL-NEXT:    s_subb_u32 s5, 0, s7
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8GISEL-LABEL: divergent_value_dpp_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX8GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
+; GFX8GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
+; GFX8GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
+; GFX8GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
+; GFX8GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
+; GFX8GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
+; GFX8GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8GISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX8GISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT:    s_sub_u32 s4, 0, s6
+; GFX8GISEL-NEXT:    s_subb_u32 s5, 0, s7
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX9DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9DAGISEL-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9DAGISEL-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9DAGISEL-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9DAGISEL-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9DAGISEL-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9DAGISEL-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT:    s_sub_u32 s4, 0, s6
+; GFX9DAGISEL-NEXT:    s_subb_u32 s5, 0, s7
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9GISEL-LABEL: divergent_value_dpp_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX9GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9GISEL-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9GISEL-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9GISEL-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9GISEL-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9GISEL-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9GISEL-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9GISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX9GISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT:    s_sub_u32 s4, 0, s6
+; GFX9GISEL-NEXT:    s_subb_u32 s5, 0, s7
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1064DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX1064DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX1064DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_e32 v8, 32, v8
+; GFX1064DAGISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1064DAGISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064DAGISEL-NEXT:    v_mul_lo_u32 v8, 4, v8
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1064DAGISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1064DAGISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1064DAGISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064DAGISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1064DAGISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064DAGISEL-NEXT:    ds_permute_b32 v6, v8, v4
+; GFX1064DAGISEL-NEXT:    ds_permute_b32 v7, v8, v5
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1064DAGISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT:    s_sub_u32 s4, 0, s6
+; GFX1064DAGISEL-NEXT:    s_subb_u32 s5, 0, s7
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1064DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT:    s_clause 0x4 ; 20-byte Folded Reload
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:16
+; GFX1064DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1064GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX1064GISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX1064GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX1064GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX1064GISEL-NEXT:    v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT:    v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_add_nc_u32_e32 v8, 32, v8
+; GFX1064GISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1064GISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064GISEL-NEXT:    v_mul_lo_u32 v8, 4, v8
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1064GISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1064GISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1064GISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064GISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1064GISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064GISEL-NEXT:    ds_permute_b32 v6, v8, v4
+; GFX1064GISEL-NEXT:    ds_permute_b32 v7, v8, v5
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1064GISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064GISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX1064GISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064GISEL-NEXT:    s_sub_u32 s4, 0, s6
+; GFX1064GISEL-NEXT:    s_subb_u32 s5, 0, s7
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1064GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX1064GISEL-NEXT:    s_clause 0x4 ; 20-byte Folded Reload
+; GFX1064GISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32
+; GFX1064GISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1064GISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1064GISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1064GISEL-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:16
+; GFX1064GISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1032DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX1032DAGISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX1032DAGISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s4
+; GFX1032DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1032DAGISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1032DAGISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1032DAGISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1032DAGISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX1032DAGISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1032DAGISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s5, v4, 31
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s6, v5, 31
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1032DAGISEL-NEXT:    s_sub_u32 s4, 0, s5
+; GFX1032DAGISEL-NEXT:    s_subb_u32 s5, 0, s6
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1032DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032DAGISEL-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX1032DAGISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
+; GFX1032DAGISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32
+; GFX1032DAGISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1032DAGISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1032DAGISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1032DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1032DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1032GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX1032GISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX1032GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s4
+; GFX1032GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1032GISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1032GISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1032GISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1032GISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX1032GISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1032GISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX1032GISEL-NEXT:    v_readlane_b32 s5, v4, 31
+; GFX1032GISEL-NEXT:    v_readlane_b32 s6, v5, 31
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1032GISEL-NEXT:    s_sub_u32 s4, 0, s5
+; GFX1032GISEL-NEXT:    s_subb_u32 s5, 0, s6
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1032GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032GISEL-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX1032GISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
+; GFX1032GISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32
+; GFX1032GISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1032GISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1032GISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1032GISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1032GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1164DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT:    s_clause 0x4 ; 20-byte Folded Spill
+; GFX1164DAGISEL-NEXT:    scratch_store_b32 off, v4, s32
+; GFX1164DAGISEL-NEXT:    scratch_store_b32 off, v5, s32 offset:4
+; GFX1164DAGISEL-NEXT:    scratch_store_b32 off, v6, s32 offset:8
+; GFX1164DAGISEL-NEXT:    scratch_store_b32 off, v7, s32 offset:12
+; GFX1164DAGISEL-NEXT:    scratch_store_b32 off, v8, s32 offset:16
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[0:1]
+; GFX1164DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s[0:1]
+; GFX1164DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_e32 v8, 32, v8
+; GFX1164DAGISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_mul_lo_u32 v8, 4, v8
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1164DAGISEL-NEXT:    s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1164DAGISEL-NEXT:    s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1164DAGISEL-NEXT:    s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164DAGISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1164DAGISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1164DAGISEL-NEXT:    s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164DAGISEL-NEXT:    ds_permute_b32 v6, v8, v4
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1164DAGISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164DAGISEL-NEXT:    ds_permute_b32 v7, v8, v5
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1164DAGISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1164DAGISEL-NEXT:    s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s2, v4, 63
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s3, v5, 63
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_sub_u32 s0, 0, s2
+; GFX1164DAGISEL-NEXT:    s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164DAGISEL-NEXT:    s_subb_u32 s1, 0, s3
+; GFX1164DAGISEL-NEXT:    s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1164DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1164DAGISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT:    s_clause 0x4 ; 20-byte Folded Reload
+; GFX1164DAGISEL-NEXT:    scratch_load_b32 v4, off, s32
+; GFX1164DAGISEL-NEXT:    scratch_load_b32 v5, off, s32 offset:4
+; GFX1164DAGISEL-NEXT:    scratch_load_b32 v6, off, s32 offset:8
+; GFX1164DAGISEL-NEXT:    scratch_load_b32 v7, off, s32 offset:12
+; GFX1164DAGISEL-NEXT:    scratch_load_b32 v8, off, s32 offset:16
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1164GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT:    s_clause 0x4 ; 20-byte Folded Spill
+; GFX1164GISEL-NEXT:    scratch_store_b32 off, v4, s32
+; GFX1164GISEL-NEXT:    scratch_store_b32 off, v5, s32 offset:4
+; GFX1164GISEL-NEXT:    scratch_store_b32 off, v6, s32 offset:8
+; GFX1164GISEL-NEXT:    scratch_store_b32 off, v7, s32 offset:12
+; GFX1164GISEL-NEXT:    scratch_store_b32 off, v8, s32 offset:16
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[0:1]
+; GFX1164GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s[0:1]
+; GFX1164GISEL-NEXT:    v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT:    v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT:    v_add_nc_u32_e32 v8, 32, v8
+; GFX1164GISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_mul_lo_u32 v8, 4, v8
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1164GISEL-NEXT:    s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1164GISEL-NEXT:    s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1164GISEL-NEXT:    s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164GISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1164GISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1164GISEL-NEXT:    s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164GISEL-NEXT:    ds_permute_b32 v6, v8, v4
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1164GISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164GISEL-NEXT:    ds_permute_b32 v7, v8, v5
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1164GISEL-NEXT:    v_add_co_u32 v4, vcc, v4, v6
+; GFX1164GISEL-NEXT:    s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s2, v4, 63
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164GISEL-NEXT:    v_readlane_b32 s3, v5, 63
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_sub_u32 s0, 0, s2
+; GFX1164GISEL-NEXT:    s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164GISEL-NEXT:    s_subb_u32 s1, 0, s3
+; GFX1164GISEL-NEXT:    s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1164GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1164GISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT:    s_clause 0x4 ; 20-byte Folded Reload
+; GFX1164GISEL-NEXT:    scratch_load_b32 v4, off, s32
+; GFX1164GISEL-NEXT:    scratch_load_b32 v5, off, s32 offset:4
+; GFX1164GISEL-NEXT:    scratch_load_b32 v6, off, s32 offset:8
+; GFX1164GISEL-NEXT:    scratch_load_b32 v7, off, s32 offset:12
+; GFX1164GISEL-NEXT:    scratch_load_b32 v8, off, s32 offset:16
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Spill
+; GFX1132DAGISEL-NEXT:    scratch_store_b32 off, v4, s32
+; GFX1132DAGISEL-NEXT:    scratch_store_b32 off, v5, s32 offset:4
+; GFX1132DAGISEL-NEXT:    scratch_store_b32 off, v6, s32 offset:8
+; GFX1132DAGISEL-NEXT:    scratch_store_b32 off, v7, s32 offset:12
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s0
+; GFX1132DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s0
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1132DAGISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1132DAGISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX1132DAGISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1132DAGISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s1, v4, 31
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s2, v5, 31
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_sub_u32 s0, 0, s1
+; GFX1132DAGISEL-NEXT:    s_subb_u32 s1, 0, s2
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1132DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
+; GFX1132DAGISEL-NEXT:    scratch_load_b32 v4, off, s32
+; GFX1132DAGISEL-NEXT:    scratch_load_b32 v5, off, s32 offset:4
+; GFX1132DAGISEL-NEXT:    scratch_load_b32 v6, off, s32 offset:8
+; GFX1132DAGISEL-NEXT:    scratch_load_b32 v7, off, s32 offset:12
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1132GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Spill
+; GFX1132GISEL-NEXT:    scratch_store_b32 off, v4, s32
+; GFX1132GISEL-NEXT:    scratch_store_b32 off, v5, s32 offset:4
+; GFX1132GISEL-NEXT:    scratch_store_b32 off, v6, s32 offset:8
+; GFX1132GISEL-NEXT:    scratch_store_b32 off, v7, s32 offset:12
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s0
+; GFX1132GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s0
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1132GISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1132GISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX1132GISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1132GISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s1, v4, 31
+; GFX1132GISEL-NEXT:    v_readlane_b32 s2, v5, 31
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT:    s_sub_u32 s0, 0, s1
+; GFX1132GISEL-NEXT:    s_subb_u32 s1, 0, s2
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1132GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
+; GFX1132GISEL-NEXT:    scratch_load_b32 v4, off, s32
+; GFX1132GISEL-NEXT:    scratch_load_b32 v5, off, s32 offset:4
+; GFX1132GISEL-NEXT:    scratch_load_b32 v6, off, s32 offset:8
+; GFX1132GISEL-NEXT:    scratch_load_b32 v7, off, s32 offset:12
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX12DAGISEL:       ; %bb.0: ; %entry
+; GFX12DAGISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12DAGISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12DAGISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12DAGISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12DAGISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX12DAGISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Spill
+; GFX12DAGISEL-NEXT:    scratch_store_b32 off, v4, s32
+; GFX12DAGISEL-NEXT:    scratch_store_b32 off, v5, s32 offset:4
+; GFX12DAGISEL-NEXT:    scratch_store_b32 off, v6, s32 offset:8
+; GFX12DAGISEL-NEXT:    scratch_store_b32 off, v7, s32 offset:12
+; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX12DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s2
+; GFX12DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s2
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX12DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX12DAGISEL-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX12DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12DAGISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX12DAGISEL-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12DAGISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX12DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX12DAGISEL-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX12DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12DAGISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX12DAGISEL-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12DAGISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX12DAGISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX12DAGISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX12DAGISEL-NEXT:    s_wait_dscnt 0x1
+; GFX12DAGISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX12DAGISEL-NEXT:    s_wait_dscnt 0x0
+; GFX12DAGISEL-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12DAGISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12DAGISEL-NEXT:    v_readlane_b32 s0, v4, 31
+; GFX12DAGISEL-NEXT:    v_readlane_b32 s1, v5, 31
+; GFX12DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX12DAGISEL-NEXT:    s_sub_nc_u64 s[0:1], 0, s[0:1]
+; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX12DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX12DAGISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
+; GFX12DAGISEL-NEXT:    scratch_load_b32 v4, off, s32
+; GFX12DAGISEL-NEXT:    scratch_load_b32 v5, off, s32 offset:4
+; GFX12DAGISEL-NEXT:    scratch_load_b32 v6, off, s32 offset:8
+; GFX12DAGISEL-NEXT:    scratch_load_b32 v7, off, s32 offset:12
+; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX12DAGISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %result = call i64 @llvm.amdgcn.wave.reduce.sub.i64(i64 %in, i32 2)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
 define amdgpu_kernel void @default_stratergy(ptr addrspace(1) %out) {
 ; GFX8DAGISEL-LABEL: default_stratergy:
 ; GFX8DAGISEL:       ; %bb.0: ; %entry
@@ -1435,7 +2402,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8DAGISEL-NEXT:    ; implicit-def: $sgpr2
 ; GFX8DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX8DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -1444,24 +2411,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    s_sub_i32 s3, 0, s6
 ; GFX8DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
-; GFX8DAGISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX8DAGISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
 ; GFX8DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX8DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX8DAGISEL-NEXT:    s_sub_i32 s6, s6, s8
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
 ; GFX8DAGISEL-NEXT:  ; %bb.5:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX8DAGISEL-NEXT:  .LBB5_6: ; %endif
+; GFX8DAGISEL-NEXT:  .LBB6_6: ; %endif
 ; GFX8DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1476,7 +2443,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX8GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -1485,20 +2452,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_sub_i32 s3, 0, s6
 ; GFX8GISEL-NEXT:    s_mul_i32 s6, s3, s2
-; GFX8GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX8GISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_5
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB6_5
 ; GFX8GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX8GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX8GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX8GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX8GISEL-NEXT:    s_sub_i32 s6, s6, s8
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
-; GFX8GISEL-NEXT:  .LBB5_5: ; %endif
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX8GISEL-NEXT:  .LBB6_5: ; %endif
 ; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s6
@@ -1514,7 +2481,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9DAGISEL-NEXT:    ; implicit-def: $sgpr2
 ; GFX9DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX9DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -1523,24 +2490,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    s_sub_i32 s3, 0, s6
 ; GFX9DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
-; GFX9DAGISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX9DAGISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
 ; GFX9DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX9DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX9DAGISEL-NEXT:    s_sub_i32 s6, s6, s8
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
 ; GFX9DAGISEL-NEXT:  ; %bb.5:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9DAGISEL-NEXT:  .LBB5_6: ; %endif
+; GFX9DAGISEL-NEXT:  .LBB6_6: ; %endif
 ; GFX9DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -1554,7 +2521,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX9GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -1563,20 +2530,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_sub_i32 s3, 0, s6
 ; GFX9GISEL-NEXT:    s_mul_i32 s6, s3, s2
-; GFX9GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX9GISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_5
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB6_5
 ; GFX9GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX9GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX9GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX9GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX9GISEL-NEXT:    s_sub_i32 s6, s6, s8
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
-; GFX9GISEL-NEXT:  .LBB5_5: ; %endif
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX9GISEL-NEXT:  .LBB6_5: ; %endif
 ; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -1591,7 +2558,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064DAGISEL-NEXT:    ; implicit-def: $sgpr2
 ; GFX1064DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1064DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -1600,24 +2567,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    s_sub_i32 s3, 0, s6
 ; GFX1064DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
-; GFX1064DAGISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1064DAGISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1064DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
 ; GFX1064DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1064DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1064DAGISEL-NEXT:    s_sub_i32 s6, s6, s8
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
 ; GFX1064DAGISEL-NEXT:  ; %bb.5:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1064DAGISEL-NEXT:  .LBB5_6: ; %endif
+; GFX1064DAGISEL-NEXT:  .LBB6_6: ; %endif
 ; GFX1064DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -1631,7 +2598,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -1640,20 +2607,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_sub_i32 s3, 0, s6
 ; GFX1064GISEL-NEXT:    s_mul_i32 s6, s3, s2
-; GFX1064GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1064GISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB6_5
 ; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1064GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1064GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1064GISEL-NEXT:    s_sub_i32 s6, s6, s8
 ; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
-; GFX1064GISEL-NEXT:  .LBB5_5: ; %endif
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1064GISEL-NEXT:  .LBB6_5: ; %endif
 ; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -1668,7 +2635,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032DAGISEL-NEXT:    ; implicit-def: $sgpr1
 ; GFX1032DAGISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032DAGISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1032DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032DAGISEL-NEXT:    s_load_dword s1, s[4:5], 0x2c
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
@@ -1677,24 +2644,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    s_sub_i32 s1, 0, s1
 ; GFX1032DAGISEL-NEXT:    s_mul_i32 s1, s1, s2
-; GFX1032DAGISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1032DAGISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s0, s0
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
 ; GFX1032DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s1, 0
-; GFX1032DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032DAGISEL-NEXT:    s_sub_i32 s1, s1, s6
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
 ; GFX1032DAGISEL-NEXT:  ; %bb.5:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1032DAGISEL-NEXT:  .LBB5_6: ; %endif
+; GFX1032DAGISEL-NEXT:  .LBB6_6: ; %endif
 ; GFX1032DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -1708,7 +2675,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr0
 ; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032GISEL-NEXT:    s_load_dword s0, s[4:5], 0x2c
 ; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
@@ -1717,20 +2684,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_sub_i32 s0, 0, s0
 ; GFX1032GISEL-NEXT:    s_mul_i32 s0, s0, s2
-; GFX1032GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1032GISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s1, s1
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB6_5
 ; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1032GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX1032GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032GISEL-NEXT:    s_sub_i32 s0, s0, s6
 ; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
-; GFX1032GISEL-NEXT:  .LBB5_5: ; %endif
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1032GISEL-NEXT:  .LBB6_5: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1032GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -1747,7 +2714,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
 ; GFX1164DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1164DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164DAGISEL-NEXT:    s_load_b32 s6, s[4:5], 0x2c
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -1757,25 +2724,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    s_sub_i32 s3, 0, s6
 ; GFX1164DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
-; GFX1164DAGISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1164DAGISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
 ; GFX1164DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1164DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1164DAGISEL-NEXT:    s_sub_i32 s6, s6, s8
 ; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
 ; GFX1164DAGISEL-NEXT:  ; %bb.5:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1164DAGISEL-NEXT:  .LBB5_6: ; %endif
+; GFX1164DAGISEL-NEXT:  .LBB6_6: ; %endif
 ; GFX1164DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -1791,7 +2758,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164GISEL-NEXT:    s_load_b32 s6, s[4:5], 0x2c
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -1801,21 +2768,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_sub_i32 s3, 0, s6
 ; GFX1164GISEL-NEXT:    s_mul_i32 s6, s3, s2
-; GFX1164GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1164GISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[0:1], s[0:1]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB6_5
 ; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1164GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1164GISEL-NEXT:    s_sub_i32 s6, s6, s8
 ; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
-; GFX1164GISEL-NEXT:  .LBB5_5: ; %endif
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1164GISEL-NEXT:  .LBB6_5: ; %endif
 ; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -1832,7 +2799,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
 ; GFX1132DAGISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1132DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132DAGISEL-NEXT:    s_load_b32 s1, s[4:5], 0x2c
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
@@ -1842,25 +2809,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    s_sub_i32 s1, 0, s1
 ; GFX1132DAGISEL-NEXT:    s_mul_i32 s1, s1, s2
-; GFX1132DAGISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1132DAGISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s0, s0
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1132DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
 ; GFX1132DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s1, 0
-; GFX1132DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132DAGISEL-NEXT:    s_sub_i32 s1, s1, s6
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
 ; GFX1132DAGISEL-NEXT:  ; %bb.5:
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1132DAGISEL-NEXT:  .LBB5_6: ; %endif
+; GFX1132DAGISEL-NEXT:  .LBB6_6: ; %endif
 ; GFX1132DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -1876,7 +2843,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1132GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x2c
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
@@ -1886,21 +2853,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_sub_i32 s0, 0, s0
 ; GFX1132GISEL-NEXT:    s_mul_i32 s0, s0, s2
-; GFX1132GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1132GISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s1, s1
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB6_5
 ; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX1132GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132GISEL-NEXT:    s_sub_i32 s0, s0, s6
 ; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
-; GFX1132GISEL-NEXT:  .LBB5_5: ; %endif
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1132GISEL-NEXT:  .LBB6_5: ; %endif
 ; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1132GISEL-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
@@ -1916,7 +2883,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
 ; GFX12DAGISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX12DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX12DAGISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX12DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX12DAGISEL-NEXT:    s_load_b32 s1, s[4:5], 0x2c
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
@@ -1926,15 +2893,15 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX12DAGISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12DAGISEL-NEXT:    s_sub_co_i32 s1, 0, s1
 ; GFX12DAGISEL-NEXT:    s_mul_i32 s1, s1, s2
-; GFX12DAGISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX12DAGISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX12DAGISEL-NEXT:    s_or_saveexec_b32 s0, s0
 ; GFX12DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
+; GFX12DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
 ; GFX12DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s1, 0
-; GFX12DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX12DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
@@ -1942,10 +2909,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX12DAGISEL-NEXT:    s_sub_co_i32 s1, s1, s6
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX12DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX12DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
 ; GFX12DAGISEL-NEXT:  ; %bb.5:
 ; GFX12DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12DAGISEL-NEXT:  .LBB5_6: ; %endif
+; GFX12DAGISEL-NEXT:  .LBB6_6: ; %endif
 ; GFX12DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX12DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -2250,7 +3217,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s8, s[6:7]
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s9, v2, s8
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s10, v3, s8
@@ -2258,7 +3225,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s8
 ; GFX8DAGISEL-NEXT:    s_subb_u32 s5, s5, s10
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX8DAGISEL-NEXT:  ; %bb.2:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2271,7 +3238,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s8, s[6:7]
 ; GFX8GISEL-NEXT:    v_readlane_b32 s9, v2, s8
 ; GFX8GISEL-NEXT:    v_readlane_b32 s10, v3, s8
@@ -2279,7 +3246,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8GISEL-NEXT:    s_bitset0_b64 s[6:7], s8
 ; GFX8GISEL-NEXT:    s_subb_u32 s5, s5, s10
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX8GISEL-NEXT:  ; %bb.2:
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2292,7 +3259,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s8, s[6:7]
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s9, v2, s8
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s10, v3, s8
@@ -2300,7 +3267,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s8
 ; GFX9DAGISEL-NEXT:    s_subb_u32 s5, s5, s10
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX9DAGISEL-NEXT:  ; %bb.2:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2313,7 +3280,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s8, s[6:7]
 ; GFX9GISEL-NEXT:    v_readlane_b32 s9, v2, s8
 ; GFX9GISEL-NEXT:    v_readlane_b32 s10, v3, s8
@@ -2321,7 +3288,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9GISEL-NEXT:    s_bitset0_b64 s[6:7], s8
 ; GFX9GISEL-NEXT:    s_subb_u32 s5, s5, s10
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX9GISEL-NEXT:  ; %bb.2:
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2334,7 +3301,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s8, s[6:7]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s9, v2, s8
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s10, v3, s8
@@ -2342,7 +3309,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064DAGISEL-NEXT:    s_sub_u32 s4, s4, s9
 ; GFX1064DAGISEL-NEXT:    s_subb_u32 s5, s5, s10
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1064DAGISEL-NEXT:  ; %bb.2:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2354,7 +3321,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s8, s[6:7]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s9, v2, s8
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s10, v3, s8
@@ -2362,7 +3329,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064GISEL-NEXT:    s_sub_u32 s4, s4, s9
 ; GFX1064GISEL-NEXT:    s_subb_u32 s5, s5, s10
 ; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1064GISEL-NEXT:  ; %bb.2:
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2374,7 +3341,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1032DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s8, v2, s7
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s9, v3, s7
@@ -2382,7 +3349,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032DAGISEL-NEXT:    s_sub_u32 s4, s4, s8
 ; GFX1032DAGISEL-NEXT:    s_subb_u32 s5, s5, s9
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1032DAGISEL-NEXT:  ; %bb.2:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2394,7 +3361,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1032GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s8, v2, s7
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s9, v3, s7
@@ -2402,7 +3369,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032GISEL-NEXT:    s_sub_u32 s4, s4, s8
 ; GFX1032GISEL-NEXT:    s_subb_u32 s5, s5, s9
 ; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1032GISEL-NEXT:  ; %bb.2:
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2414,7 +3381,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s4, s[2:3]
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s5, v2, s4
@@ -2423,7 +3390,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164DAGISEL-NEXT:    s_sub_u32 s0, s0, s5
 ; GFX1164DAGISEL-NEXT:    s_subb_u32 s1, s1, s6
 ; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1164DAGISEL-NEXT:  ; %bb.2:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -2435,7 +3402,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s4, s[2:3]
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s5, v2, s4
@@ -2444,7 +3411,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164GISEL-NEXT:    s_sub_u32 s0, s0, s5
 ; GFX1164GISEL-NEXT:    s_subb_u32 s1, s1, s6
 ; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1164GISEL-NEXT:  ; %bb.2:
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -2456,7 +3423,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX1132DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s4, v2, s3
@@ -2465,7 +3432,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132DAGISEL-NEXT:    s_sub_u32 s0, s0, s4
 ; GFX1132DAGISEL-NEXT:    s_subb_u32 s1, s1, s5
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1132DAGISEL-NEXT:  ; %bb.2:
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1132DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -2476,7 +3443,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX1132GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s4, v2, s3
@@ -2485,7 +3452,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132GISEL-NEXT:    s_sub_u32 s0, s0, s4
 ; GFX1132GISEL-NEXT:    s_subb_u32 s1, s1, s5
 ; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1132GISEL-NEXT:  ; %bb.2:
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1132GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -2500,7 +3467,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX12DAGISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12DAGISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX12DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
@@ -2510,7 +3477,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX12DAGISEL-NEXT:    s_sub_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX12DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX12DAGISEL-NEXT:  ; %bb.2:
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
@@ -2531,7 +3498,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8DAGISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX8DAGISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX8DAGISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX8DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
@@ -2544,13 +3511,13 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8DAGISEL-NEXT:    s_mul_i32 s3, s3, s7
 ; GFX8DAGISEL-NEXT:    s_add_u32 s2, s2, s3
 ; GFX8DAGISEL-NEXT:    s_add_u32 s7, s2, s10
-; GFX8DAGISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX8DAGISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[8:9]
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX8DAGISEL-NEXT:    s_xor_b64 exec, exec, s[2:3]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB8_4
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX8DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
@@ -2564,7 +3531,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8DAGISEL-NEXT:    s_add_u32 s7, s4, s8
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s7
-; GFX8DAGISEL-NEXT:  .LBB8_4: ; %endif
+; GFX8DAGISEL-NEXT:  .LBB9_4: ; %endif
 ; GFX8DAGISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -2578,7 +3545,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX8GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX8GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
@@ -2591,10 +3558,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8GISEL-NEXT:    s_mul_i32 s3, s3, s7
 ; GFX8GISEL-NEXT:    s_add_u32 s2, s2, s3
 ; GFX8GISEL-NEXT:    s_add_u32 s7, s2, s10
-; GFX8GISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX8GISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB8_4
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX8GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
@@ -2608,7 +3575,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8GISEL-NEXT:    s_mul_i32 s5, s5, s7
 ; GFX8GISEL-NEXT:    s_add_u32 s4, s4, s5
 ; GFX8GISEL-NEXT:    s_add_u32 s7, s4, s8
-; GFX8GISEL-NEXT:  .LBB8_4: ; %endif
+; GFX8GISEL-NEXT:  .LBB9_4: ; %endif
 ; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -2625,7 +3592,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9DAGISEL-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX9DAGISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX9DAGISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX9DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
@@ -2638,13 +3605,13 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9DAGISEL-NEXT:    s_mul_i32 s3, s3, s5
 ; GFX9DAGISEL-NEXT:    s_add_u32 s2, s2, s3
 ; GFX9DAGISEL-NEXT:    s_add_u32 s5, s2, s10
-; GFX9DAGISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX9DAGISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[8:9]
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9DAGISEL-NEXT:    s_xor_b64 exec, exec, s[2:3]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB8_4
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX9DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
@@ -2658,7 +3625,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9DAGISEL-NEXT:    s_add_u32 s5, s5, s8
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9DAGISEL-NEXT:  .LBB8_4: ; %endif
+; GFX9DAGISEL-NEXT:  .LBB9_4: ; %endif
 ; GFX9DAGISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -2671,7 +3638,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX9GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX9GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
@@ -2684,10 +3651,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9GISEL-NEXT:    s_mul_i32 s3, s3, s7
 ; GFX9GISEL-NEXT:    s_add_u32 s2, s2, s3
 ; GFX9GISEL-NEXT:    s_add_u32 s7, s2, s10
-; GFX9GISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX9GISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB8_4
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX9GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9GISEL-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
 ; GFX9GISEL-NEXT:    s_mov_b64 s[4:5], exec
@@ -2701,7 +3668,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9GISEL-NEXT:    s_mul_i32 s5, s8, s5
 ; GFX9GISEL-NEXT:    s_add_u32 s4, s7, s4
 ; GFX9GISEL-NEXT:    s_add_u32 s7, s4, s5
-; GFX9GISEL-NEXT:  .LBB8_4: ; %endif
+; GFX9GISEL-NEXT:  .LBB9_4: ; %endif
 ; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -2718,7 +3685,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064DAGISEL-NEXT:    ; implicit-def: $sgpr8_sgpr9
 ; GFX1064DAGISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064DAGISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1064DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s8, s[8:9]
@@ -2731,7 +3698,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064DAGISEL-NEXT:    s_add_u32 s3, s10, s3
 ; GFX1064DAGISEL-NEXT:    s_mul_i32 s8, s2, s8
 ; GFX1064DAGISEL-NEXT:    s_add_u32 s9, s3, s9
-; GFX1064DAGISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX1064DAGISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[4:5]
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s8
@@ -2763,7 +3730,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
@@ -2776,10 +3743,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064GISEL-NEXT:    s_add_u32 s3, s10, s3
 ; GFX1064GISEL-NEXT:    s_mul_i32 s6, s2, s6
 ; GFX1064GISEL-NEXT:    s_add_u32 s7, s3, s7
-; GFX1064GISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX1064GISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB8_4
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[4:5], exec
@@ -2793,7 +3760,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064GISEL-NEXT:    s_add_u32 s7, s8, s7
 ; GFX1064GISEL-NEXT:    s_mul_i32 s6, s6, s4
 ; GFX1064GISEL-NEXT:    s_add_u32 s7, s7, s5
-; GFX1064GISEL-NEXT:  .LBB8_4: ; %endif
+; GFX1064GISEL-NEXT:  .LBB9_4: ; %endif
 ; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -2810,7 +3777,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032DAGISEL-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX1032DAGISEL-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; GFX1032DAGISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1032DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s4, exec_lo
 ; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s4, s4
@@ -2823,7 +3790,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032DAGISEL-NEXT:    s_add_u32 s3, s9, s3
 ; GFX1032DAGISEL-NEXT:    s_mul_i32 s4, s2, s4
 ; GFX1032DAGISEL-NEXT:    s_add_u32 s5, s3, s5
-; GFX1032DAGISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX1032DAGISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s2, s8
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
@@ -2855,7 +3822,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; GFX1032GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s6, s6
@@ -2868,10 +3835,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032GISEL-NEXT:    s_add_u32 s3, s9, s3
 ; GFX1032GISEL-NEXT:    s_mul_i32 s6, s2, s6
 ; GFX1032GISEL-NEXT:    s_add_u32 s7, s3, s7
-; GFX1032GISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX1032GISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s2, s8
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB8_4
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1032GISEL-NEXT:    s_mov_b32 s3, exec_lo
@@ -2885,7 +3852,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032GISEL-NEXT:    s_add_u32 s5, s5, s7
 ; GFX1032GISEL-NEXT:    s_mul_i32 s6, s6, s3
 ; GFX1032GISEL-NEXT:    s_add_u32 s7, s5, s4
-; GFX1032GISEL-NEXT:  .LBB8_4: ; %endif
+; GFX1032GISEL-NEXT:  .LBB9_4: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -2904,7 +3871,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
 ; GFX1164DAGISEL-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
-; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1164DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -2919,7 +3886,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164DAGISEL-NEXT:    s_add_u32 s3, s10, s3
 ; GFX1164DAGISEL-NEXT:    s_mul_i32 s8, s2, s8
 ; GFX1164DAGISEL-NEXT:    s_add_u32 s9, s3, s9
-; GFX1164DAGISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX1164DAGISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[6:7]
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s8
@@ -2955,7 +3922,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -2970,10 +3937,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164GISEL-NEXT:    s_add_u32 s3, s10, s3
 ; GFX1164GISEL-NEXT:    s_mul_i32 s6, s2, s6
 ; GFX1164GISEL-NEXT:    s_add_u32 s7, s3, s7
-; GFX1164GISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX1164GISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[2:3], s[8:9]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB8_4
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], exec
@@ -2989,7 +3956,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164GISEL-NEXT:    s_add_u32 s5, s8, s5
 ; GFX1164GISEL-NEXT:    s_mul_i32 s6, s4, s6
 ; GFX1164GISEL-NEXT:    s_add_u32 s7, s5, s7
-; GFX1164GISEL-NEXT:  .LBB8_4: ; %endif
+; GFX1164GISEL-NEXT:  .LBB9_4: ; %endif
 ; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -3008,7 +3975,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
 ; GFX1132DAGISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1132DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -3023,7 +3990,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132DAGISEL-NEXT:    s_add_u32 s3, s9, s3
 ; GFX1132DAGISEL-NEXT:    s_mul_i32 s6, s2, s6
 ; GFX1132DAGISEL-NEXT:    s_add_u32 s7, s3, s7
-; GFX1132DAGISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX1132DAGISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s2, s8
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
@@ -3058,7 +4025,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1132GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132GISEL-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -3073,10 +4040,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132GISEL-NEXT:    s_add_u32 s3, s9, s3
 ; GFX1132GISEL-NEXT:    s_mul_i32 s6, s2, s6
 ; GFX1132GISEL-NEXT:    s_add_u32 s7, s3, s7
-; GFX1132GISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX1132GISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s2, s8
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB8_4
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX1132GISEL-NEXT:    s_mov_b32 s3, exec_lo
@@ -3092,7 +4059,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132GISEL-NEXT:    s_add_u32 s5, s7, s5
 ; GFX1132GISEL-NEXT:    s_mul_i32 s6, s4, s3
 ; GFX1132GISEL-NEXT:    s_add_u32 s7, s5, s8
-; GFX1132GISEL-NEXT:  .LBB8_4: ; %endif
+; GFX1132GISEL-NEXT:  .LBB9_4: ; %endif
 ; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
@@ -3110,7 +4077,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
 ; GFX12DAGISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX12DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX12DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX12DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX12DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -3125,7 +4092,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX12DAGISEL-NEXT:    s_add_co_u32 s3, s9, s3
 ; GFX12DAGISEL-NEXT:    s_mul_i32 s6, s2, s6
 ; GFX12DAGISEL-NEXT:    s_add_co_u32 s7, s3, s7
-; GFX12DAGISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX12DAGISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX12DAGISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12DAGISEL-NEXT:    s_or_saveexec_b32 s2, s8
 ; GFX12DAGISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7



More information about the llvm-branch-commits mailing list