[llvm-branch-commits] [llvm] [AMDGPU] DPP wave reduction for double types - 1 (PR #189390)

via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Mon Mar 30 07:10:12 PDT 2026


https://github.com/easyonaadit created https://github.com/llvm/llvm-project/pull/189390

Supported Ops: `fmin` and `fmax`

>From 370aa9e4f7d3020345c7bd2b0e68cce4f82e4ad8 Mon Sep 17 00:00:00 2001
From: Aaditya <Aaditya.AlokDeshpande at amd.com>
Date: Mon, 30 Mar 2026 18:46:47 +0530
Subject: [PATCH] [AMDGPU] DPP wave reduction for double types - 1

Supported Ops: `fmin` and `fmax`
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |   40 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.fmax.ll | 1346 ++++++++++++++---
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.fmin.ll | 1346 ++++++++++++++---
 3 files changed, 2251 insertions(+), 481 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index dc7f0906159c4..dcc342638c5c1 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5741,6 +5741,10 @@ getDPPOpcForWaveReduction(unsigned Opc, const GCNSubtarget &ST) {
   case AMDGPU::S_AND_B64:
   case AMDGPU::S_OR_B64:
   case AMDGPU::S_XOR_B64:
+  case AMDGPU::V_MIN_NUM_F64_e64:
+  case AMDGPU::V_MIN_F64_e64:
+  case AMDGPU::V_MAX_NUM_F64_e64:
+  case AMDGPU::V_MAX_F64_e64:
     DPPOpc = AMDGPU::V_MOV_B64_DPP_PSEUDO;
     break;
   default:
@@ -6308,10 +6312,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
                                       unsigned DPPCtrl) {
         auto DPPInstr =
             BuildMI(*CurrBB, MI, DL, TII->get(DPPOpc), Dst).addReg(Src); // old
-        if (isFPOp)
+        if (isFPOp && !NeedsMovDPP)
           DPPInstr.addImm(SISrcMods::NONE); // src0 modifier
         DPPInstr.addReg(Src);               // src0
-        if (isFPOp)
+        if (isFPOp && !NeedsMovDPP)
           DPPInstr.addImm(SISrcMods::NONE); // src1 modifier
         if (!NeedsMovDPP)
           DPPInstr.addReg(Src); // src1
@@ -6378,17 +6382,27 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
                           CarryReg);
           BuildRegSequence(*CurrBB, MI, ReturnReg, ResLo, ResHi);
         } else {
-          Register CmpMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
-          BuildMI(*CurrBB, MI, DL, TII->get(Opc), CmpMaskReg)
-              .addReg(Src0)  // src0
-              .addReg(Src1); // src1
-          LastBcastInstr =
-              BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B64_PSEUDO),
-                      ReturnReg)
-                  .addReg(Src1)        // src0
-                  .addReg(Src0)        // src1
-                  .addReg(CmpMaskReg); // src2
-          CurrBB = Expand64BitV_CND_MASK(*LastBcastInstr, CurrBB);
+          if (isFPOp) {
+            BuildMI(*CurrBB, MI, DL, TII->get(Opc), ReturnReg)
+                .addImm(SISrcMods::NONE) // src0 modifiers
+                .addReg(Src0)
+                .addImm(SISrcMods::NONE) // src1 modifiers
+                .addReg(Src1)
+                .addImm(SISrcMods::NONE)  // clamp
+                .addImm(SISrcMods::NONE); // omod
+          } else {
+            Register CmpMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
+            BuildMI(*CurrBB, MI, DL, TII->get(Opc), CmpMaskReg)
+                .addReg(Src0)  // src0
+                .addReg(Src1); // src1
+            LastBcastInstr =
+                BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B64_PSEUDO),
+                        ReturnReg)
+                    .addReg(Src1)        // src0
+                    .addReg(Src0)        // src1
+                    .addReg(CmpMaskReg); // src2
+            CurrBB = Expand64BitV_CND_MASK(*LastBcastInstr, CurrBB);
+          }
         }
         return ReturnReg;
       };
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fmax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fmax.ll
index fbe099b46dc21..1e151ccf20c36 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fmax.ll
@@ -858,6 +858,884 @@ entry:
   ret void
 }
 
+define void @divergent_value_double_dpp(ptr addrspace(1) %out, double %in) {
+; GFX8DAGISEL-LABEL: divergent_value_double_dpp:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
+; GFX8DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v2, s[4:5]
+; GFX8DAGISEL-NEXT:    v_cndmask_b32_e64 v6, v4, v3, s[4:5]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v8, v6
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v8, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_max_f64 v[4:5], v[5:6], v[7:8]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8GISEL-LABEL: divergent_value_double_dpp:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
+; GFX8GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v2, s[4:5]
+; GFX8GISEL-NEXT:    v_cndmask_b32_e64 v6, v4, v3, s[4:5]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v8, v6
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v8, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_max_f64 v[4:5], v[5:6], v[7:8]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX8GISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX8GISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9DAGISEL-LABEL: divergent_value_double_dpp:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
+; GFX9DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v2, s[4:5]
+; GFX9DAGISEL-NEXT:    v_cndmask_b32_e64 v6, v4, v3, s[4:5]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v8, v6
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v8, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_max_f64 v[4:5], v[5:6], v[7:8]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9GISEL-LABEL: divergent_value_double_dpp:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
+; GFX9GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v2, s[4:5]
+; GFX9GISEL-NEXT:    v_cndmask_b32_e64 v6, v4, v3, s[4:5]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v8, v6
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v8, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_max_f64 v[4:5], v[5:6], v[7:8]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX9GISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX9GISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1064DAGISEL-LABEL: divergent_value_double_dpp:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX1064DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0x7ff80000, v3, s[4:5]
+; GFX1064DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1064DAGISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1064DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v6, -1, v8
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_e32 v6, 32, v6
+; GFX1064DAGISEL-NEXT:    v_mul_lo_u32 v6, 4, v6
+; GFX1064DAGISEL-NEXT:    ds_permute_b32 v7, v6, v4
+; GFX1064DAGISEL-NEXT:    ds_permute_b32 v8, v6, v5
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[7:8]
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX1064DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT:    s_clause 0x7 ; 32-byte Folded Reload
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:16
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:20
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:24
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:28
+; GFX1064DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1064GISEL-LABEL: divergent_value_double_dpp:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX1064GISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX1064GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX1064GISEL-NEXT:    v_cndmask_b32_e64 v5, 0x7ff80000, v3, s[4:5]
+; GFX1064GISEL-NEXT:    v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1064GISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1064GISEL-NEXT:    v_mbcnt_hi_u32_b32 v6, -1, v8
+; GFX1064GISEL-NEXT:    v_add_nc_u32_e32 v6, 32, v6
+; GFX1064GISEL-NEXT:    v_mul_lo_u32 v6, 4, v6
+; GFX1064GISEL-NEXT:    ds_permute_b32 v7, v6, v4
+; GFX1064GISEL-NEXT:    ds_permute_b32 v8, v6, v5
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[7:8]
+; GFX1064GISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX1064GISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX1064GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX1064GISEL-NEXT:    s_clause 0x7 ; 32-byte Folded Reload
+; GFX1064GISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32
+; GFX1064GISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1064GISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1064GISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1064GISEL-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:16
+; GFX1064GISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:20
+; GFX1064GISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:24
+; GFX1064GISEL-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:28
+; GFX1064GISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1032DAGISEL-LABEL: divergent_value_double_dpp:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX1032DAGISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX1032DAGISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s6
+; GFX1032DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0x7ff80000, v3, s6
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1032DAGISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s4, v4, 31
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s5, v5, 31
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s6
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1032DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032DAGISEL-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX1032DAGISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
+; GFX1032DAGISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32
+; GFX1032DAGISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1032DAGISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1032DAGISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1032DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1032DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1032GISEL-LABEL: divergent_value_double_dpp:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX1032GISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX1032GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s6
+; GFX1032GISEL-NEXT:    v_cndmask_b32_e64 v5, 0x7ff80000, v3, s6
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1032GISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1032GISEL-NEXT:    v_readlane_b32 s4, v4, 31
+; GFX1032GISEL-NEXT:    v_readlane_b32 s5, v5, 31
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s6
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1032GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032GISEL-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX1032GISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
+; GFX1032GISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32
+; GFX1032GISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1032GISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1032GISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1032GISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1032GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1164DAGISEL-LABEL: divergent_value_double_dpp:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT:    s_clause 0x3 ; 28-byte Folded Spill
+; GFX1164DAGISEL-NEXT:    scratch_store_b64 off, v[4:5], s32
+; GFX1164DAGISEL-NEXT:    scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX1164DAGISEL-NEXT:    scratch_store_b32 off, v6, s32 offset:16
+; GFX1164DAGISEL-NEXT:    scratch_store_b64 off, v[7:8], s32 offset:20
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[0:1]
+; GFX1164DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0x7ff80000, v3, s[0:1]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v6, -1, 0
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v6, -1, v6
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_e32 v6, 32, v6
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_mul_lo_u32 v6, 4, v6
+; GFX1164DAGISEL-NEXT:    ds_permute_b32 v7, v6, v4
+; GFX1164DAGISEL-NEXT:    ds_permute_b32 v8, v6, v5
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[7:8]
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s2, v4, 63
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s3, v5, 63
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1164DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1164DAGISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT:    s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164DAGISEL-NEXT:    s_clause 0x3 ; 28-byte Folded Reload
+; GFX1164DAGISEL-NEXT:    scratch_load_b64 v[4:5], off, s32
+; GFX1164DAGISEL-NEXT:    scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX1164DAGISEL-NEXT:    scratch_load_b32 v6, off, s32 offset:16
+; GFX1164DAGISEL-NEXT:    scratch_load_b64 v[7:8], off, s32 offset:20
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1164GISEL-LABEL: divergent_value_double_dpp:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT:    s_clause 0x3 ; 28-byte Folded Spill
+; GFX1164GISEL-NEXT:    scratch_store_b64 off, v[4:5], s32
+; GFX1164GISEL-NEXT:    scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX1164GISEL-NEXT:    scratch_store_b32 off, v6, s32 offset:16
+; GFX1164GISEL-NEXT:    scratch_store_b64 off, v[7:8], s32 offset:20
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[0:1]
+; GFX1164GISEL-NEXT:    v_cndmask_b32_e64 v5, 0x7ff80000, v3, s[0:1]
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164GISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164GISEL-NEXT:    v_mbcnt_lo_u32_b32 v6, -1, 0
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_mbcnt_hi_u32_b32 v6, -1, v6
+; GFX1164GISEL-NEXT:    v_add_nc_u32_e32 v6, 32, v6
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_mul_lo_u32 v6, 4, v6
+; GFX1164GISEL-NEXT:    ds_permute_b32 v7, v6, v4
+; GFX1164GISEL-NEXT:    ds_permute_b32 v8, v6, v5
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[7:8]
+; GFX1164GISEL-NEXT:    v_readlane_b32 s2, v4, 63
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s3, v5, 63
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1164GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1164GISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT:    s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164GISEL-NEXT:    s_clause 0x3 ; 28-byte Folded Reload
+; GFX1164GISEL-NEXT:    scratch_load_b64 v[4:5], off, s32
+; GFX1164GISEL-NEXT:    scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX1164GISEL-NEXT:    scratch_load_b32 v6, off, s32 offset:16
+; GFX1164GISEL-NEXT:    scratch_load_b64 v[7:8], off, s32 offset:20
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-LABEL: divergent_value_double_dpp:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT:    s_clause 0x1 ; 16-byte Folded Spill
+; GFX1132DAGISEL-NEXT:    scratch_store_b64 off, v[4:5], s32
+; GFX1132DAGISEL-NEXT:    scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s2
+; GFX1132DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0x7ff80000, v3, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s0, v4, 31
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s1, v5, 31
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1132DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT:    s_clause 0x1 ; 16-byte Folded Reload
+; GFX1132DAGISEL-NEXT:    scratch_load_b64 v[4:5], off, s32
+; GFX1132DAGISEL-NEXT:    scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1132GISEL-LABEL: divergent_value_double_dpp:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT:    s_clause 0x1 ; 16-byte Folded Spill
+; GFX1132GISEL-NEXT:    scratch_store_b64 off, v[4:5], s32
+; GFX1132GISEL-NEXT:    scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s2
+; GFX1132GISEL-NEXT:    v_cndmask_b32_e64 v5, 0x7ff80000, v3, s2
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132GISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132GISEL-NEXT:    v_readlane_b32 s0, v4, 31
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s1, v5, 31
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1132GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT:    s_clause 0x1 ; 16-byte Folded Reload
+; GFX1132GISEL-NEXT:    scratch_load_b64 v[4:5], off, s32
+; GFX1132GISEL-NEXT:    scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12DAGISEL-LABEL: divergent_value_double_dpp:
+; GFX12DAGISEL:       ; %bb.0: ; %entry
+; GFX12DAGISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12DAGISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12DAGISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12DAGISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12DAGISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX12DAGISEL-NEXT:    s_clause 0x1 ; 16-byte Folded Spill
+; GFX12DAGISEL-NEXT:    scratch_store_b64 off, v[4:5], s32
+; GFX12DAGISEL-NEXT:    scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX12DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s2
+; GFX12DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0x7ff80000, v3, s2
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX12DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    v_max_num_f64_e32 v[4:5], v[4:5], v[6:7]
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX12DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    v_max_num_f64_e32 v[4:5], v[4:5], v[6:7]
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX12DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    v_max_num_f64_e32 v[4:5], v[4:5], v[6:7]
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX12DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    v_max_num_f64_e32 v[4:5], v[4:5], v[6:7]
+; GFX12DAGISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX12DAGISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX12DAGISEL-NEXT:    s_wait_dscnt 0x0
+; GFX12DAGISEL-NEXT:    v_max_num_f64_e32 v[4:5], v[4:5], v[6:7]
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12DAGISEL-NEXT:    v_readlane_b32 s0, v4, 31
+; GFX12DAGISEL-NEXT:    v_readlane_b32 s1, v5, 31
+; GFX12DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX12DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX12DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX12DAGISEL-NEXT:    s_clause 0x1 ; 16-byte Folded Reload
+; GFX12DAGISEL-NEXT:    scratch_load_b64 v[4:5], off, s32
+; GFX12DAGISEL-NEXT:    scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX12DAGISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %result = call double @llvm.amdgcn.wave.reduce.fmax(double %in, i32 2)
+  store double %result, ptr addrspace(1) %out
+  ret void
+}
+
 define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX8DAGISEL-LABEL: divergent_cfg_float:
 ; GFX8DAGISEL:       ; %bb.0: ; %entry
@@ -867,11 +1745,11 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX8DAGISEL-NEXT:    ; implicit-def: $vgpr4
 ; GFX8DAGISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX8DAGISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB3_4
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB4_4
 ; GFX8DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX8DAGISEL-NEXT:    s_mov_b32 s8, 0x7fc00000
-; GFX8DAGISEL-NEXT:  .LBB3_2: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB4_2: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s9, s[6:7]
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s10, v2, s9
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s10
@@ -879,17 +1757,17 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX8DAGISEL-NEXT:    v_max_f32_e32 v3, s8, v3
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX8DAGISEL-NEXT:    v_readfirstlane_b32 s8, v3
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_2
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_2
 ; GFX8DAGISEL-NEXT:  ; %bb.3:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v4, s8
 ; GFX8DAGISEL-NEXT:    ; implicit-def: $vgpr3
-; GFX8DAGISEL-NEXT:  .LBB3_4: ; %Flow
+; GFX8DAGISEL-NEXT:  .LBB4_4: ; %Flow
 ; GFX8DAGISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB3_8
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB4_8
 ; GFX8DAGISEL-NEXT:  ; %bb.5: ; %if
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX8DAGISEL-NEXT:    s_mov_b32 s8, 0x7fc00000
-; GFX8DAGISEL-NEXT:  .LBB3_6: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB4_6: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s9, s[6:7]
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s10, v3, s9
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s10
@@ -897,10 +1775,10 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX8DAGISEL-NEXT:    v_max_f32_e32 v2, s8, v2
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX8DAGISEL-NEXT:    v_readfirstlane_b32 s8, v2
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_6
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_6
 ; GFX8DAGISEL-NEXT:  ; %bb.7:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v4, s8
-; GFX8DAGISEL-NEXT:  .LBB3_8: ; %endif
+; GFX8DAGISEL-NEXT:  .LBB4_8: ; %endif
 ; GFX8DAGISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v4
 ; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0)
@@ -914,12 +1792,12 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX8GISEL-NEXT:    ; implicit-def: $sgpr8
 ; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX8GISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB3_3
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX8GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX8GISEL-NEXT:    s_mov_b32 s8, 0x7fc00000
 ; GFX8GISEL-NEXT:    ; implicit-def: $vgpr3
-; GFX8GISEL-NEXT:  .LBB3_2: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB4_2: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s9, s[6:7]
 ; GFX8GISEL-NEXT:    v_readlane_b32 s10, v2, s9
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v4, s10
@@ -927,14 +1805,14 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX8GISEL-NEXT:    v_max_f32_e32 v4, s8, v4
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX8GISEL-NEXT:    v_readfirstlane_b32 s8, v4
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB3_2
-; GFX8GISEL-NEXT:  .LBB3_3: ; %Flow
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB4_2
+; GFX8GISEL-NEXT:  .LBB4_3: ; %Flow
 ; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB3_6
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB4_6
 ; GFX8GISEL-NEXT:  ; %bb.4: ; %if
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX8GISEL-NEXT:    s_mov_b32 s8, 0x7fc00000
-; GFX8GISEL-NEXT:  .LBB3_5: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB4_5: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s9, s[6:7]
 ; GFX8GISEL-NEXT:    v_readlane_b32 s10, v3, s9
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s10
@@ -942,8 +1820,8 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX8GISEL-NEXT:    v_max_f32_e32 v2, s8, v2
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX8GISEL-NEXT:    v_readfirstlane_b32 s8, v2
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB3_5
-; GFX8GISEL-NEXT:  .LBB3_6: ; %endif
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB4_5
+; GFX8GISEL-NEXT:  .LBB4_6: ; %endif
 ; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
@@ -958,11 +1836,11 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX9DAGISEL-NEXT:    ; implicit-def: $vgpr4
 ; GFX9DAGISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9DAGISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB3_4
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB4_4
 ; GFX9DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX9DAGISEL-NEXT:    s_mov_b32 s8, 0x7fc00000
-; GFX9DAGISEL-NEXT:  .LBB3_2: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB4_2: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s9, s[6:7]
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s10, v2, s9
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v3, s10
@@ -970,17 +1848,17 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX9DAGISEL-NEXT:    v_max_f32_e32 v3, s8, v3
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX9DAGISEL-NEXT:    v_readfirstlane_b32 s8, v3
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_2
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_2
 ; GFX9DAGISEL-NEXT:  ; %bb.3:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v4, s8
 ; GFX9DAGISEL-NEXT:    ; implicit-def: $vgpr3
-; GFX9DAGISEL-NEXT:  .LBB3_4: ; %Flow
+; GFX9DAGISEL-NEXT:  .LBB4_4: ; %Flow
 ; GFX9DAGISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB3_8
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB4_8
 ; GFX9DAGISEL-NEXT:  ; %bb.5: ; %if
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX9DAGISEL-NEXT:    s_mov_b32 s8, 0x7fc00000
-; GFX9DAGISEL-NEXT:  .LBB3_6: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB4_6: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s9, s[6:7]
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s10, v3, s9
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s10
@@ -988,10 +1866,10 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX9DAGISEL-NEXT:    v_max_f32_e32 v2, s8, v2
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX9DAGISEL-NEXT:    v_readfirstlane_b32 s8, v2
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_6
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_6
 ; GFX9DAGISEL-NEXT:  ; %bb.7:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v4, s8
-; GFX9DAGISEL-NEXT:  .LBB3_8: ; %endif
+; GFX9DAGISEL-NEXT:  .LBB4_8: ; %endif
 ; GFX9DAGISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9DAGISEL-NEXT:    global_store_dword v[0:1], v4, off
 ; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0)
@@ -1005,12 +1883,12 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX9GISEL-NEXT:    ; implicit-def: $sgpr8
 ; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9GISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB3_3
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX9GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX9GISEL-NEXT:    s_mov_b32 s8, 0x7fc00000
 ; GFX9GISEL-NEXT:    ; implicit-def: $vgpr3
-; GFX9GISEL-NEXT:  .LBB3_2: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB4_2: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s9, s[6:7]
 ; GFX9GISEL-NEXT:    v_readlane_b32 s10, v2, s9
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v4, s10
@@ -1018,14 +1896,14 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX9GISEL-NEXT:    v_max_f32_e32 v4, s8, v4
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX9GISEL-NEXT:    v_readfirstlane_b32 s8, v4
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB3_2
-; GFX9GISEL-NEXT:  .LBB3_3: ; %Flow
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB4_2
+; GFX9GISEL-NEXT:  .LBB4_3: ; %Flow
 ; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB3_6
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB4_6
 ; GFX9GISEL-NEXT:  ; %bb.4: ; %if
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX9GISEL-NEXT:    s_mov_b32 s8, 0x7fc00000
-; GFX9GISEL-NEXT:  .LBB3_5: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB4_5: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s9, s[6:7]
 ; GFX9GISEL-NEXT:    v_readlane_b32 s10, v3, s9
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s10
@@ -1033,8 +1911,8 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX9GISEL-NEXT:    v_max_f32_e32 v2, s8, v2
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX9GISEL-NEXT:    v_readfirstlane_b32 s8, v2
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB3_5
-; GFX9GISEL-NEXT:  .LBB3_6: ; %endif
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB4_5
+; GFX9GISEL-NEXT:  .LBB4_6: ; %endif
 ; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX9GISEL-NEXT:    global_store_dword v[0:1], v2, off
@@ -1049,38 +1927,38 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX1064DAGISEL-NEXT:    ; implicit-def: $vgpr4
 ; GFX1064DAGISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064DAGISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB3_4
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB4_4
 ; GFX1064DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX1064DAGISEL-NEXT:    s_mov_b32 s8, 0x7fc00000
-; GFX1064DAGISEL-NEXT:  .LBB3_2: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB4_2: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s9, s[6:7]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s10, v2, s9
 ; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s9
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX1064DAGISEL-NEXT:    v_max_f32_e64 v3, s8, s10
 ; GFX1064DAGISEL-NEXT:    v_readfirstlane_b32 s8, v3
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_2
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_2
 ; GFX1064DAGISEL-NEXT:  ; %bb.3:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v4, s8
 ; GFX1064DAGISEL-NEXT:    ; implicit-def: $vgpr3
-; GFX1064DAGISEL-NEXT:  .LBB3_4: ; %Flow
+; GFX1064DAGISEL-NEXT:  .LBB4_4: ; %Flow
 ; GFX1064DAGISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB3_8
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB4_8
 ; GFX1064DAGISEL-NEXT:  ; %bb.5: ; %if
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX1064DAGISEL-NEXT:    s_mov_b32 s8, 0x7fc00000
-; GFX1064DAGISEL-NEXT:  .LBB3_6: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB4_6: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s9, s[6:7]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s10, v3, s9
 ; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s9
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX1064DAGISEL-NEXT:    v_max_f32_e64 v2, s8, s10
 ; GFX1064DAGISEL-NEXT:    v_readfirstlane_b32 s8, v2
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_6
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_6
 ; GFX1064DAGISEL-NEXT:  ; %bb.7:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v4, s8
-; GFX1064DAGISEL-NEXT:  .LBB3_8: ; %endif
+; GFX1064DAGISEL-NEXT:  .LBB4_8: ; %endif
 ; GFX1064DAGISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064DAGISEL-NEXT:    global_store_dword v[0:1], v4, off
 ; GFX1064DAGISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1093,11 +1971,11 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX1064GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v4
 ; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB3_3
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX1064GISEL-NEXT:    s_mov_b32 s8, 0x7fc00000
-; GFX1064GISEL-NEXT:  .LBB3_2: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB4_2: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s9, s[6:7]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s10, v2, s9
 ; GFX1064GISEL-NEXT:    s_bitset0_b64 s[6:7], s9
@@ -1105,22 +1983,22 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX1064GISEL-NEXT:    v_max_f32_e64 v3, s8, s10
 ; GFX1064GISEL-NEXT:    v_readfirstlane_b32 s8, v3
 ; GFX1064GISEL-NEXT:    ; implicit-def: $vgpr3
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB3_2
-; GFX1064GISEL-NEXT:  .LBB3_3: ; %Flow
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB4_2
+; GFX1064GISEL-NEXT:  .LBB4_3: ; %Flow
 ; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB3_6
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB4_6
 ; GFX1064GISEL-NEXT:  ; %bb.4: ; %if
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX1064GISEL-NEXT:    s_mov_b32 s8, 0x7fc00000
-; GFX1064GISEL-NEXT:  .LBB3_5: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB4_5: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s9, s[6:7]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s10, v3, s9
 ; GFX1064GISEL-NEXT:    s_bitset0_b64 s[6:7], s9
 ; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX1064GISEL-NEXT:    v_max_f32_e64 v2, s8, s10
 ; GFX1064GISEL-NEXT:    v_readfirstlane_b32 s8, v2
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB3_5
-; GFX1064GISEL-NEXT:  .LBB3_6: ; %endif
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB4_5
+; GFX1064GISEL-NEXT:  .LBB4_6: ; %endif
 ; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX1064GISEL-NEXT:    global_store_dword v[0:1], v2, off
@@ -1134,38 +2012,38 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX1032DAGISEL-NEXT:    ; implicit-def: $vgpr4
 ; GFX1032DAGISEL-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1032DAGISEL-NEXT:    s_xor_b32 s4, exec_lo, s4
-; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB3_4
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB4_4
 ; GFX1032DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s5, exec_lo
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s6, 0x7fc00000
-; GFX1032DAGISEL-NEXT:  .LBB3_2: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB4_2: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s7, s5
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s8, v2, s7
 ; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s5, s7
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX1032DAGISEL-NEXT:    v_max_f32_e64 v3, s6, s8
 ; GFX1032DAGISEL-NEXT:    v_readfirstlane_b32 s6, v3
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_2
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_2
 ; GFX1032DAGISEL-NEXT:  ; %bb.3:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v4, s6
 ; GFX1032DAGISEL-NEXT:    ; implicit-def: $vgpr3
-; GFX1032DAGISEL-NEXT:  .LBB3_4: ; %Flow
+; GFX1032DAGISEL-NEXT:  .LBB4_4: ; %Flow
 ; GFX1032DAGISEL-NEXT:    s_andn2_saveexec_b32 s4, s4
-; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB3_8
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB4_8
 ; GFX1032DAGISEL-NEXT:  ; %bb.5: ; %if
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s5, exec_lo
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s6, 0x7fc00000
-; GFX1032DAGISEL-NEXT:  .LBB3_6: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB4_6: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s7, s5
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s8, v3, s7
 ; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s5, s7
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX1032DAGISEL-NEXT:    v_max_f32_e64 v2, s6, s8
 ; GFX1032DAGISEL-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_6
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_6
 ; GFX1032DAGISEL-NEXT:  ; %bb.7:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v4, s6
-; GFX1032DAGISEL-NEXT:  .LBB3_8: ; %endif
+; GFX1032DAGISEL-NEXT:  .LBB4_8: ; %endif
 ; GFX1032DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1032DAGISEL-NEXT:    global_store_dword v[0:1], v4, off
 ; GFX1032DAGISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1178,11 +2056,11 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX1032GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 16, v4
 ; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s5, vcc_lo
 ; GFX1032GISEL-NEXT:    s_xor_b32 s5, exec_lo, s5
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB3_3
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX1032GISEL-NEXT:    s_mov_b32 s4, 0x7fc00000
-; GFX1032GISEL-NEXT:  .LBB3_2: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB4_2: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s8, v2, s7
 ; GFX1032GISEL-NEXT:    s_bitset0_b32 s6, s7
@@ -1190,22 +2068,22 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX1032GISEL-NEXT:    v_max_f32_e64 v3, s4, s8
 ; GFX1032GISEL-NEXT:    v_readfirstlane_b32 s4, v3
 ; GFX1032GISEL-NEXT:    ; implicit-def: $vgpr3
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB3_2
-; GFX1032GISEL-NEXT:  .LBB3_3: ; %Flow
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB4_2
+; GFX1032GISEL-NEXT:  .LBB4_3: ; %Flow
 ; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s5, s5
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB3_6
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB4_6
 ; GFX1032GISEL-NEXT:  ; %bb.4: ; %if
 ; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX1032GISEL-NEXT:    s_mov_b32 s4, 0x7fc00000
-; GFX1032GISEL-NEXT:  .LBB3_5: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB4_5: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s8, v3, s7
 ; GFX1032GISEL-NEXT:    s_bitset0_b32 s6, s7
 ; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX1032GISEL-NEXT:    v_max_f32_e64 v2, s4, s8
 ; GFX1032GISEL-NEXT:    v_readfirstlane_b32 s4, v2
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB3_5
-; GFX1032GISEL-NEXT:  .LBB3_6: ; %endif
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB4_5
+; GFX1032GISEL-NEXT:  .LBB4_6: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032GISEL-NEXT:    global_store_dword v[0:1], v2, off
@@ -1220,11 +2098,11 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX1164DAGISEL-NEXT:    ; implicit-def: $vgpr4
 ; GFX1164DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1164DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB3_4
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB4_4
 ; GFX1164DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164DAGISEL-NEXT:    s_mov_b32 s4, 0x7fc00000
-; GFX1164DAGISEL-NEXT:  .LBB3_2: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB4_2: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s5, s[2:3]
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s6, v2, s5
@@ -1233,17 +2111,17 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX1164DAGISEL-NEXT:    v_max_f32_e64 v3, s4, s6
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164DAGISEL-NEXT:    v_readfirstlane_b32 s4, v3
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_2
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_2
 ; GFX1164DAGISEL-NEXT:  ; %bb.3:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX1164DAGISEL-NEXT:    ; implicit-def: $vgpr3
-; GFX1164DAGISEL-NEXT:  .LBB3_4: ; %Flow
+; GFX1164DAGISEL-NEXT:  .LBB4_4: ; %Flow
 ; GFX1164DAGISEL-NEXT:    s_and_not1_saveexec_b64 s[0:1], s[0:1]
-; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB3_8
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB4_8
 ; GFX1164DAGISEL-NEXT:  ; %bb.5: ; %if
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164DAGISEL-NEXT:    s_mov_b32 s4, 0x7fc00000
-; GFX1164DAGISEL-NEXT:  .LBB3_6: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB4_6: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s5, s[2:3]
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s6, v3, s5
@@ -1252,10 +2130,10 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX1164DAGISEL-NEXT:    v_max_f32_e64 v2, s4, s6
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164DAGISEL-NEXT:    v_readfirstlane_b32 s4, v2
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_6
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_6
 ; GFX1164DAGISEL-NEXT:  ; %bb.7:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
-; GFX1164DAGISEL-NEXT:  .LBB3_8: ; %endif
+; GFX1164DAGISEL-NEXT:  .LBB4_8: ; %endif
 ; GFX1164DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1164DAGISEL-NEXT:    global_store_b32 v[0:1], v4, off
 ; GFX1164DAGISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1269,11 +2147,11 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v4
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB3_3
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164GISEL-NEXT:    s_mov_b32 s4, 0x7fc00000
-; GFX1164GISEL-NEXT:  .LBB3_2: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB4_2: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s5, s[2:3]
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s6, v2, s5
@@ -1283,14 +2161,14 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_readfirstlane_b32 s4, v3
 ; GFX1164GISEL-NEXT:    ; implicit-def: $vgpr3
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB3_2
-; GFX1164GISEL-NEXT:  .LBB3_3: ; %Flow
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB4_2
+; GFX1164GISEL-NEXT:  .LBB4_3: ; %Flow
 ; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[0:1], s[0:1]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB3_6
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB4_6
 ; GFX1164GISEL-NEXT:  ; %bb.4: ; %if
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164GISEL-NEXT:    s_mov_b32 s4, 0x7fc00000
-; GFX1164GISEL-NEXT:  .LBB3_5: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB4_5: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s5, s[2:3]
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s6, v3, s5
@@ -1299,8 +2177,8 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX1164GISEL-NEXT:    v_max_f32_e64 v2, s4, s6
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_readfirstlane_b32 s4, v2
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB3_5
-; GFX1164GISEL-NEXT:  .LBB3_6: ; %endif
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB4_5
+; GFX1164GISEL-NEXT:  .LBB4_6: ; %endif
 ; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1164GISEL-NEXT:    global_store_b32 v[0:1], v2, off
@@ -1315,11 +2193,11 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX1132DAGISEL-NEXT:    ; implicit-def: $vgpr4
 ; GFX1132DAGISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1132DAGISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB3_4
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB4_4
 ; GFX1132DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, 0x7fc00000
-; GFX1132DAGISEL-NEXT:  .LBB3_2: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB4_2: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s1
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s4, v2, s3
@@ -1328,17 +2206,17 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX1132DAGISEL-NEXT:    v_max_f32_e64 v3, s2, s4
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132DAGISEL-NEXT:    v_readfirstlane_b32 s2, v3
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_2
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_2
 ; GFX1132DAGISEL-NEXT:  ; %bb.3:
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX1132DAGISEL-NEXT:    ; implicit-def: $vgpr3
-; GFX1132DAGISEL-NEXT:  .LBB3_4: ; %Flow
+; GFX1132DAGISEL-NEXT:  .LBB4_4: ; %Flow
 ; GFX1132DAGISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB3_8
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB4_8
 ; GFX1132DAGISEL-NEXT:  ; %bb.5: ; %if
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, 0x7fc00000
-; GFX1132DAGISEL-NEXT:  .LBB3_6: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB4_6: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s1
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s4, v3, s3
@@ -1347,10 +2225,10 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX1132DAGISEL-NEXT:    v_max_f32_e64 v2, s2, s4
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132DAGISEL-NEXT:    v_readfirstlane_b32 s2, v2
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_6
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_6
 ; GFX1132DAGISEL-NEXT:  ; %bb.7:
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v4, s2
-; GFX1132DAGISEL-NEXT:  .LBB3_8: ; %endif
+; GFX1132DAGISEL-NEXT:  .LBB4_8: ; %endif
 ; GFX1132DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1132DAGISEL-NEXT:    global_store_b32 v[0:1], v4, off
 ; GFX1132DAGISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1364,11 +2242,11 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v4
 ; GFX1132GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB3_3
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132GISEL-NEXT:    s_mov_b32 s0, 0x7fc00000
-; GFX1132GISEL-NEXT:  .LBB3_2: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB4_2: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s4, v2, s3
@@ -1378,14 +2256,14 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_readfirstlane_b32 s0, v3
 ; GFX1132GISEL-NEXT:    ; implicit-def: $vgpr3
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB3_2
-; GFX1132GISEL-NEXT:  .LBB3_3: ; %Flow
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB4_2
+; GFX1132GISEL-NEXT:  .LBB4_3: ; %Flow
 ; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s1, s1
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB3_6
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB4_6
 ; GFX1132GISEL-NEXT:  ; %bb.4: ; %if
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132GISEL-NEXT:    s_mov_b32 s0, 0x7fc00000
-; GFX1132GISEL-NEXT:  .LBB3_5: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB4_5: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s4, v3, s3
@@ -1394,8 +2272,8 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX1132GISEL-NEXT:    v_max_f32_e64 v2, s0, s4
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_readfirstlane_b32 s0, v2
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB3_5
-; GFX1132GISEL-NEXT:  .LBB3_6: ; %endif
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB4_5
+; GFX1132GISEL-NEXT:  .LBB4_6: ; %endif
 ; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX1132GISEL-NEXT:    global_store_b32 v[0:1], v2, off
@@ -1415,11 +2293,11 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX12DAGISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX12DAGISEL-NEXT:    s_cbranch_execz .LBB3_4
+; GFX12DAGISEL-NEXT:    s_cbranch_execz .LBB4_4
 ; GFX12DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s2, 0x7fc00000
-; GFX12DAGISEL-NEXT:  .LBB3_2: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT:  .LBB4_2: ; =>This Inner Loop Header: Depth=1
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    s_ctz_i32_b32 s3, s1
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
@@ -1430,19 +2308,19 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX12DAGISEL-NEXT:    v_max_num_f32_e64 v3, s2, s4
 ; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12DAGISEL-NEXT:    v_readfirstlane_b32 s2, v3
-; GFX12DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_2
+; GFX12DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_2
 ; GFX12DAGISEL-NEXT:  ; %bb.3:
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12DAGISEL-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX12DAGISEL-NEXT:    ; implicit-def: $vgpr3
-; GFX12DAGISEL-NEXT:  .LBB3_4: ; %Flow
+; GFX12DAGISEL-NEXT:  .LBB4_4: ; %Flow
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX12DAGISEL-NEXT:    s_cbranch_execz .LBB3_8
+; GFX12DAGISEL-NEXT:    s_cbranch_execz .LBB4_8
 ; GFX12DAGISEL-NEXT:  ; %bb.5: ; %if
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s2, 0x7fc00000
-; GFX12DAGISEL-NEXT:  .LBB3_6: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT:  .LBB4_6: ; =>This Inner Loop Header: Depth=1
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    s_ctz_i32_b32 s3, s1
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
@@ -1453,11 +2331,11 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX12DAGISEL-NEXT:    v_max_num_f32_e64 v2, s2, s4
 ; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12DAGISEL-NEXT:    v_readfirstlane_b32 s2, v2
-; GFX12DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_6
+; GFX12DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_6
 ; GFX12DAGISEL-NEXT:  ; %bb.7:
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12DAGISEL-NEXT:    v_mov_b32_e32 v4, s2
-; GFX12DAGISEL-NEXT:  .LBB3_8: ; %endif
+; GFX12DAGISEL-NEXT:  .LBB4_8: ; %endif
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12DAGISEL-NEXT:    global_store_b32 v[0:1], v4, off
@@ -1603,7 +2481,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX8DAGISEL-NEXT:    s_mov_b32 s6, 0
 ; GFX8DAGISEL-NEXT:    s_mov_b32 s7, 0x7ff80000
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
-; GFX8DAGISEL-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[4:5]
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v4, s6
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
@@ -1614,7 +2492,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX8DAGISEL-NEXT:    v_readfirstlane_b32 s6, v4
 ; GFX8DAGISEL-NEXT:    v_readfirstlane_b32 s7, v5
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX8DAGISEL-NEXT:  ; %bb.2:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s7
@@ -1628,7 +2506,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX8GISEL-NEXT:    s_mov_b32 s6, 0
 ; GFX8GISEL-NEXT:    s_mov_b32 s7, 0x7ff80000
 ; GFX8GISEL-NEXT:    s_mov_b64 s[4:5], exec
-; GFX8GISEL-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s10, s[4:5]
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v4, s6
 ; GFX8GISEL-NEXT:    v_readlane_b32 s8, v2, s10
@@ -1639,7 +2517,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX8GISEL-NEXT:    v_readfirstlane_b32 s6, v4
 ; GFX8GISEL-NEXT:    v_readfirstlane_b32 s7, v5
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX8GISEL-NEXT:  ; %bb.2:
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s7
@@ -1653,7 +2531,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX9DAGISEL-NEXT:    s_mov_b32 s6, 0
 ; GFX9DAGISEL-NEXT:    s_mov_b32 s7, 0x7ff80000
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
-; GFX9DAGISEL-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[4:5]
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v4, s6
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
@@ -1664,7 +2542,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX9DAGISEL-NEXT:    v_readfirstlane_b32 s6, v4
 ; GFX9DAGISEL-NEXT:    v_readfirstlane_b32 s7, v5
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX9DAGISEL-NEXT:  ; %bb.2:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v3, s7
@@ -1678,7 +2556,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX9GISEL-NEXT:    s_mov_b32 s6, 0
 ; GFX9GISEL-NEXT:    s_mov_b32 s7, 0x7ff80000
 ; GFX9GISEL-NEXT:    s_mov_b64 s[4:5], exec
-; GFX9GISEL-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s10, s[4:5]
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v4, s6
 ; GFX9GISEL-NEXT:    v_readlane_b32 s8, v2, s10
@@ -1689,7 +2567,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX9GISEL-NEXT:    v_readfirstlane_b32 s6, v4
 ; GFX9GISEL-NEXT:    v_readfirstlane_b32 s7, v5
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX9GISEL-NEXT:  ; %bb.2:
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v3, s7
@@ -1703,7 +2581,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX1064DAGISEL-NEXT:    s_mov_b32 s6, 0
 ; GFX1064DAGISEL-NEXT:    s_mov_b32 s7, 0x7ff80000
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
-; GFX1064DAGISEL-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[4:5]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s9, v3, s10
@@ -1712,7 +2590,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX1064DAGISEL-NEXT:    v_max_f64 v[4:5], s[8:9], s[6:7]
 ; GFX1064DAGISEL-NEXT:    v_readfirstlane_b32 s6, v4
 ; GFX1064DAGISEL-NEXT:    v_readfirstlane_b32 s7, v5
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX1064DAGISEL-NEXT:  ; %bb.2:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v3, s7
@@ -1725,7 +2603,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX1064GISEL-NEXT:    s_mov_b32 s6, 0
 ; GFX1064GISEL-NEXT:    s_mov_b32 s7, 0x7ff80000
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[4:5], exec
-; GFX1064GISEL-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s10, s[4:5]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s9, v3, s10
@@ -1734,7 +2612,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX1064GISEL-NEXT:    v_max_f64 v[4:5], s[8:9], s[6:7]
 ; GFX1064GISEL-NEXT:    v_readfirstlane_b32 s6, v4
 ; GFX1064GISEL-NEXT:    v_readfirstlane_b32 s7, v5
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX1064GISEL-NEXT:  ; %bb.2:
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v3, s7
@@ -1747,7 +2625,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s5, 0x7ff80000
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1032DAGISEL-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s8, v2, s7
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s9, v3, s7
@@ -1756,7 +2634,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX1032DAGISEL-NEXT:    v_max_f64 v[4:5], s[8:9], s[4:5]
 ; GFX1032DAGISEL-NEXT:    v_readfirstlane_b32 s4, v4
 ; GFX1032DAGISEL-NEXT:    v_readfirstlane_b32 s5, v5
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX1032DAGISEL-NEXT:  ; %bb.2:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1769,7 +2647,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX1032GISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX1032GISEL-NEXT:    s_mov_b32 s5, 0x7ff80000
 ; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1032GISEL-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s8, v2, s7
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s9, v3, s7
@@ -1778,7 +2656,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX1032GISEL-NEXT:    v_max_f64 v[4:5], s[8:9], s[4:5]
 ; GFX1032GISEL-NEXT:    v_readfirstlane_b32 s4, v4
 ; GFX1032GISEL-NEXT:    v_readfirstlane_b32 s5, v5
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX1032GISEL-NEXT:  ; %bb.2:
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1791,7 +2669,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX1164DAGISEL-NEXT:    s_mov_b32 s2, 0
 ; GFX1164DAGISEL-NEXT:    s_mov_b32 s3, 0x7ff80000
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164DAGISEL-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s6, s[0:1]
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s4, v2, s6
@@ -1803,7 +2681,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX1164DAGISEL-NEXT:    v_readfirstlane_b32 s2, v4
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164DAGISEL-NEXT:    v_readfirstlane_b32 s3, v5
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX1164DAGISEL-NEXT:  ; %bb.2:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v3, s3
@@ -1816,7 +2694,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX1164GISEL-NEXT:    s_mov_b32 s2, 0
 ; GFX1164GISEL-NEXT:    s_mov_b32 s3, 0x7ff80000
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164GISEL-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s6, s[0:1]
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s4, v2, s6
@@ -1828,7 +2706,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX1164GISEL-NEXT:    v_readfirstlane_b32 s2, v4
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164GISEL-NEXT:    v_readfirstlane_b32 s3, v5
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX1164GISEL-NEXT:  ; %bb.2:
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v3, s3
@@ -1841,7 +2719,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s0, 0
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s1, 0x7ff80000
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX1132DAGISEL-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s4, v2, s3
@@ -1853,7 +2731,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX1132DAGISEL-NEXT:    v_readfirstlane_b32 s0, v4
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132DAGISEL-NEXT:    v_readfirstlane_b32 s1, v5
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX1132DAGISEL-NEXT:  ; %bb.2:
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1132DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -1865,7 +2743,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX1132GISEL-NEXT:    s_mov_b32 s0, 0
 ; GFX1132GISEL-NEXT:    s_mov_b32 s1, 0x7ff80000
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX1132GISEL-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s4, v2, s3
@@ -1877,7 +2755,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX1132GISEL-NEXT:    v_readfirstlane_b32 s0, v4
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132GISEL-NEXT:    v_readfirstlane_b32 s1, v5
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX1132GISEL-NEXT:  ; %bb.2:
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1132GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -1893,7 +2771,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s0, 0
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s1, 0x7ff80000
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX12DAGISEL-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
@@ -1906,7 +2784,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12DAGISEL-NEXT:    v_readfirstlane_b32 s0, v4
 ; GFX12DAGISEL-NEXT:    v_readfirstlane_b32 s1, v5
-; GFX12DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX12DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX12DAGISEL-NEXT:  ; %bb.2:
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
@@ -1927,12 +2805,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX8DAGISEL-NEXT:    ; implicit-def: $vgpr6_vgpr7
 ; GFX8DAGISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX8DAGISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB6_4
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB7_4
 ; GFX8DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8DAGISEL-NEXT:    s_mov_b32 s8, 0
 ; GFX8DAGISEL-NEXT:    s_mov_b32 s9, 0x7ff80000
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8DAGISEL-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v4, s8
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s10, v2, s12
@@ -1943,20 +2821,20 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX8DAGISEL-NEXT:    v_readfirstlane_b32 s8, v4
 ; GFX8DAGISEL-NEXT:    v_readfirstlane_b32 s9, v5
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_2
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_2
 ; GFX8DAGISEL-NEXT:  ; %bb.3:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, s8
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, s9
 ; GFX8DAGISEL-NEXT:    ; implicit-def: $vgpr4
 ; GFX8DAGISEL-NEXT:    ; implicit-def: $vgpr5
-; GFX8DAGISEL-NEXT:  .LBB6_4: ; %Flow
+; GFX8DAGISEL-NEXT:  .LBB7_4: ; %Flow
 ; GFX8DAGISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB6_8
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB7_8
 ; GFX8DAGISEL-NEXT:  ; %bb.5: ; %if
 ; GFX8DAGISEL-NEXT:    s_mov_b32 s8, 0
 ; GFX8DAGISEL-NEXT:    s_mov_b32 s9, 0x7ff80000
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8DAGISEL-NEXT:  .LBB6_6: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB7_6: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s10, v4, s12
@@ -1967,11 +2845,11 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX8DAGISEL-NEXT:    v_readfirstlane_b32 s8, v2
 ; GFX8DAGISEL-NEXT:    v_readfirstlane_b32 s9, v3
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_6
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_6
 ; GFX8DAGISEL-NEXT:  ; %bb.7:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, s8
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, s9
-; GFX8DAGISEL-NEXT:  .LBB6_8: ; %endif
+; GFX8DAGISEL-NEXT:  .LBB7_8: ; %endif
 ; GFX8DAGISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[0:1], v[6:7]
 ; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0)
@@ -1985,12 +2863,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX8GISEL-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
 ; GFX8GISEL-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB6_3
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX8GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8GISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX8GISEL-NEXT:    s_mov_b32 s5, 0x7ff80000
 ; GFX8GISEL-NEXT:    s_mov_b64 s[8:9], exec
-; GFX8GISEL-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s12, s[8:9]
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX8GISEL-NEXT:    v_readlane_b32 s10, v2, s12
@@ -2003,15 +2881,15 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX8GISEL-NEXT:    v_readfirstlane_b32 s5, v5
 ; GFX8GISEL-NEXT:    ; implicit-def: $vgpr4
 ; GFX8GISEL-NEXT:    ; implicit-def: $vgpr5
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB6_2
-; GFX8GISEL-NEXT:  .LBB6_3: ; %Flow
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB7_2
+; GFX8GISEL-NEXT:  .LBB7_3: ; %Flow
 ; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB7_6
 ; GFX8GISEL-NEXT:  ; %bb.4: ; %if
 ; GFX8GISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX8GISEL-NEXT:    s_mov_b32 s5, 0x7ff80000
 ; GFX8GISEL-NEXT:    s_mov_b64 s[8:9], exec
-; GFX8GISEL-NEXT:  .LBB6_5: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB7_5: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s12, s[8:9]
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8GISEL-NEXT:    v_readlane_b32 s10, v4, s12
@@ -2022,8 +2900,8 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[8:9], 0
 ; GFX8GISEL-NEXT:    v_readfirstlane_b32 s4, v2
 ; GFX8GISEL-NEXT:    v_readfirstlane_b32 s5, v3
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB6_5
-; GFX8GISEL-NEXT:  .LBB6_6: ; %endif
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB7_5
+; GFX8GISEL-NEXT:  .LBB7_6: ; %endif
 ; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2039,12 +2917,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX9DAGISEL-NEXT:    ; implicit-def: $vgpr6_vgpr7
 ; GFX9DAGISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9DAGISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB6_4
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB7_4
 ; GFX9DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9DAGISEL-NEXT:    s_mov_b32 s8, 0
 ; GFX9DAGISEL-NEXT:    s_mov_b32 s9, 0x7ff80000
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9DAGISEL-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v4, s8
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s10, v2, s12
@@ -2055,20 +2933,20 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX9DAGISEL-NEXT:    v_readfirstlane_b32 s8, v4
 ; GFX9DAGISEL-NEXT:    v_readfirstlane_b32 s9, v5
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_2
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_2
 ; GFX9DAGISEL-NEXT:  ; %bb.3:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, s8
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, s9
 ; GFX9DAGISEL-NEXT:    ; implicit-def: $vgpr4
 ; GFX9DAGISEL-NEXT:    ; implicit-def: $vgpr5
-; GFX9DAGISEL-NEXT:  .LBB6_4: ; %Flow
+; GFX9DAGISEL-NEXT:  .LBB7_4: ; %Flow
 ; GFX9DAGISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB6_8
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB7_8
 ; GFX9DAGISEL-NEXT:  ; %bb.5: ; %if
 ; GFX9DAGISEL-NEXT:    s_mov_b32 s8, 0
 ; GFX9DAGISEL-NEXT:    s_mov_b32 s9, 0x7ff80000
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9DAGISEL-NEXT:  .LBB6_6: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB7_6: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s10, v4, s12
@@ -2079,11 +2957,11 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX9DAGISEL-NEXT:    v_readfirstlane_b32 s8, v2
 ; GFX9DAGISEL-NEXT:    v_readfirstlane_b32 s9, v3
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_6
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_6
 ; GFX9DAGISEL-NEXT:  ; %bb.7:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, s8
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, s9
-; GFX9DAGISEL-NEXT:  .LBB6_8: ; %endif
+; GFX9DAGISEL-NEXT:  .LBB7_8: ; %endif
 ; GFX9DAGISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off
 ; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0)
@@ -2097,12 +2975,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX9GISEL-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
 ; GFX9GISEL-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB6_3
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX9GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9GISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX9GISEL-NEXT:    s_mov_b32 s5, 0x7ff80000
 ; GFX9GISEL-NEXT:    s_mov_b64 s[8:9], exec
-; GFX9GISEL-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s12, s[8:9]
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX9GISEL-NEXT:    v_readlane_b32 s10, v2, s12
@@ -2115,15 +2993,15 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX9GISEL-NEXT:    v_readfirstlane_b32 s5, v5
 ; GFX9GISEL-NEXT:    ; implicit-def: $vgpr4
 ; GFX9GISEL-NEXT:    ; implicit-def: $vgpr5
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB6_2
-; GFX9GISEL-NEXT:  .LBB6_3: ; %Flow
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB7_2
+; GFX9GISEL-NEXT:  .LBB7_3: ; %Flow
 ; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB7_6
 ; GFX9GISEL-NEXT:  ; %bb.4: ; %if
 ; GFX9GISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX9GISEL-NEXT:    s_mov_b32 s5, 0x7ff80000
 ; GFX9GISEL-NEXT:    s_mov_b64 s[8:9], exec
-; GFX9GISEL-NEXT:  .LBB6_5: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB7_5: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s12, s[8:9]
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9GISEL-NEXT:    v_readlane_b32 s10, v4, s12
@@ -2134,8 +3012,8 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[8:9], 0
 ; GFX9GISEL-NEXT:    v_readfirstlane_b32 s4, v2
 ; GFX9GISEL-NEXT:    v_readfirstlane_b32 s5, v3
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB6_5
-; GFX9GISEL-NEXT:  .LBB6_6: ; %endif
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB7_5
+; GFX9GISEL-NEXT:  .LBB7_6: ; %endif
 ; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2151,12 +3029,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1064DAGISEL-NEXT:    ; implicit-def: $vgpr6_vgpr7
 ; GFX1064DAGISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064DAGISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB6_4
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB7_4
 ; GFX1064DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064DAGISEL-NEXT:    s_mov_b32 s8, 0
 ; GFX1064DAGISEL-NEXT:    s_mov_b32 s9, 0x7ff80000
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064DAGISEL-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s10, v2, s12
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s11, v3, s12
@@ -2165,20 +3043,20 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1064DAGISEL-NEXT:    v_max_f64 v[4:5], s[10:11], s[8:9]
 ; GFX1064DAGISEL-NEXT:    v_readfirstlane_b32 s8, v4
 ; GFX1064DAGISEL-NEXT:    v_readfirstlane_b32 s9, v5
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_2
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_2
 ; GFX1064DAGISEL-NEXT:  ; %bb.3:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v6, s8
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v7, s9
 ; GFX1064DAGISEL-NEXT:    ; implicit-def: $vgpr4
 ; GFX1064DAGISEL-NEXT:    ; implicit-def: $vgpr5
-; GFX1064DAGISEL-NEXT:  .LBB6_4: ; %Flow
+; GFX1064DAGISEL-NEXT:  .LBB7_4: ; %Flow
 ; GFX1064DAGISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB6_8
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB7_8
 ; GFX1064DAGISEL-NEXT:  ; %bb.5: ; %if
 ; GFX1064DAGISEL-NEXT:    s_mov_b32 s8, 0
 ; GFX1064DAGISEL-NEXT:    s_mov_b32 s9, 0x7ff80000
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064DAGISEL-NEXT:  .LBB6_6: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB7_6: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s10, v4, s12
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s11, v5, s12
@@ -2187,11 +3065,11 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1064DAGISEL-NEXT:    v_max_f64 v[2:3], s[10:11], s[8:9]
 ; GFX1064DAGISEL-NEXT:    v_readfirstlane_b32 s8, v2
 ; GFX1064DAGISEL-NEXT:    v_readfirstlane_b32 s9, v3
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_6
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_6
 ; GFX1064DAGISEL-NEXT:  ; %bb.7:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v6, s8
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v7, s9
-; GFX1064DAGISEL-NEXT:  .LBB6_8: ; %endif
+; GFX1064DAGISEL-NEXT:  .LBB7_8: ; %endif
 ; GFX1064DAGISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off
 ; GFX1064DAGISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -2204,12 +3082,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1064GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v6
 ; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB6_3
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064GISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX1064GISEL-NEXT:    s_mov_b32 s5, 0x7ff80000
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[8:9], exec
-; GFX1064GISEL-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s12, s[8:9]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s10, v2, s12
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s11, v3, s12
@@ -2220,15 +3098,15 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1064GISEL-NEXT:    v_readfirstlane_b32 s5, v5
 ; GFX1064GISEL-NEXT:    ; implicit-def: $vgpr4
 ; GFX1064GISEL-NEXT:    ; implicit-def: $vgpr5
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB6_2
-; GFX1064GISEL-NEXT:  .LBB6_3: ; %Flow
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB7_2
+; GFX1064GISEL-NEXT:  .LBB7_3: ; %Flow
 ; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB7_6
 ; GFX1064GISEL-NEXT:  ; %bb.4: ; %if
 ; GFX1064GISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX1064GISEL-NEXT:    s_mov_b32 s5, 0x7ff80000
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[8:9], exec
-; GFX1064GISEL-NEXT:  .LBB6_5: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB7_5: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s12, s[8:9]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s10, v4, s12
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s11, v5, s12
@@ -2237,8 +3115,8 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1064GISEL-NEXT:    v_max_f64 v[2:3], s[10:11], s[4:5]
 ; GFX1064GISEL-NEXT:    v_readfirstlane_b32 s4, v2
 ; GFX1064GISEL-NEXT:    v_readfirstlane_b32 s5, v3
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB6_5
-; GFX1064GISEL-NEXT:  .LBB6_6: ; %endif
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB7_5
+; GFX1064GISEL-NEXT:  .LBB7_6: ; %endif
 ; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2253,12 +3131,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1032DAGISEL-NEXT:    ; implicit-def: $vgpr6_vgpr7
 ; GFX1032DAGISEL-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1032DAGISEL-NEXT:    s_xor_b32 s6, exec_lo, s4
-; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB6_4
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB7_4
 ; GFX1032DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s5, 0x7ff80000
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s7, exec_lo
-; GFX1032DAGISEL-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s10, s7
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s9, v3, s10
@@ -2267,20 +3145,20 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1032DAGISEL-NEXT:    v_max_f64 v[4:5], s[8:9], s[4:5]
 ; GFX1032DAGISEL-NEXT:    v_readfirstlane_b32 s4, v4
 ; GFX1032DAGISEL-NEXT:    v_readfirstlane_b32 s5, v5
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_2
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_2
 ; GFX1032DAGISEL-NEXT:  ; %bb.3:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v7, s5
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v6, s4
 ; GFX1032DAGISEL-NEXT:    ; implicit-def: $vgpr4
 ; GFX1032DAGISEL-NEXT:    ; implicit-def: $vgpr5
-; GFX1032DAGISEL-NEXT:  .LBB6_4: ; %Flow
+; GFX1032DAGISEL-NEXT:  .LBB7_4: ; %Flow
 ; GFX1032DAGISEL-NEXT:    s_andn2_saveexec_b32 s6, s6
-; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB6_8
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB7_8
 ; GFX1032DAGISEL-NEXT:  ; %bb.5: ; %if
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s5, 0x7ff80000
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s7, exec_lo
-; GFX1032DAGISEL-NEXT:  .LBB6_6: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB7_6: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s10, s7
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s8, v4, s10
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s9, v5, s10
@@ -2289,11 +3167,11 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1032DAGISEL-NEXT:    v_max_f64 v[2:3], s[8:9], s[4:5]
 ; GFX1032DAGISEL-NEXT:    v_readfirstlane_b32 s4, v2
 ; GFX1032DAGISEL-NEXT:    v_readfirstlane_b32 s5, v3
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_6
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_6
 ; GFX1032DAGISEL-NEXT:  ; %bb.7:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v7, s5
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v6, s4
-; GFX1032DAGISEL-NEXT:  .LBB6_8: ; %endif
+; GFX1032DAGISEL-NEXT:  .LBB7_8: ; %endif
 ; GFX1032DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; GFX1032DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off
 ; GFX1032DAGISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -2306,12 +3184,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1032GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 16, v6
 ; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s6, vcc_lo
 ; GFX1032GISEL-NEXT:    s_xor_b32 s6, exec_lo, s6
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB6_3
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032GISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX1032GISEL-NEXT:    s_mov_b32 s5, 0x7ff80000
 ; GFX1032GISEL-NEXT:    s_mov_b32 s7, exec_lo
-; GFX1032GISEL-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s10, s7
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s9, v3, s10
@@ -2322,15 +3200,15 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1032GISEL-NEXT:    v_readfirstlane_b32 s5, v5
 ; GFX1032GISEL-NEXT:    ; implicit-def: $vgpr4
 ; GFX1032GISEL-NEXT:    ; implicit-def: $vgpr5
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB6_2
-; GFX1032GISEL-NEXT:  .LBB6_3: ; %Flow
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB7_2
+; GFX1032GISEL-NEXT:  .LBB7_3: ; %Flow
 ; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s6, s6
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB7_6
 ; GFX1032GISEL-NEXT:  ; %bb.4: ; %if
 ; GFX1032GISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX1032GISEL-NEXT:    s_mov_b32 s5, 0x7ff80000
 ; GFX1032GISEL-NEXT:    s_mov_b32 s7, exec_lo
-; GFX1032GISEL-NEXT:  .LBB6_5: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB7_5: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s10, s7
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s8, v4, s10
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s9, v5, s10
@@ -2339,8 +3217,8 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1032GISEL-NEXT:    v_max_f64 v[2:3], s[8:9], s[4:5]
 ; GFX1032GISEL-NEXT:    v_readfirstlane_b32 s4, v2
 ; GFX1032GISEL-NEXT:    v_readfirstlane_b32 s5, v3
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB6_5
-; GFX1032GISEL-NEXT:  .LBB6_6: ; %endif
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB7_5
+; GFX1032GISEL-NEXT:  .LBB7_6: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2356,12 +3234,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1164DAGISEL-NEXT:    ; implicit-def: $vgpr6_vgpr7
 ; GFX1164DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1164DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB6_4
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB7_4
 ; GFX1164DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164DAGISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX1164DAGISEL-NEXT:    s_mov_b32 s5, 0x7ff80000
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s8, s[2:3]
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s6, v2, s8
@@ -2373,20 +3251,20 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1164DAGISEL-NEXT:    v_readfirstlane_b32 s4, v4
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164DAGISEL-NEXT:    v_readfirstlane_b32 s5, v5
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_2
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_2
 ; GFX1164DAGISEL-NEXT:  ; %bb.3:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v7, s5
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v6, s4
 ; GFX1164DAGISEL-NEXT:    ; implicit-def: $vgpr4
 ; GFX1164DAGISEL-NEXT:    ; implicit-def: $vgpr5
-; GFX1164DAGISEL-NEXT:  .LBB6_4: ; %Flow
+; GFX1164DAGISEL-NEXT:  .LBB7_4: ; %Flow
 ; GFX1164DAGISEL-NEXT:    s_and_not1_saveexec_b64 s[0:1], s[0:1]
-; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB6_8
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB7_8
 ; GFX1164DAGISEL-NEXT:  ; %bb.5: ; %if
 ; GFX1164DAGISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX1164DAGISEL-NEXT:    s_mov_b32 s5, 0x7ff80000
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT:  .LBB6_6: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB7_6: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s8, s[2:3]
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s6, v4, s8
@@ -2398,11 +3276,11 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1164DAGISEL-NEXT:    v_readfirstlane_b32 s4, v2
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164DAGISEL-NEXT:    v_readfirstlane_b32 s5, v3
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_6
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_6
 ; GFX1164DAGISEL-NEXT:  ; %bb.7:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v7, s5
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v6, s4
-; GFX1164DAGISEL-NEXT:  .LBB6_8: ; %endif
+; GFX1164DAGISEL-NEXT:  .LBB7_8: ; %endif
 ; GFX1164DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1164DAGISEL-NEXT:    global_store_b64 v[0:1], v[6:7], off
 ; GFX1164DAGISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -2416,12 +3294,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v6
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB6_3
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164GISEL-NEXT:    s_mov_b32 s0, 0
 ; GFX1164GISEL-NEXT:    s_mov_b32 s1, 0x7ff80000
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[4:5], exec
-; GFX1164GISEL-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s8, s[4:5]
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s6, v2, s8
@@ -2435,15 +3313,15 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1164GISEL-NEXT:    v_readfirstlane_b32 s1, v5
 ; GFX1164GISEL-NEXT:    ; implicit-def: $vgpr4
 ; GFX1164GISEL-NEXT:    ; implicit-def: $vgpr5
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB6_2
-; GFX1164GISEL-NEXT:  .LBB6_3: ; %Flow
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB7_2
+; GFX1164GISEL-NEXT:  .LBB7_3: ; %Flow
 ; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[2:3], s[2:3]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB7_6
 ; GFX1164GISEL-NEXT:  ; %bb.4: ; %if
 ; GFX1164GISEL-NEXT:    s_mov_b32 s0, 0
 ; GFX1164GISEL-NEXT:    s_mov_b32 s1, 0x7ff80000
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[4:5], exec
-; GFX1164GISEL-NEXT:  .LBB6_5: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB7_5: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s8, s[4:5]
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s6, v4, s8
@@ -2455,8 +3333,8 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1164GISEL-NEXT:    v_readfirstlane_b32 s0, v2
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164GISEL-NEXT:    v_readfirstlane_b32 s1, v3
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB6_5
-; GFX1164GISEL-NEXT:  .LBB6_6: ; %endif
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB7_5
+; GFX1164GISEL-NEXT:  .LBB7_6: ; %endif
 ; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -2472,12 +3350,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1132DAGISEL-NEXT:    ; implicit-def: $vgpr6_vgpr7
 ; GFX1132DAGISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1132DAGISEL-NEXT:    s_xor_b32 s2, exec_lo, s0
-; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB6_4
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB7_4
 ; GFX1132DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s0, 0
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s1, 0x7ff80000
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
-; GFX1132DAGISEL-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s6, s3
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s4, v2, s6
@@ -2489,19 +3367,19 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1132DAGISEL-NEXT:    v_readfirstlane_b32 s0, v4
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132DAGISEL-NEXT:    v_readfirstlane_b32 s1, v5
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_2
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_2
 ; GFX1132DAGISEL-NEXT:  ; %bb.3:
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
 ; GFX1132DAGISEL-NEXT:    ; implicit-def: $vgpr4
 ; GFX1132DAGISEL-NEXT:    ; implicit-def: $vgpr5
-; GFX1132DAGISEL-NEXT:  .LBB6_4: ; %Flow
+; GFX1132DAGISEL-NEXT:  .LBB7_4: ; %Flow
 ; GFX1132DAGISEL-NEXT:    s_and_not1_saveexec_b32 s2, s2
-; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB6_8
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB7_8
 ; GFX1132DAGISEL-NEXT:  ; %bb.5: ; %if
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s0, 0
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s1, 0x7ff80000
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
-; GFX1132DAGISEL-NEXT:  .LBB6_6: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB7_6: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s6, s3
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s4, v4, s6
@@ -2513,10 +3391,10 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1132DAGISEL-NEXT:    v_readfirstlane_b32 s0, v2
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132DAGISEL-NEXT:    v_readfirstlane_b32 s1, v3
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_6
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_6
 ; GFX1132DAGISEL-NEXT:  ; %bb.7:
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
-; GFX1132DAGISEL-NEXT:  .LBB6_8: ; %endif
+; GFX1132DAGISEL-NEXT:  .LBB7_8: ; %endif
 ; GFX1132DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1132DAGISEL-NEXT:    global_store_b64 v[0:1], v[6:7], off
 ; GFX1132DAGISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -2530,12 +3408,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v6
 ; GFX1132GISEL-NEXT:    s_xor_b32 s2, exec_lo, s2
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB6_3
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132GISEL-NEXT:    s_mov_b32 s0, 0
 ; GFX1132GISEL-NEXT:    s_mov_b32 s1, 0x7ff80000
 ; GFX1132GISEL-NEXT:    s_mov_b32 s3, exec_lo
-; GFX1132GISEL-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s6, s3
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s4, v2, s6
@@ -2549,15 +3427,15 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1132GISEL-NEXT:    v_readfirstlane_b32 s1, v5
 ; GFX1132GISEL-NEXT:    ; implicit-def: $vgpr4
 ; GFX1132GISEL-NEXT:    ; implicit-def: $vgpr5
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB6_2
-; GFX1132GISEL-NEXT:  .LBB6_3: ; %Flow
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB7_2
+; GFX1132GISEL-NEXT:  .LBB7_3: ; %Flow
 ; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s2, s2
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB7_6
 ; GFX1132GISEL-NEXT:  ; %bb.4: ; %if
 ; GFX1132GISEL-NEXT:    s_mov_b32 s0, 0
 ; GFX1132GISEL-NEXT:    s_mov_b32 s1, 0x7ff80000
 ; GFX1132GISEL-NEXT:    s_mov_b32 s3, exec_lo
-; GFX1132GISEL-NEXT:  .LBB6_5: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB7_5: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s6, s3
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s4, v4, s6
@@ -2569,8 +3447,8 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1132GISEL-NEXT:    v_readfirstlane_b32 s0, v2
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132GISEL-NEXT:    v_readfirstlane_b32 s1, v3
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB6_5
-; GFX1132GISEL-NEXT:  .LBB6_6: ; %endif
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB7_5
+; GFX1132GISEL-NEXT:  .LBB7_6: ; %endif
 ; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1132GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -2590,12 +3468,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX12DAGISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    s_xor_b32 s2, exec_lo, s0
-; GFX12DAGISEL-NEXT:    s_cbranch_execz .LBB6_4
+; GFX12DAGISEL-NEXT:    s_cbranch_execz .LBB7_4
 ; GFX12DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s0, 0
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s1, 0x7ff80000
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
-; GFX12DAGISEL-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    s_ctz_i32_b32 s6, s3
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
@@ -2608,21 +3486,21 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12DAGISEL-NEXT:    v_readfirstlane_b32 s0, v4
 ; GFX12DAGISEL-NEXT:    v_readfirstlane_b32 s1, v5
-; GFX12DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_2
+; GFX12DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_2
 ; GFX12DAGISEL-NEXT:  ; %bb.3:
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12DAGISEL-NEXT:    v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
 ; GFX12DAGISEL-NEXT:    ; implicit-def: $vgpr4
 ; GFX12DAGISEL-NEXT:    ; implicit-def: $vgpr5
-; GFX12DAGISEL-NEXT:  .LBB6_4: ; %Flow
+; GFX12DAGISEL-NEXT:  .LBB7_4: ; %Flow
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    s_and_not1_saveexec_b32 s2, s2
-; GFX12DAGISEL-NEXT:    s_cbranch_execz .LBB6_8
+; GFX12DAGISEL-NEXT:    s_cbranch_execz .LBB7_8
 ; GFX12DAGISEL-NEXT:  ; %bb.5: ; %if
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s0, 0
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s1, 0x7ff80000
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
-; GFX12DAGISEL-NEXT:  .LBB6_6: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT:  .LBB7_6: ; =>This Inner Loop Header: Depth=1
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    s_ctz_i32_b32 s6, s3
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
@@ -2635,11 +3513,11 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12DAGISEL-NEXT:    v_readfirstlane_b32 s0, v2
 ; GFX12DAGISEL-NEXT:    v_readfirstlane_b32 s1, v3
-; GFX12DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_6
+; GFX12DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_6
 ; GFX12DAGISEL-NEXT:  ; %bb.7:
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12DAGISEL-NEXT:    v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
-; GFX12DAGISEL-NEXT:  .LBB6_8: ; %endif
+; GFX12DAGISEL-NEXT:  .LBB7_8: ; %endif
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX12DAGISEL-NEXT:    global_store_b64 v[0:1], v[6:7], off
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fmin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fmin.ll
index 88542989f5d83..86e9dad936ed8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fmin.ll
@@ -858,6 +858,884 @@ entry:
   ret void
 }
 
+define void @divergent_value_double_dpp(ptr addrspace(1) %out, double %in) {
+; GFX8DAGISEL-LABEL: divergent_value_double_dpp:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
+; GFX8DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v2, s[4:5]
+; GFX8DAGISEL-NEXT:    v_cndmask_b32_e64 v6, v4, v3, s[4:5]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v8, v6
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v8, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_min_f64 v[4:5], v[5:6], v[7:8]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8GISEL-LABEL: divergent_value_double_dpp:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
+; GFX8GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v2, s[4:5]
+; GFX8GISEL-NEXT:    v_cndmask_b32_e64 v6, v4, v3, s[4:5]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v8, v6
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v8, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_min_f64 v[4:5], v[5:6], v[7:8]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX8GISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX8GISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9DAGISEL-LABEL: divergent_value_double_dpp:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
+; GFX9DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v2, s[4:5]
+; GFX9DAGISEL-NEXT:    v_cndmask_b32_e64 v6, v4, v3, s[4:5]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v8, v6
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v8, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_min_f64 v[4:5], v[5:6], v[7:8]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9GISEL-LABEL: divergent_value_double_dpp:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
+; GFX9GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v2, s[4:5]
+; GFX9GISEL-NEXT:    v_cndmask_b32_e64 v6, v4, v3, s[4:5]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v8, v6
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v8, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_min_f64 v[4:5], v[5:6], v[7:8]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX9GISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX9GISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1064DAGISEL-LABEL: divergent_value_double_dpp:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX1064DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0x7ff80000, v3, s[4:5]
+; GFX1064DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1064DAGISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1064DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v6, -1, v8
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_e32 v6, 32, v6
+; GFX1064DAGISEL-NEXT:    v_mul_lo_u32 v6, 4, v6
+; GFX1064DAGISEL-NEXT:    ds_permute_b32 v7, v6, v4
+; GFX1064DAGISEL-NEXT:    ds_permute_b32 v8, v6, v5
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[7:8]
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX1064DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT:    s_clause 0x7 ; 32-byte Folded Reload
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:16
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:20
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:24
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:28
+; GFX1064DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1064GISEL-LABEL: divergent_value_double_dpp:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX1064GISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX1064GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX1064GISEL-NEXT:    v_cndmask_b32_e64 v5, 0x7ff80000, v3, s[4:5]
+; GFX1064GISEL-NEXT:    v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1064GISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1064GISEL-NEXT:    v_mbcnt_hi_u32_b32 v6, -1, v8
+; GFX1064GISEL-NEXT:    v_add_nc_u32_e32 v6, 32, v6
+; GFX1064GISEL-NEXT:    v_mul_lo_u32 v6, 4, v6
+; GFX1064GISEL-NEXT:    ds_permute_b32 v7, v6, v4
+; GFX1064GISEL-NEXT:    ds_permute_b32 v8, v6, v5
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[7:8]
+; GFX1064GISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX1064GISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX1064GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX1064GISEL-NEXT:    s_clause 0x7 ; 32-byte Folded Reload
+; GFX1064GISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32
+; GFX1064GISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1064GISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1064GISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1064GISEL-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:16
+; GFX1064GISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:20
+; GFX1064GISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:24
+; GFX1064GISEL-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:28
+; GFX1064GISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1032DAGISEL-LABEL: divergent_value_double_dpp:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX1032DAGISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX1032DAGISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s6
+; GFX1032DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0x7ff80000, v3, s6
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1032DAGISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s4, v4, 31
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s5, v5, 31
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s6
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1032DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032DAGISEL-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX1032DAGISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
+; GFX1032DAGISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32
+; GFX1032DAGISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1032DAGISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1032DAGISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1032DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1032DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1032GISEL-LABEL: divergent_value_double_dpp:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX1032GISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX1032GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s6
+; GFX1032GISEL-NEXT:    v_cndmask_b32_e64 v5, 0x7ff80000, v3, s6
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1032GISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1032GISEL-NEXT:    v_readlane_b32 s4, v4, 31
+; GFX1032GISEL-NEXT:    v_readlane_b32 s5, v5, 31
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s6
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1032GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032GISEL-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX1032GISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
+; GFX1032GISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32
+; GFX1032GISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1032GISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1032GISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1032GISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1032GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1164DAGISEL-LABEL: divergent_value_double_dpp:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT:    s_clause 0x3 ; 28-byte Folded Spill
+; GFX1164DAGISEL-NEXT:    scratch_store_b64 off, v[4:5], s32
+; GFX1164DAGISEL-NEXT:    scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX1164DAGISEL-NEXT:    scratch_store_b32 off, v6, s32 offset:16
+; GFX1164DAGISEL-NEXT:    scratch_store_b64 off, v[7:8], s32 offset:20
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[0:1]
+; GFX1164DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0x7ff80000, v3, s[0:1]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v6, -1, 0
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v6, -1, v6
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_e32 v6, 32, v6
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_mul_lo_u32 v6, 4, v6
+; GFX1164DAGISEL-NEXT:    ds_permute_b32 v7, v6, v4
+; GFX1164DAGISEL-NEXT:    ds_permute_b32 v8, v6, v5
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[7:8]
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s2, v4, 63
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s3, v5, 63
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1164DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1164DAGISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT:    s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164DAGISEL-NEXT:    s_clause 0x3 ; 28-byte Folded Reload
+; GFX1164DAGISEL-NEXT:    scratch_load_b64 v[4:5], off, s32
+; GFX1164DAGISEL-NEXT:    scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX1164DAGISEL-NEXT:    scratch_load_b32 v6, off, s32 offset:16
+; GFX1164DAGISEL-NEXT:    scratch_load_b64 v[7:8], off, s32 offset:20
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1164GISEL-LABEL: divergent_value_double_dpp:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT:    s_clause 0x3 ; 28-byte Folded Spill
+; GFX1164GISEL-NEXT:    scratch_store_b64 off, v[4:5], s32
+; GFX1164GISEL-NEXT:    scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX1164GISEL-NEXT:    scratch_store_b32 off, v6, s32 offset:16
+; GFX1164GISEL-NEXT:    scratch_store_b64 off, v[7:8], s32 offset:20
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[0:1]
+; GFX1164GISEL-NEXT:    v_cndmask_b32_e64 v5, 0x7ff80000, v3, s[0:1]
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164GISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164GISEL-NEXT:    v_mbcnt_lo_u32_b32 v6, -1, 0
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_mbcnt_hi_u32_b32 v6, -1, v6
+; GFX1164GISEL-NEXT:    v_add_nc_u32_e32 v6, 32, v6
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_mul_lo_u32 v6, 4, v6
+; GFX1164GISEL-NEXT:    ds_permute_b32 v7, v6, v4
+; GFX1164GISEL-NEXT:    ds_permute_b32 v8, v6, v5
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[7:8]
+; GFX1164GISEL-NEXT:    v_readlane_b32 s2, v4, 63
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s3, v5, 63
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1164GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1164GISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT:    s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164GISEL-NEXT:    s_clause 0x3 ; 28-byte Folded Reload
+; GFX1164GISEL-NEXT:    scratch_load_b64 v[4:5], off, s32
+; GFX1164GISEL-NEXT:    scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX1164GISEL-NEXT:    scratch_load_b32 v6, off, s32 offset:16
+; GFX1164GISEL-NEXT:    scratch_load_b64 v[7:8], off, s32 offset:20
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-LABEL: divergent_value_double_dpp:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT:    s_clause 0x1 ; 16-byte Folded Spill
+; GFX1132DAGISEL-NEXT:    scratch_store_b64 off, v[4:5], s32
+; GFX1132DAGISEL-NEXT:    scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s2
+; GFX1132DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0x7ff80000, v3, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s0, v4, 31
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s1, v5, 31
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1132DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT:    s_clause 0x1 ; 16-byte Folded Reload
+; GFX1132DAGISEL-NEXT:    scratch_load_b64 v[4:5], off, s32
+; GFX1132DAGISEL-NEXT:    scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1132GISEL-LABEL: divergent_value_double_dpp:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT:    s_clause 0x1 ; 16-byte Folded Spill
+; GFX1132GISEL-NEXT:    scratch_store_b64 off, v[4:5], s32
+; GFX1132GISEL-NEXT:    scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s2
+; GFX1132GISEL-NEXT:    v_cndmask_b32_e64 v5, 0x7ff80000, v3, s2
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132GISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132GISEL-NEXT:    v_readlane_b32 s0, v4, 31
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s1, v5, 31
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1132GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT:    s_clause 0x1 ; 16-byte Folded Reload
+; GFX1132GISEL-NEXT:    scratch_load_b64 v[4:5], off, s32
+; GFX1132GISEL-NEXT:    scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12DAGISEL-LABEL: divergent_value_double_dpp:
+; GFX12DAGISEL:       ; %bb.0: ; %entry
+; GFX12DAGISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12DAGISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12DAGISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12DAGISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12DAGISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX12DAGISEL-NEXT:    s_clause 0x1 ; 16-byte Folded Spill
+; GFX12DAGISEL-NEXT:    scratch_store_b64 off, v[4:5], s32
+; GFX12DAGISEL-NEXT:    scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX12DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s2
+; GFX12DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0x7ff80000, v3, s2
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX12DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    v_min_num_f64_e32 v[4:5], v[4:5], v[6:7]
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX12DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    v_min_num_f64_e32 v[4:5], v[4:5], v[6:7]
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX12DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    v_min_num_f64_e32 v[4:5], v[4:5], v[6:7]
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX12DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    v_min_num_f64_e32 v[4:5], v[4:5], v[6:7]
+; GFX12DAGISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX12DAGISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX12DAGISEL-NEXT:    s_wait_dscnt 0x0
+; GFX12DAGISEL-NEXT:    v_min_num_f64_e32 v[4:5], v[4:5], v[6:7]
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12DAGISEL-NEXT:    v_readlane_b32 s0, v4, 31
+; GFX12DAGISEL-NEXT:    v_readlane_b32 s1, v5, 31
+; GFX12DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX12DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX12DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX12DAGISEL-NEXT:    s_clause 0x1 ; 16-byte Folded Reload
+; GFX12DAGISEL-NEXT:    scratch_load_b64 v[4:5], off, s32
+; GFX12DAGISEL-NEXT:    scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX12DAGISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %result = call double @llvm.amdgcn.wave.reduce.fmin(double %in, i32 2)
+  store double %result, ptr addrspace(1) %out
+  ret void
+}
+
 define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX8DAGISEL-LABEL: divergent_cfg_float:
 ; GFX8DAGISEL:       ; %bb.0: ; %entry
@@ -867,11 +1745,11 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX8DAGISEL-NEXT:    ; implicit-def: $vgpr4
 ; GFX8DAGISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX8DAGISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB3_4
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB4_4
 ; GFX8DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX8DAGISEL-NEXT:    s_mov_b32 s8, 0x7fc00000
-; GFX8DAGISEL-NEXT:  .LBB3_2: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB4_2: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s9, s[6:7]
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s10, v2, s9
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s10
@@ -879,17 +1757,17 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX8DAGISEL-NEXT:    v_min_f32_e32 v3, s8, v3
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX8DAGISEL-NEXT:    v_readfirstlane_b32 s8, v3
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_2
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_2
 ; GFX8DAGISEL-NEXT:  ; %bb.3:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v4, s8
 ; GFX8DAGISEL-NEXT:    ; implicit-def: $vgpr3
-; GFX8DAGISEL-NEXT:  .LBB3_4: ; %Flow
+; GFX8DAGISEL-NEXT:  .LBB4_4: ; %Flow
 ; GFX8DAGISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB3_8
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB4_8
 ; GFX8DAGISEL-NEXT:  ; %bb.5: ; %if
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX8DAGISEL-NEXT:    s_mov_b32 s8, 0x7fc00000
-; GFX8DAGISEL-NEXT:  .LBB3_6: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB4_6: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s9, s[6:7]
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s10, v3, s9
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s10
@@ -897,10 +1775,10 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX8DAGISEL-NEXT:    v_min_f32_e32 v2, s8, v2
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX8DAGISEL-NEXT:    v_readfirstlane_b32 s8, v2
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_6
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_6
 ; GFX8DAGISEL-NEXT:  ; %bb.7:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v4, s8
-; GFX8DAGISEL-NEXT:  .LBB3_8: ; %endif
+; GFX8DAGISEL-NEXT:  .LBB4_8: ; %endif
 ; GFX8DAGISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v4
 ; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0)
@@ -914,12 +1792,12 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX8GISEL-NEXT:    ; implicit-def: $sgpr8
 ; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX8GISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB3_3
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX8GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX8GISEL-NEXT:    s_mov_b32 s8, 0x7fc00000
 ; GFX8GISEL-NEXT:    ; implicit-def: $vgpr3
-; GFX8GISEL-NEXT:  .LBB3_2: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB4_2: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s9, s[6:7]
 ; GFX8GISEL-NEXT:    v_readlane_b32 s10, v2, s9
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v4, s10
@@ -927,14 +1805,14 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX8GISEL-NEXT:    v_min_f32_e32 v4, s8, v4
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX8GISEL-NEXT:    v_readfirstlane_b32 s8, v4
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB3_2
-; GFX8GISEL-NEXT:  .LBB3_3: ; %Flow
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB4_2
+; GFX8GISEL-NEXT:  .LBB4_3: ; %Flow
 ; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB3_6
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB4_6
 ; GFX8GISEL-NEXT:  ; %bb.4: ; %if
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX8GISEL-NEXT:    s_mov_b32 s8, 0x7fc00000
-; GFX8GISEL-NEXT:  .LBB3_5: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB4_5: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s9, s[6:7]
 ; GFX8GISEL-NEXT:    v_readlane_b32 s10, v3, s9
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s10
@@ -942,8 +1820,8 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX8GISEL-NEXT:    v_min_f32_e32 v2, s8, v2
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX8GISEL-NEXT:    v_readfirstlane_b32 s8, v2
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB3_5
-; GFX8GISEL-NEXT:  .LBB3_6: ; %endif
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB4_5
+; GFX8GISEL-NEXT:  .LBB4_6: ; %endif
 ; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
@@ -958,11 +1836,11 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX9DAGISEL-NEXT:    ; implicit-def: $vgpr4
 ; GFX9DAGISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9DAGISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB3_4
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB4_4
 ; GFX9DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX9DAGISEL-NEXT:    s_mov_b32 s8, 0x7fc00000
-; GFX9DAGISEL-NEXT:  .LBB3_2: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB4_2: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s9, s[6:7]
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s10, v2, s9
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v3, s10
@@ -970,17 +1848,17 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX9DAGISEL-NEXT:    v_min_f32_e32 v3, s8, v3
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX9DAGISEL-NEXT:    v_readfirstlane_b32 s8, v3
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_2
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_2
 ; GFX9DAGISEL-NEXT:  ; %bb.3:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v4, s8
 ; GFX9DAGISEL-NEXT:    ; implicit-def: $vgpr3
-; GFX9DAGISEL-NEXT:  .LBB3_4: ; %Flow
+; GFX9DAGISEL-NEXT:  .LBB4_4: ; %Flow
 ; GFX9DAGISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB3_8
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB4_8
 ; GFX9DAGISEL-NEXT:  ; %bb.5: ; %if
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX9DAGISEL-NEXT:    s_mov_b32 s8, 0x7fc00000
-; GFX9DAGISEL-NEXT:  .LBB3_6: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB4_6: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s9, s[6:7]
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s10, v3, s9
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s10
@@ -988,10 +1866,10 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX9DAGISEL-NEXT:    v_min_f32_e32 v2, s8, v2
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX9DAGISEL-NEXT:    v_readfirstlane_b32 s8, v2
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_6
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_6
 ; GFX9DAGISEL-NEXT:  ; %bb.7:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v4, s8
-; GFX9DAGISEL-NEXT:  .LBB3_8: ; %endif
+; GFX9DAGISEL-NEXT:  .LBB4_8: ; %endif
 ; GFX9DAGISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9DAGISEL-NEXT:    global_store_dword v[0:1], v4, off
 ; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0)
@@ -1005,12 +1883,12 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX9GISEL-NEXT:    ; implicit-def: $sgpr8
 ; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9GISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB3_3
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX9GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX9GISEL-NEXT:    s_mov_b32 s8, 0x7fc00000
 ; GFX9GISEL-NEXT:    ; implicit-def: $vgpr3
-; GFX9GISEL-NEXT:  .LBB3_2: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB4_2: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s9, s[6:7]
 ; GFX9GISEL-NEXT:    v_readlane_b32 s10, v2, s9
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v4, s10
@@ -1018,14 +1896,14 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX9GISEL-NEXT:    v_min_f32_e32 v4, s8, v4
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX9GISEL-NEXT:    v_readfirstlane_b32 s8, v4
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB3_2
-; GFX9GISEL-NEXT:  .LBB3_3: ; %Flow
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB4_2
+; GFX9GISEL-NEXT:  .LBB4_3: ; %Flow
 ; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB3_6
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB4_6
 ; GFX9GISEL-NEXT:  ; %bb.4: ; %if
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX9GISEL-NEXT:    s_mov_b32 s8, 0x7fc00000
-; GFX9GISEL-NEXT:  .LBB3_5: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB4_5: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s9, s[6:7]
 ; GFX9GISEL-NEXT:    v_readlane_b32 s10, v3, s9
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s10
@@ -1033,8 +1911,8 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX9GISEL-NEXT:    v_min_f32_e32 v2, s8, v2
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX9GISEL-NEXT:    v_readfirstlane_b32 s8, v2
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB3_5
-; GFX9GISEL-NEXT:  .LBB3_6: ; %endif
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB4_5
+; GFX9GISEL-NEXT:  .LBB4_6: ; %endif
 ; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX9GISEL-NEXT:    global_store_dword v[0:1], v2, off
@@ -1049,38 +1927,38 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX1064DAGISEL-NEXT:    ; implicit-def: $vgpr4
 ; GFX1064DAGISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064DAGISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB3_4
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB4_4
 ; GFX1064DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX1064DAGISEL-NEXT:    s_mov_b32 s8, 0x7fc00000
-; GFX1064DAGISEL-NEXT:  .LBB3_2: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB4_2: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s9, s[6:7]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s10, v2, s9
 ; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s9
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX1064DAGISEL-NEXT:    v_min_f32_e64 v3, s8, s10
 ; GFX1064DAGISEL-NEXT:    v_readfirstlane_b32 s8, v3
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_2
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_2
 ; GFX1064DAGISEL-NEXT:  ; %bb.3:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v4, s8
 ; GFX1064DAGISEL-NEXT:    ; implicit-def: $vgpr3
-; GFX1064DAGISEL-NEXT:  .LBB3_4: ; %Flow
+; GFX1064DAGISEL-NEXT:  .LBB4_4: ; %Flow
 ; GFX1064DAGISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB3_8
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB4_8
 ; GFX1064DAGISEL-NEXT:  ; %bb.5: ; %if
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX1064DAGISEL-NEXT:    s_mov_b32 s8, 0x7fc00000
-; GFX1064DAGISEL-NEXT:  .LBB3_6: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB4_6: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s9, s[6:7]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s10, v3, s9
 ; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s9
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX1064DAGISEL-NEXT:    v_min_f32_e64 v2, s8, s10
 ; GFX1064DAGISEL-NEXT:    v_readfirstlane_b32 s8, v2
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_6
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_6
 ; GFX1064DAGISEL-NEXT:  ; %bb.7:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v4, s8
-; GFX1064DAGISEL-NEXT:  .LBB3_8: ; %endif
+; GFX1064DAGISEL-NEXT:  .LBB4_8: ; %endif
 ; GFX1064DAGISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064DAGISEL-NEXT:    global_store_dword v[0:1], v4, off
 ; GFX1064DAGISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1093,11 +1971,11 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX1064GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v4
 ; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB3_3
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX1064GISEL-NEXT:    s_mov_b32 s8, 0x7fc00000
-; GFX1064GISEL-NEXT:  .LBB3_2: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB4_2: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s9, s[6:7]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s10, v2, s9
 ; GFX1064GISEL-NEXT:    s_bitset0_b64 s[6:7], s9
@@ -1105,22 +1983,22 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX1064GISEL-NEXT:    v_min_f32_e64 v3, s8, s10
 ; GFX1064GISEL-NEXT:    v_readfirstlane_b32 s8, v3
 ; GFX1064GISEL-NEXT:    ; implicit-def: $vgpr3
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB3_2
-; GFX1064GISEL-NEXT:  .LBB3_3: ; %Flow
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB4_2
+; GFX1064GISEL-NEXT:  .LBB4_3: ; %Flow
 ; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB3_6
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB4_6
 ; GFX1064GISEL-NEXT:  ; %bb.4: ; %if
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX1064GISEL-NEXT:    s_mov_b32 s8, 0x7fc00000
-; GFX1064GISEL-NEXT:  .LBB3_5: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB4_5: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s9, s[6:7]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s10, v3, s9
 ; GFX1064GISEL-NEXT:    s_bitset0_b64 s[6:7], s9
 ; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX1064GISEL-NEXT:    v_min_f32_e64 v2, s8, s10
 ; GFX1064GISEL-NEXT:    v_readfirstlane_b32 s8, v2
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB3_5
-; GFX1064GISEL-NEXT:  .LBB3_6: ; %endif
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB4_5
+; GFX1064GISEL-NEXT:  .LBB4_6: ; %endif
 ; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX1064GISEL-NEXT:    global_store_dword v[0:1], v2, off
@@ -1134,38 +2012,38 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX1032DAGISEL-NEXT:    ; implicit-def: $vgpr4
 ; GFX1032DAGISEL-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1032DAGISEL-NEXT:    s_xor_b32 s4, exec_lo, s4
-; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB3_4
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB4_4
 ; GFX1032DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s5, exec_lo
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s6, 0x7fc00000
-; GFX1032DAGISEL-NEXT:  .LBB3_2: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB4_2: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s7, s5
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s8, v2, s7
 ; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s5, s7
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX1032DAGISEL-NEXT:    v_min_f32_e64 v3, s6, s8
 ; GFX1032DAGISEL-NEXT:    v_readfirstlane_b32 s6, v3
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_2
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_2
 ; GFX1032DAGISEL-NEXT:  ; %bb.3:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v4, s6
 ; GFX1032DAGISEL-NEXT:    ; implicit-def: $vgpr3
-; GFX1032DAGISEL-NEXT:  .LBB3_4: ; %Flow
+; GFX1032DAGISEL-NEXT:  .LBB4_4: ; %Flow
 ; GFX1032DAGISEL-NEXT:    s_andn2_saveexec_b32 s4, s4
-; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB3_8
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB4_8
 ; GFX1032DAGISEL-NEXT:  ; %bb.5: ; %if
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s5, exec_lo
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s6, 0x7fc00000
-; GFX1032DAGISEL-NEXT:  .LBB3_6: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB4_6: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s7, s5
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s8, v3, s7
 ; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s5, s7
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX1032DAGISEL-NEXT:    v_min_f32_e64 v2, s6, s8
 ; GFX1032DAGISEL-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_6
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_6
 ; GFX1032DAGISEL-NEXT:  ; %bb.7:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v4, s6
-; GFX1032DAGISEL-NEXT:  .LBB3_8: ; %endif
+; GFX1032DAGISEL-NEXT:  .LBB4_8: ; %endif
 ; GFX1032DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1032DAGISEL-NEXT:    global_store_dword v[0:1], v4, off
 ; GFX1032DAGISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1178,11 +2056,11 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX1032GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 16, v4
 ; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s5, vcc_lo
 ; GFX1032GISEL-NEXT:    s_xor_b32 s5, exec_lo, s5
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB3_3
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX1032GISEL-NEXT:    s_mov_b32 s4, 0x7fc00000
-; GFX1032GISEL-NEXT:  .LBB3_2: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB4_2: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s8, v2, s7
 ; GFX1032GISEL-NEXT:    s_bitset0_b32 s6, s7
@@ -1190,22 +2068,22 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX1032GISEL-NEXT:    v_min_f32_e64 v3, s4, s8
 ; GFX1032GISEL-NEXT:    v_readfirstlane_b32 s4, v3
 ; GFX1032GISEL-NEXT:    ; implicit-def: $vgpr3
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB3_2
-; GFX1032GISEL-NEXT:  .LBB3_3: ; %Flow
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB4_2
+; GFX1032GISEL-NEXT:  .LBB4_3: ; %Flow
 ; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s5, s5
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB3_6
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB4_6
 ; GFX1032GISEL-NEXT:  ; %bb.4: ; %if
 ; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX1032GISEL-NEXT:    s_mov_b32 s4, 0x7fc00000
-; GFX1032GISEL-NEXT:  .LBB3_5: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB4_5: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s8, v3, s7
 ; GFX1032GISEL-NEXT:    s_bitset0_b32 s6, s7
 ; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX1032GISEL-NEXT:    v_min_f32_e64 v2, s4, s8
 ; GFX1032GISEL-NEXT:    v_readfirstlane_b32 s4, v2
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB3_5
-; GFX1032GISEL-NEXT:  .LBB3_6: ; %endif
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB4_5
+; GFX1032GISEL-NEXT:  .LBB4_6: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032GISEL-NEXT:    global_store_dword v[0:1], v2, off
@@ -1220,11 +2098,11 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX1164DAGISEL-NEXT:    ; implicit-def: $vgpr4
 ; GFX1164DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1164DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB3_4
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB4_4
 ; GFX1164DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164DAGISEL-NEXT:    s_mov_b32 s4, 0x7fc00000
-; GFX1164DAGISEL-NEXT:  .LBB3_2: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB4_2: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s5, s[2:3]
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s6, v2, s5
@@ -1233,17 +2111,17 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX1164DAGISEL-NEXT:    v_min_f32_e64 v3, s4, s6
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164DAGISEL-NEXT:    v_readfirstlane_b32 s4, v3
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_2
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_2
 ; GFX1164DAGISEL-NEXT:  ; %bb.3:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX1164DAGISEL-NEXT:    ; implicit-def: $vgpr3
-; GFX1164DAGISEL-NEXT:  .LBB3_4: ; %Flow
+; GFX1164DAGISEL-NEXT:  .LBB4_4: ; %Flow
 ; GFX1164DAGISEL-NEXT:    s_and_not1_saveexec_b64 s[0:1], s[0:1]
-; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB3_8
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB4_8
 ; GFX1164DAGISEL-NEXT:  ; %bb.5: ; %if
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164DAGISEL-NEXT:    s_mov_b32 s4, 0x7fc00000
-; GFX1164DAGISEL-NEXT:  .LBB3_6: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB4_6: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s5, s[2:3]
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s6, v3, s5
@@ -1252,10 +2130,10 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX1164DAGISEL-NEXT:    v_min_f32_e64 v2, s4, s6
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164DAGISEL-NEXT:    v_readfirstlane_b32 s4, v2
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_6
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_6
 ; GFX1164DAGISEL-NEXT:  ; %bb.7:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
-; GFX1164DAGISEL-NEXT:  .LBB3_8: ; %endif
+; GFX1164DAGISEL-NEXT:  .LBB4_8: ; %endif
 ; GFX1164DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1164DAGISEL-NEXT:    global_store_b32 v[0:1], v4, off
 ; GFX1164DAGISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1269,11 +2147,11 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v4
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB3_3
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164GISEL-NEXT:    s_mov_b32 s4, 0x7fc00000
-; GFX1164GISEL-NEXT:  .LBB3_2: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB4_2: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s5, s[2:3]
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s6, v2, s5
@@ -1283,14 +2161,14 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_readfirstlane_b32 s4, v3
 ; GFX1164GISEL-NEXT:    ; implicit-def: $vgpr3
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB3_2
-; GFX1164GISEL-NEXT:  .LBB3_3: ; %Flow
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB4_2
+; GFX1164GISEL-NEXT:  .LBB4_3: ; %Flow
 ; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[0:1], s[0:1]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB3_6
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB4_6
 ; GFX1164GISEL-NEXT:  ; %bb.4: ; %if
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164GISEL-NEXT:    s_mov_b32 s4, 0x7fc00000
-; GFX1164GISEL-NEXT:  .LBB3_5: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB4_5: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s5, s[2:3]
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s6, v3, s5
@@ -1299,8 +2177,8 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX1164GISEL-NEXT:    v_min_f32_e64 v2, s4, s6
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_readfirstlane_b32 s4, v2
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB3_5
-; GFX1164GISEL-NEXT:  .LBB3_6: ; %endif
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB4_5
+; GFX1164GISEL-NEXT:  .LBB4_6: ; %endif
 ; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1164GISEL-NEXT:    global_store_b32 v[0:1], v2, off
@@ -1315,11 +2193,11 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX1132DAGISEL-NEXT:    ; implicit-def: $vgpr4
 ; GFX1132DAGISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1132DAGISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB3_4
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB4_4
 ; GFX1132DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, 0x7fc00000
-; GFX1132DAGISEL-NEXT:  .LBB3_2: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB4_2: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s1
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s4, v2, s3
@@ -1328,17 +2206,17 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX1132DAGISEL-NEXT:    v_min_f32_e64 v3, s2, s4
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132DAGISEL-NEXT:    v_readfirstlane_b32 s2, v3
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_2
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_2
 ; GFX1132DAGISEL-NEXT:  ; %bb.3:
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX1132DAGISEL-NEXT:    ; implicit-def: $vgpr3
-; GFX1132DAGISEL-NEXT:  .LBB3_4: ; %Flow
+; GFX1132DAGISEL-NEXT:  .LBB4_4: ; %Flow
 ; GFX1132DAGISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB3_8
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB4_8
 ; GFX1132DAGISEL-NEXT:  ; %bb.5: ; %if
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, 0x7fc00000
-; GFX1132DAGISEL-NEXT:  .LBB3_6: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB4_6: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s1
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s4, v3, s3
@@ -1347,10 +2225,10 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX1132DAGISEL-NEXT:    v_min_f32_e64 v2, s2, s4
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132DAGISEL-NEXT:    v_readfirstlane_b32 s2, v2
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_6
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_6
 ; GFX1132DAGISEL-NEXT:  ; %bb.7:
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v4, s2
-; GFX1132DAGISEL-NEXT:  .LBB3_8: ; %endif
+; GFX1132DAGISEL-NEXT:  .LBB4_8: ; %endif
 ; GFX1132DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1132DAGISEL-NEXT:    global_store_b32 v[0:1], v4, off
 ; GFX1132DAGISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1364,11 +2242,11 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v4
 ; GFX1132GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB3_3
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132GISEL-NEXT:    s_mov_b32 s0, 0x7fc00000
-; GFX1132GISEL-NEXT:  .LBB3_2: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB4_2: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s4, v2, s3
@@ -1378,14 +2256,14 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_readfirstlane_b32 s0, v3
 ; GFX1132GISEL-NEXT:    ; implicit-def: $vgpr3
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB3_2
-; GFX1132GISEL-NEXT:  .LBB3_3: ; %Flow
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB4_2
+; GFX1132GISEL-NEXT:  .LBB4_3: ; %Flow
 ; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s1, s1
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB3_6
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB4_6
 ; GFX1132GISEL-NEXT:  ; %bb.4: ; %if
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132GISEL-NEXT:    s_mov_b32 s0, 0x7fc00000
-; GFX1132GISEL-NEXT:  .LBB3_5: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB4_5: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s4, v3, s3
@@ -1394,8 +2272,8 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX1132GISEL-NEXT:    v_min_f32_e64 v2, s0, s4
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_readfirstlane_b32 s0, v2
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB3_5
-; GFX1132GISEL-NEXT:  .LBB3_6: ; %endif
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB4_5
+; GFX1132GISEL-NEXT:  .LBB4_6: ; %endif
 ; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX1132GISEL-NEXT:    global_store_b32 v[0:1], v2, off
@@ -1415,11 +2293,11 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX12DAGISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX12DAGISEL-NEXT:    s_cbranch_execz .LBB3_4
+; GFX12DAGISEL-NEXT:    s_cbranch_execz .LBB4_4
 ; GFX12DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s2, 0x7fc00000
-; GFX12DAGISEL-NEXT:  .LBB3_2: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT:  .LBB4_2: ; =>This Inner Loop Header: Depth=1
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    s_ctz_i32_b32 s3, s1
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
@@ -1430,19 +2308,19 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX12DAGISEL-NEXT:    v_min_num_f32_e64 v3, s2, s4
 ; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12DAGISEL-NEXT:    v_readfirstlane_b32 s2, v3
-; GFX12DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_2
+; GFX12DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_2
 ; GFX12DAGISEL-NEXT:  ; %bb.3:
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12DAGISEL-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX12DAGISEL-NEXT:    ; implicit-def: $vgpr3
-; GFX12DAGISEL-NEXT:  .LBB3_4: ; %Flow
+; GFX12DAGISEL-NEXT:  .LBB4_4: ; %Flow
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX12DAGISEL-NEXT:    s_cbranch_execz .LBB3_8
+; GFX12DAGISEL-NEXT:    s_cbranch_execz .LBB4_8
 ; GFX12DAGISEL-NEXT:  ; %bb.5: ; %if
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s2, 0x7fc00000
-; GFX12DAGISEL-NEXT:  .LBB3_6: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT:  .LBB4_6: ; =>This Inner Loop Header: Depth=1
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    s_ctz_i32_b32 s3, s1
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
@@ -1453,11 +2331,11 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) {
 ; GFX12DAGISEL-NEXT:    v_min_num_f32_e64 v2, s2, s4
 ; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12DAGISEL-NEXT:    v_readfirstlane_b32 s2, v2
-; GFX12DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_6
+; GFX12DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_6
 ; GFX12DAGISEL-NEXT:  ; %bb.7:
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12DAGISEL-NEXT:    v_mov_b32_e32 v4, s2
-; GFX12DAGISEL-NEXT:  .LBB3_8: ; %endif
+; GFX12DAGISEL-NEXT:  .LBB4_8: ; %endif
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12DAGISEL-NEXT:    global_store_b32 v[0:1], v4, off
@@ -1603,7 +2481,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX8DAGISEL-NEXT:    s_mov_b32 s6, 0
 ; GFX8DAGISEL-NEXT:    s_mov_b32 s7, 0x7ff80000
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
-; GFX8DAGISEL-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[4:5]
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v4, s6
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
@@ -1614,7 +2492,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX8DAGISEL-NEXT:    v_readfirstlane_b32 s6, v4
 ; GFX8DAGISEL-NEXT:    v_readfirstlane_b32 s7, v5
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX8DAGISEL-NEXT:  ; %bb.2:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s7
@@ -1628,7 +2506,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX8GISEL-NEXT:    s_mov_b32 s6, 0
 ; GFX8GISEL-NEXT:    s_mov_b32 s7, 0x7ff80000
 ; GFX8GISEL-NEXT:    s_mov_b64 s[4:5], exec
-; GFX8GISEL-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s10, s[4:5]
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v4, s6
 ; GFX8GISEL-NEXT:    v_readlane_b32 s8, v2, s10
@@ -1639,7 +2517,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX8GISEL-NEXT:    v_readfirstlane_b32 s6, v4
 ; GFX8GISEL-NEXT:    v_readfirstlane_b32 s7, v5
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX8GISEL-NEXT:  ; %bb.2:
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s7
@@ -1653,7 +2531,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX9DAGISEL-NEXT:    s_mov_b32 s6, 0
 ; GFX9DAGISEL-NEXT:    s_mov_b32 s7, 0x7ff80000
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
-; GFX9DAGISEL-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[4:5]
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v4, s6
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
@@ -1664,7 +2542,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX9DAGISEL-NEXT:    v_readfirstlane_b32 s6, v4
 ; GFX9DAGISEL-NEXT:    v_readfirstlane_b32 s7, v5
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX9DAGISEL-NEXT:  ; %bb.2:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v3, s7
@@ -1678,7 +2556,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX9GISEL-NEXT:    s_mov_b32 s6, 0
 ; GFX9GISEL-NEXT:    s_mov_b32 s7, 0x7ff80000
 ; GFX9GISEL-NEXT:    s_mov_b64 s[4:5], exec
-; GFX9GISEL-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s10, s[4:5]
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v4, s6
 ; GFX9GISEL-NEXT:    v_readlane_b32 s8, v2, s10
@@ -1689,7 +2567,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX9GISEL-NEXT:    v_readfirstlane_b32 s6, v4
 ; GFX9GISEL-NEXT:    v_readfirstlane_b32 s7, v5
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX9GISEL-NEXT:  ; %bb.2:
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v3, s7
@@ -1703,7 +2581,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX1064DAGISEL-NEXT:    s_mov_b32 s6, 0
 ; GFX1064DAGISEL-NEXT:    s_mov_b32 s7, 0x7ff80000
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
-; GFX1064DAGISEL-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[4:5]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s9, v3, s10
@@ -1712,7 +2590,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX1064DAGISEL-NEXT:    v_min_f64 v[4:5], s[8:9], s[6:7]
 ; GFX1064DAGISEL-NEXT:    v_readfirstlane_b32 s6, v4
 ; GFX1064DAGISEL-NEXT:    v_readfirstlane_b32 s7, v5
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX1064DAGISEL-NEXT:  ; %bb.2:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v3, s7
@@ -1725,7 +2603,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX1064GISEL-NEXT:    s_mov_b32 s6, 0
 ; GFX1064GISEL-NEXT:    s_mov_b32 s7, 0x7ff80000
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[4:5], exec
-; GFX1064GISEL-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s10, s[4:5]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s9, v3, s10
@@ -1734,7 +2612,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX1064GISEL-NEXT:    v_min_f64 v[4:5], s[8:9], s[6:7]
 ; GFX1064GISEL-NEXT:    v_readfirstlane_b32 s6, v4
 ; GFX1064GISEL-NEXT:    v_readfirstlane_b32 s7, v5
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX1064GISEL-NEXT:  ; %bb.2:
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v3, s7
@@ -1747,7 +2625,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s5, 0x7ff80000
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1032DAGISEL-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s8, v2, s7
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s9, v3, s7
@@ -1756,7 +2634,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX1032DAGISEL-NEXT:    v_min_f64 v[4:5], s[8:9], s[4:5]
 ; GFX1032DAGISEL-NEXT:    v_readfirstlane_b32 s4, v4
 ; GFX1032DAGISEL-NEXT:    v_readfirstlane_b32 s5, v5
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX1032DAGISEL-NEXT:  ; %bb.2:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1769,7 +2647,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX1032GISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX1032GISEL-NEXT:    s_mov_b32 s5, 0x7ff80000
 ; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1032GISEL-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s8, v2, s7
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s9, v3, s7
@@ -1778,7 +2656,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX1032GISEL-NEXT:    v_min_f64 v[4:5], s[8:9], s[4:5]
 ; GFX1032GISEL-NEXT:    v_readfirstlane_b32 s4, v4
 ; GFX1032GISEL-NEXT:    v_readfirstlane_b32 s5, v5
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX1032GISEL-NEXT:  ; %bb.2:
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1791,7 +2669,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX1164DAGISEL-NEXT:    s_mov_b32 s2, 0
 ; GFX1164DAGISEL-NEXT:    s_mov_b32 s3, 0x7ff80000
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164DAGISEL-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s6, s[0:1]
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s4, v2, s6
@@ -1803,7 +2681,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX1164DAGISEL-NEXT:    v_readfirstlane_b32 s2, v4
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164DAGISEL-NEXT:    v_readfirstlane_b32 s3, v5
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX1164DAGISEL-NEXT:  ; %bb.2:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v3, s3
@@ -1816,7 +2694,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX1164GISEL-NEXT:    s_mov_b32 s2, 0
 ; GFX1164GISEL-NEXT:    s_mov_b32 s3, 0x7ff80000
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164GISEL-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s6, s[0:1]
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s4, v2, s6
@@ -1828,7 +2706,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX1164GISEL-NEXT:    v_readfirstlane_b32 s2, v4
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164GISEL-NEXT:    v_readfirstlane_b32 s3, v5
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX1164GISEL-NEXT:  ; %bb.2:
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v3, s3
@@ -1841,7 +2719,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s0, 0
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s1, 0x7ff80000
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX1132DAGISEL-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s4, v2, s3
@@ -1853,7 +2731,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX1132DAGISEL-NEXT:    v_readfirstlane_b32 s0, v4
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132DAGISEL-NEXT:    v_readfirstlane_b32 s1, v5
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX1132DAGISEL-NEXT:  ; %bb.2:
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1132DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -1865,7 +2743,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX1132GISEL-NEXT:    s_mov_b32 s0, 0
 ; GFX1132GISEL-NEXT:    s_mov_b32 s1, 0x7ff80000
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX1132GISEL-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s4, v2, s3
@@ -1877,7 +2755,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX1132GISEL-NEXT:    v_readfirstlane_b32 s0, v4
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132GISEL-NEXT:    v_readfirstlane_b32 s1, v5
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX1132GISEL-NEXT:  ; %bb.2:
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1132GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -1893,7 +2771,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s0, 0
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s1, 0x7ff80000
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX12DAGISEL-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
@@ -1906,7 +2784,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) {
 ; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12DAGISEL-NEXT:    v_readfirstlane_b32 s0, v4
 ; GFX12DAGISEL-NEXT:    v_readfirstlane_b32 s1, v5
-; GFX12DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX12DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX12DAGISEL-NEXT:  ; %bb.2:
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
@@ -1927,12 +2805,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX8DAGISEL-NEXT:    ; implicit-def: $vgpr6_vgpr7
 ; GFX8DAGISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX8DAGISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB6_4
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB7_4
 ; GFX8DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8DAGISEL-NEXT:    s_mov_b32 s8, 0
 ; GFX8DAGISEL-NEXT:    s_mov_b32 s9, 0x7ff80000
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8DAGISEL-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v4, s8
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s10, v2, s12
@@ -1943,20 +2821,20 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX8DAGISEL-NEXT:    v_readfirstlane_b32 s8, v4
 ; GFX8DAGISEL-NEXT:    v_readfirstlane_b32 s9, v5
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_2
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_2
 ; GFX8DAGISEL-NEXT:  ; %bb.3:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, s8
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, s9
 ; GFX8DAGISEL-NEXT:    ; implicit-def: $vgpr4
 ; GFX8DAGISEL-NEXT:    ; implicit-def: $vgpr5
-; GFX8DAGISEL-NEXT:  .LBB6_4: ; %Flow
+; GFX8DAGISEL-NEXT:  .LBB7_4: ; %Flow
 ; GFX8DAGISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB6_8
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB7_8
 ; GFX8DAGISEL-NEXT:  ; %bb.5: ; %if
 ; GFX8DAGISEL-NEXT:    s_mov_b32 s8, 0
 ; GFX8DAGISEL-NEXT:    s_mov_b32 s9, 0x7ff80000
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8DAGISEL-NEXT:  .LBB6_6: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB7_6: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s10, v4, s12
@@ -1967,11 +2845,11 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX8DAGISEL-NEXT:    v_readfirstlane_b32 s8, v2
 ; GFX8DAGISEL-NEXT:    v_readfirstlane_b32 s9, v3
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_6
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_6
 ; GFX8DAGISEL-NEXT:  ; %bb.7:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, s8
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, s9
-; GFX8DAGISEL-NEXT:  .LBB6_8: ; %endif
+; GFX8DAGISEL-NEXT:  .LBB7_8: ; %endif
 ; GFX8DAGISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[0:1], v[6:7]
 ; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0)
@@ -1985,12 +2863,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX8GISEL-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
 ; GFX8GISEL-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB6_3
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX8GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8GISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX8GISEL-NEXT:    s_mov_b32 s5, 0x7ff80000
 ; GFX8GISEL-NEXT:    s_mov_b64 s[8:9], exec
-; GFX8GISEL-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s12, s[8:9]
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX8GISEL-NEXT:    v_readlane_b32 s10, v2, s12
@@ -2003,15 +2881,15 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX8GISEL-NEXT:    v_readfirstlane_b32 s5, v5
 ; GFX8GISEL-NEXT:    ; implicit-def: $vgpr4
 ; GFX8GISEL-NEXT:    ; implicit-def: $vgpr5
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB6_2
-; GFX8GISEL-NEXT:  .LBB6_3: ; %Flow
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB7_2
+; GFX8GISEL-NEXT:  .LBB7_3: ; %Flow
 ; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB7_6
 ; GFX8GISEL-NEXT:  ; %bb.4: ; %if
 ; GFX8GISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX8GISEL-NEXT:    s_mov_b32 s5, 0x7ff80000
 ; GFX8GISEL-NEXT:    s_mov_b64 s[8:9], exec
-; GFX8GISEL-NEXT:  .LBB6_5: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB7_5: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s12, s[8:9]
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8GISEL-NEXT:    v_readlane_b32 s10, v4, s12
@@ -2022,8 +2900,8 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[8:9], 0
 ; GFX8GISEL-NEXT:    v_readfirstlane_b32 s4, v2
 ; GFX8GISEL-NEXT:    v_readfirstlane_b32 s5, v3
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB6_5
-; GFX8GISEL-NEXT:  .LBB6_6: ; %endif
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB7_5
+; GFX8GISEL-NEXT:  .LBB7_6: ; %endif
 ; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2039,12 +2917,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX9DAGISEL-NEXT:    ; implicit-def: $vgpr6_vgpr7
 ; GFX9DAGISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9DAGISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB6_4
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB7_4
 ; GFX9DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9DAGISEL-NEXT:    s_mov_b32 s8, 0
 ; GFX9DAGISEL-NEXT:    s_mov_b32 s9, 0x7ff80000
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9DAGISEL-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v4, s8
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s10, v2, s12
@@ -2055,20 +2933,20 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX9DAGISEL-NEXT:    v_readfirstlane_b32 s8, v4
 ; GFX9DAGISEL-NEXT:    v_readfirstlane_b32 s9, v5
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_2
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_2
 ; GFX9DAGISEL-NEXT:  ; %bb.3:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, s8
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, s9
 ; GFX9DAGISEL-NEXT:    ; implicit-def: $vgpr4
 ; GFX9DAGISEL-NEXT:    ; implicit-def: $vgpr5
-; GFX9DAGISEL-NEXT:  .LBB6_4: ; %Flow
+; GFX9DAGISEL-NEXT:  .LBB7_4: ; %Flow
 ; GFX9DAGISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB6_8
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB7_8
 ; GFX9DAGISEL-NEXT:  ; %bb.5: ; %if
 ; GFX9DAGISEL-NEXT:    s_mov_b32 s8, 0
 ; GFX9DAGISEL-NEXT:    s_mov_b32 s9, 0x7ff80000
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9DAGISEL-NEXT:  .LBB6_6: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB7_6: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s10, v4, s12
@@ -2079,11 +2957,11 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX9DAGISEL-NEXT:    v_readfirstlane_b32 s8, v2
 ; GFX9DAGISEL-NEXT:    v_readfirstlane_b32 s9, v3
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_6
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_6
 ; GFX9DAGISEL-NEXT:  ; %bb.7:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, s8
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, s9
-; GFX9DAGISEL-NEXT:  .LBB6_8: ; %endif
+; GFX9DAGISEL-NEXT:  .LBB7_8: ; %endif
 ; GFX9DAGISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off
 ; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0)
@@ -2097,12 +2975,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX9GISEL-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
 ; GFX9GISEL-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB6_3
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX9GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9GISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX9GISEL-NEXT:    s_mov_b32 s5, 0x7ff80000
 ; GFX9GISEL-NEXT:    s_mov_b64 s[8:9], exec
-; GFX9GISEL-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s12, s[8:9]
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX9GISEL-NEXT:    v_readlane_b32 s10, v2, s12
@@ -2115,15 +2993,15 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX9GISEL-NEXT:    v_readfirstlane_b32 s5, v5
 ; GFX9GISEL-NEXT:    ; implicit-def: $vgpr4
 ; GFX9GISEL-NEXT:    ; implicit-def: $vgpr5
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB6_2
-; GFX9GISEL-NEXT:  .LBB6_3: ; %Flow
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB7_2
+; GFX9GISEL-NEXT:  .LBB7_3: ; %Flow
 ; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB7_6
 ; GFX9GISEL-NEXT:  ; %bb.4: ; %if
 ; GFX9GISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX9GISEL-NEXT:    s_mov_b32 s5, 0x7ff80000
 ; GFX9GISEL-NEXT:    s_mov_b64 s[8:9], exec
-; GFX9GISEL-NEXT:  .LBB6_5: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB7_5: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s12, s[8:9]
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9GISEL-NEXT:    v_readlane_b32 s10, v4, s12
@@ -2134,8 +3012,8 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[8:9], 0
 ; GFX9GISEL-NEXT:    v_readfirstlane_b32 s4, v2
 ; GFX9GISEL-NEXT:    v_readfirstlane_b32 s5, v3
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB6_5
-; GFX9GISEL-NEXT:  .LBB6_6: ; %endif
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB7_5
+; GFX9GISEL-NEXT:  .LBB7_6: ; %endif
 ; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2151,12 +3029,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1064DAGISEL-NEXT:    ; implicit-def: $vgpr6_vgpr7
 ; GFX1064DAGISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064DAGISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB6_4
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB7_4
 ; GFX1064DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064DAGISEL-NEXT:    s_mov_b32 s8, 0
 ; GFX1064DAGISEL-NEXT:    s_mov_b32 s9, 0x7ff80000
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064DAGISEL-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s10, v2, s12
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s11, v3, s12
@@ -2165,20 +3043,20 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1064DAGISEL-NEXT:    v_min_f64 v[4:5], s[10:11], s[8:9]
 ; GFX1064DAGISEL-NEXT:    v_readfirstlane_b32 s8, v4
 ; GFX1064DAGISEL-NEXT:    v_readfirstlane_b32 s9, v5
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_2
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_2
 ; GFX1064DAGISEL-NEXT:  ; %bb.3:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v6, s8
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v7, s9
 ; GFX1064DAGISEL-NEXT:    ; implicit-def: $vgpr4
 ; GFX1064DAGISEL-NEXT:    ; implicit-def: $vgpr5
-; GFX1064DAGISEL-NEXT:  .LBB6_4: ; %Flow
+; GFX1064DAGISEL-NEXT:  .LBB7_4: ; %Flow
 ; GFX1064DAGISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB6_8
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB7_8
 ; GFX1064DAGISEL-NEXT:  ; %bb.5: ; %if
 ; GFX1064DAGISEL-NEXT:    s_mov_b32 s8, 0
 ; GFX1064DAGISEL-NEXT:    s_mov_b32 s9, 0x7ff80000
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064DAGISEL-NEXT:  .LBB6_6: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB7_6: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s10, v4, s12
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s11, v5, s12
@@ -2187,11 +3065,11 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1064DAGISEL-NEXT:    v_min_f64 v[2:3], s[10:11], s[8:9]
 ; GFX1064DAGISEL-NEXT:    v_readfirstlane_b32 s8, v2
 ; GFX1064DAGISEL-NEXT:    v_readfirstlane_b32 s9, v3
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_6
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_6
 ; GFX1064DAGISEL-NEXT:  ; %bb.7:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v6, s8
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v7, s9
-; GFX1064DAGISEL-NEXT:  .LBB6_8: ; %endif
+; GFX1064DAGISEL-NEXT:  .LBB7_8: ; %endif
 ; GFX1064DAGISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off
 ; GFX1064DAGISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -2204,12 +3082,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1064GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v6
 ; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB6_3
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064GISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX1064GISEL-NEXT:    s_mov_b32 s5, 0x7ff80000
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[8:9], exec
-; GFX1064GISEL-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s12, s[8:9]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s10, v2, s12
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s11, v3, s12
@@ -2220,15 +3098,15 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1064GISEL-NEXT:    v_readfirstlane_b32 s5, v5
 ; GFX1064GISEL-NEXT:    ; implicit-def: $vgpr4
 ; GFX1064GISEL-NEXT:    ; implicit-def: $vgpr5
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB6_2
-; GFX1064GISEL-NEXT:  .LBB6_3: ; %Flow
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB7_2
+; GFX1064GISEL-NEXT:  .LBB7_3: ; %Flow
 ; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB7_6
 ; GFX1064GISEL-NEXT:  ; %bb.4: ; %if
 ; GFX1064GISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX1064GISEL-NEXT:    s_mov_b32 s5, 0x7ff80000
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[8:9], exec
-; GFX1064GISEL-NEXT:  .LBB6_5: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB7_5: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s12, s[8:9]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s10, v4, s12
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s11, v5, s12
@@ -2237,8 +3115,8 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1064GISEL-NEXT:    v_min_f64 v[2:3], s[10:11], s[4:5]
 ; GFX1064GISEL-NEXT:    v_readfirstlane_b32 s4, v2
 ; GFX1064GISEL-NEXT:    v_readfirstlane_b32 s5, v3
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB6_5
-; GFX1064GISEL-NEXT:  .LBB6_6: ; %endif
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB7_5
+; GFX1064GISEL-NEXT:  .LBB7_6: ; %endif
 ; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2253,12 +3131,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1032DAGISEL-NEXT:    ; implicit-def: $vgpr6_vgpr7
 ; GFX1032DAGISEL-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1032DAGISEL-NEXT:    s_xor_b32 s6, exec_lo, s4
-; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB6_4
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB7_4
 ; GFX1032DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s5, 0x7ff80000
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s7, exec_lo
-; GFX1032DAGISEL-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s10, s7
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s9, v3, s10
@@ -2267,20 +3145,20 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1032DAGISEL-NEXT:    v_min_f64 v[4:5], s[8:9], s[4:5]
 ; GFX1032DAGISEL-NEXT:    v_readfirstlane_b32 s4, v4
 ; GFX1032DAGISEL-NEXT:    v_readfirstlane_b32 s5, v5
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_2
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_2
 ; GFX1032DAGISEL-NEXT:  ; %bb.3:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v7, s5
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v6, s4
 ; GFX1032DAGISEL-NEXT:    ; implicit-def: $vgpr4
 ; GFX1032DAGISEL-NEXT:    ; implicit-def: $vgpr5
-; GFX1032DAGISEL-NEXT:  .LBB6_4: ; %Flow
+; GFX1032DAGISEL-NEXT:  .LBB7_4: ; %Flow
 ; GFX1032DAGISEL-NEXT:    s_andn2_saveexec_b32 s6, s6
-; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB6_8
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB7_8
 ; GFX1032DAGISEL-NEXT:  ; %bb.5: ; %if
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s5, 0x7ff80000
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s7, exec_lo
-; GFX1032DAGISEL-NEXT:  .LBB6_6: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB7_6: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s10, s7
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s8, v4, s10
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s9, v5, s10
@@ -2289,11 +3167,11 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1032DAGISEL-NEXT:    v_min_f64 v[2:3], s[8:9], s[4:5]
 ; GFX1032DAGISEL-NEXT:    v_readfirstlane_b32 s4, v2
 ; GFX1032DAGISEL-NEXT:    v_readfirstlane_b32 s5, v3
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_6
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_6
 ; GFX1032DAGISEL-NEXT:  ; %bb.7:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v7, s5
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v6, s4
-; GFX1032DAGISEL-NEXT:  .LBB6_8: ; %endif
+; GFX1032DAGISEL-NEXT:  .LBB7_8: ; %endif
 ; GFX1032DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; GFX1032DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off
 ; GFX1032DAGISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -2306,12 +3184,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1032GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 16, v6
 ; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s6, vcc_lo
 ; GFX1032GISEL-NEXT:    s_xor_b32 s6, exec_lo, s6
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB6_3
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032GISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX1032GISEL-NEXT:    s_mov_b32 s5, 0x7ff80000
 ; GFX1032GISEL-NEXT:    s_mov_b32 s7, exec_lo
-; GFX1032GISEL-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s10, s7
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s9, v3, s10
@@ -2322,15 +3200,15 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1032GISEL-NEXT:    v_readfirstlane_b32 s5, v5
 ; GFX1032GISEL-NEXT:    ; implicit-def: $vgpr4
 ; GFX1032GISEL-NEXT:    ; implicit-def: $vgpr5
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB6_2
-; GFX1032GISEL-NEXT:  .LBB6_3: ; %Flow
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB7_2
+; GFX1032GISEL-NEXT:  .LBB7_3: ; %Flow
 ; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s6, s6
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB7_6
 ; GFX1032GISEL-NEXT:  ; %bb.4: ; %if
 ; GFX1032GISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX1032GISEL-NEXT:    s_mov_b32 s5, 0x7ff80000
 ; GFX1032GISEL-NEXT:    s_mov_b32 s7, exec_lo
-; GFX1032GISEL-NEXT:  .LBB6_5: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB7_5: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s10, s7
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s8, v4, s10
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s9, v5, s10
@@ -2339,8 +3217,8 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1032GISEL-NEXT:    v_min_f64 v[2:3], s[8:9], s[4:5]
 ; GFX1032GISEL-NEXT:    v_readfirstlane_b32 s4, v2
 ; GFX1032GISEL-NEXT:    v_readfirstlane_b32 s5, v3
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB6_5
-; GFX1032GISEL-NEXT:  .LBB6_6: ; %endif
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB7_5
+; GFX1032GISEL-NEXT:  .LBB7_6: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2356,12 +3234,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1164DAGISEL-NEXT:    ; implicit-def: $vgpr6_vgpr7
 ; GFX1164DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1164DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB6_4
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB7_4
 ; GFX1164DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164DAGISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX1164DAGISEL-NEXT:    s_mov_b32 s5, 0x7ff80000
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s8, s[2:3]
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s6, v2, s8
@@ -2373,20 +3251,20 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1164DAGISEL-NEXT:    v_readfirstlane_b32 s4, v4
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164DAGISEL-NEXT:    v_readfirstlane_b32 s5, v5
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_2
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_2
 ; GFX1164DAGISEL-NEXT:  ; %bb.3:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v7, s5
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v6, s4
 ; GFX1164DAGISEL-NEXT:    ; implicit-def: $vgpr4
 ; GFX1164DAGISEL-NEXT:    ; implicit-def: $vgpr5
-; GFX1164DAGISEL-NEXT:  .LBB6_4: ; %Flow
+; GFX1164DAGISEL-NEXT:  .LBB7_4: ; %Flow
 ; GFX1164DAGISEL-NEXT:    s_and_not1_saveexec_b64 s[0:1], s[0:1]
-; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB6_8
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB7_8
 ; GFX1164DAGISEL-NEXT:  ; %bb.5: ; %if
 ; GFX1164DAGISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX1164DAGISEL-NEXT:    s_mov_b32 s5, 0x7ff80000
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT:  .LBB6_6: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB7_6: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s8, s[2:3]
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s6, v4, s8
@@ -2398,11 +3276,11 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1164DAGISEL-NEXT:    v_readfirstlane_b32 s4, v2
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164DAGISEL-NEXT:    v_readfirstlane_b32 s5, v3
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_6
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_6
 ; GFX1164DAGISEL-NEXT:  ; %bb.7:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v7, s5
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v6, s4
-; GFX1164DAGISEL-NEXT:  .LBB6_8: ; %endif
+; GFX1164DAGISEL-NEXT:  .LBB7_8: ; %endif
 ; GFX1164DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1164DAGISEL-NEXT:    global_store_b64 v[0:1], v[6:7], off
 ; GFX1164DAGISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -2416,12 +3294,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v6
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB6_3
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164GISEL-NEXT:    s_mov_b32 s0, 0
 ; GFX1164GISEL-NEXT:    s_mov_b32 s1, 0x7ff80000
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[4:5], exec
-; GFX1164GISEL-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s8, s[4:5]
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s6, v2, s8
@@ -2435,15 +3313,15 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1164GISEL-NEXT:    v_readfirstlane_b32 s1, v5
 ; GFX1164GISEL-NEXT:    ; implicit-def: $vgpr4
 ; GFX1164GISEL-NEXT:    ; implicit-def: $vgpr5
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB6_2
-; GFX1164GISEL-NEXT:  .LBB6_3: ; %Flow
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB7_2
+; GFX1164GISEL-NEXT:  .LBB7_3: ; %Flow
 ; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[2:3], s[2:3]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB7_6
 ; GFX1164GISEL-NEXT:  ; %bb.4: ; %if
 ; GFX1164GISEL-NEXT:    s_mov_b32 s0, 0
 ; GFX1164GISEL-NEXT:    s_mov_b32 s1, 0x7ff80000
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[4:5], exec
-; GFX1164GISEL-NEXT:  .LBB6_5: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB7_5: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s8, s[4:5]
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s6, v4, s8
@@ -2455,8 +3333,8 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1164GISEL-NEXT:    v_readfirstlane_b32 s0, v2
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164GISEL-NEXT:    v_readfirstlane_b32 s1, v3
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB6_5
-; GFX1164GISEL-NEXT:  .LBB6_6: ; %endif
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB7_5
+; GFX1164GISEL-NEXT:  .LBB7_6: ; %endif
 ; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -2472,12 +3350,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1132DAGISEL-NEXT:    ; implicit-def: $vgpr6_vgpr7
 ; GFX1132DAGISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1132DAGISEL-NEXT:    s_xor_b32 s2, exec_lo, s0
-; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB6_4
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB7_4
 ; GFX1132DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s0, 0
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s1, 0x7ff80000
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
-; GFX1132DAGISEL-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s6, s3
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s4, v2, s6
@@ -2489,19 +3367,19 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1132DAGISEL-NEXT:    v_readfirstlane_b32 s0, v4
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132DAGISEL-NEXT:    v_readfirstlane_b32 s1, v5
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_2
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_2
 ; GFX1132DAGISEL-NEXT:  ; %bb.3:
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
 ; GFX1132DAGISEL-NEXT:    ; implicit-def: $vgpr4
 ; GFX1132DAGISEL-NEXT:    ; implicit-def: $vgpr5
-; GFX1132DAGISEL-NEXT:  .LBB6_4: ; %Flow
+; GFX1132DAGISEL-NEXT:  .LBB7_4: ; %Flow
 ; GFX1132DAGISEL-NEXT:    s_and_not1_saveexec_b32 s2, s2
-; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB6_8
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB7_8
 ; GFX1132DAGISEL-NEXT:  ; %bb.5: ; %if
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s0, 0
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s1, 0x7ff80000
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
-; GFX1132DAGISEL-NEXT:  .LBB6_6: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB7_6: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s6, s3
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s4, v4, s6
@@ -2513,10 +3391,10 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1132DAGISEL-NEXT:    v_readfirstlane_b32 s0, v2
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132DAGISEL-NEXT:    v_readfirstlane_b32 s1, v3
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_6
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_6
 ; GFX1132DAGISEL-NEXT:  ; %bb.7:
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
-; GFX1132DAGISEL-NEXT:  .LBB6_8: ; %endif
+; GFX1132DAGISEL-NEXT:  .LBB7_8: ; %endif
 ; GFX1132DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1132DAGISEL-NEXT:    global_store_b64 v[0:1], v[6:7], off
 ; GFX1132DAGISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -2530,12 +3408,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v6
 ; GFX1132GISEL-NEXT:    s_xor_b32 s2, exec_lo, s2
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB6_3
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132GISEL-NEXT:    s_mov_b32 s0, 0
 ; GFX1132GISEL-NEXT:    s_mov_b32 s1, 0x7ff80000
 ; GFX1132GISEL-NEXT:    s_mov_b32 s3, exec_lo
-; GFX1132GISEL-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s6, s3
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s4, v2, s6
@@ -2549,15 +3427,15 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1132GISEL-NEXT:    v_readfirstlane_b32 s1, v5
 ; GFX1132GISEL-NEXT:    ; implicit-def: $vgpr4
 ; GFX1132GISEL-NEXT:    ; implicit-def: $vgpr5
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB6_2
-; GFX1132GISEL-NEXT:  .LBB6_3: ; %Flow
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB7_2
+; GFX1132GISEL-NEXT:  .LBB7_3: ; %Flow
 ; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s2, s2
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB7_6
 ; GFX1132GISEL-NEXT:  ; %bb.4: ; %if
 ; GFX1132GISEL-NEXT:    s_mov_b32 s0, 0
 ; GFX1132GISEL-NEXT:    s_mov_b32 s1, 0x7ff80000
 ; GFX1132GISEL-NEXT:    s_mov_b32 s3, exec_lo
-; GFX1132GISEL-NEXT:  .LBB6_5: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB7_5: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s6, s3
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s4, v4, s6
@@ -2569,8 +3447,8 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX1132GISEL-NEXT:    v_readfirstlane_b32 s0, v2
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132GISEL-NEXT:    v_readfirstlane_b32 s1, v3
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB6_5
-; GFX1132GISEL-NEXT:  .LBB6_6: ; %endif
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB7_5
+; GFX1132GISEL-NEXT:  .LBB7_6: ; %endif
 ; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1132GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -2590,12 +3468,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX12DAGISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    s_xor_b32 s2, exec_lo, s0
-; GFX12DAGISEL-NEXT:    s_cbranch_execz .LBB6_4
+; GFX12DAGISEL-NEXT:    s_cbranch_execz .LBB7_4
 ; GFX12DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s0, 0
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s1, 0x7ff80000
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
-; GFX12DAGISEL-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    s_ctz_i32_b32 s6, s3
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
@@ -2608,21 +3486,21 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12DAGISEL-NEXT:    v_readfirstlane_b32 s0, v4
 ; GFX12DAGISEL-NEXT:    v_readfirstlane_b32 s1, v5
-; GFX12DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_2
+; GFX12DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_2
 ; GFX12DAGISEL-NEXT:  ; %bb.3:
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12DAGISEL-NEXT:    v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
 ; GFX12DAGISEL-NEXT:    ; implicit-def: $vgpr4
 ; GFX12DAGISEL-NEXT:    ; implicit-def: $vgpr5
-; GFX12DAGISEL-NEXT:  .LBB6_4: ; %Flow
+; GFX12DAGISEL-NEXT:  .LBB7_4: ; %Flow
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    s_and_not1_saveexec_b32 s2, s2
-; GFX12DAGISEL-NEXT:    s_cbranch_execz .LBB6_8
+; GFX12DAGISEL-NEXT:    s_cbranch_execz .LBB7_8
 ; GFX12DAGISEL-NEXT:  ; %bb.5: ; %if
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s0, 0
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s1, 0x7ff80000
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
-; GFX12DAGISEL-NEXT:  .LBB6_6: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT:  .LBB7_6: ; =>This Inner Loop Header: Depth=1
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    s_ctz_i32_b32 s6, s3
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
@@ -2635,11 +3513,11 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
 ; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12DAGISEL-NEXT:    v_readfirstlane_b32 s0, v2
 ; GFX12DAGISEL-NEXT:    v_readfirstlane_b32 s1, v3
-; GFX12DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_6
+; GFX12DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_6
 ; GFX12DAGISEL-NEXT:  ; %bb.7:
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12DAGISEL-NEXT:    v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
-; GFX12DAGISEL-NEXT:  .LBB6_8: ; %endif
+; GFX12DAGISEL-NEXT:  .LBB7_8: ; %endif
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX12DAGISEL-NEXT:    global_store_b64 v[0:1], v[6:7], off



More information about the llvm-branch-commits mailing list