[llvm] [AMDGPU] DPP implementations for Wave Reduction (PR #185814)

via llvm-commits llvm-commits at lists.llvm.org
Wed Mar 11 02:07:42 PDT 2026


https://github.com/easyonaadit updated https://github.com/llvm/llvm-project/pull/185814

>From d4d9b5ab5d988ba14c1c6e1bb9bb0f882a5e32e3 Mon Sep 17 00:00:00 2001
From: Aaditya <Aaditya.AlokDeshpande at amd.com>
Date: Tue, 3 Feb 2026 15:14:35 +0530
Subject: [PATCH] [AMDGPU] DPP implementations for Wave Reduction

Adding DPP reduction support for i32 types.
Supported Ops: `umin`, `min`, `umax`, `max`,
`add`, `sub`, `and`, `or`, `xor`.
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  697 ++--
 llvm/lib/Target/AMDGPU/SIInstructions.td      |    2 +-
 .../AMDGPU/amdgpu-cs-chain-fp-nosave.ll       |  293 +-
 .../test/CodeGen/AMDGPU/dynamic_stackalloc.ll | 3451 ++++++++++-------
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll  | 1189 +++++-
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll  |  994 ++++-
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll  | 1002 ++++-
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll  | 1002 ++++-
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll   |  994 ++++-
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll  | 1242 +++++-
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll |  994 ++++-
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll |  994 ++++-
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll  | 1118 +++++-
 llvm/test/CodeGen/AMDGPU/llvm.sponentry.ll    |  199 +-
 14 files changed, 11309 insertions(+), 2862 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 932d6a5841aab..f8693952b145d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4652,7 +4652,7 @@ SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
     SDValue WaveReduction =
         DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
     Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
-                       Size, DAG.getConstant(0, dl, MVT::i32));
+                       Size, DAG.getTargetConstant(0, dl, MVT::i32));
     SDValue ScaledSize = DAG.getNode(
         ISD::SHL, dl, VT, Size,
         DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
@@ -5636,6 +5636,32 @@ static bool isFloatingPointWaveReduceOperation(unsigned Opc) {
          Opc == AMDGPU::V_ADD_F64_e64 || Opc == AMDGPU::V_ADD_F64_pseudo_e64;
 }
 
+static unsigned getDPPOpcForWaveReduction(unsigned Opc,
+                                          const GCNSubtarget &ST) {
+  switch (Opc) {
+  case AMDGPU::S_MIN_U32:
+    return AMDGPU::V_MIN_U32_dpp;
+  case AMDGPU::S_MIN_I32:
+    return AMDGPU::V_MIN_I32_dpp;
+  case AMDGPU::S_MAX_U32:
+    return AMDGPU::V_MAX_U32_dpp;
+  case AMDGPU::S_MAX_I32:
+    return AMDGPU::V_MAX_I32_dpp;
+  case AMDGPU::S_ADD_I32:
+  case AMDGPU::S_SUB_I32:
+    return ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_dpp
+                                   : AMDGPU::V_ADD_CO_U32_dpp;
+  case AMDGPU::S_AND_B32:
+    return AMDGPU::V_AND_B32_dpp;
+  case AMDGPU::S_OR_B32:
+    return AMDGPU::V_OR_B32_dpp;
+  case AMDGPU::S_XOR_B32:
+    return AMDGPU::V_XOR_B32_dpp;
+  default:
+    llvm_unreachable("unhandled lane op");
+  }
+}
+
 static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
                                           MachineBasicBlock &BB,
                                           const GCNSubtarget &ST,
@@ -5649,6 +5675,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
   Register SrcReg = MI.getOperand(1).getReg();
   bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
   Register DstReg = MI.getOperand(0).getReg();
+  assert(MI.getOperand(2).isImm() &&
+         "Stratergy argument for wave reduction must be a constant.");
+  unsigned Stratergy = static_cast<unsigned>(MI.getOperand(2).getImm());
+  enum WAVE_REDUCE_STRATEGY : unsigned { DEFAULT = 0, ITERATIVE = 1, DPP = 2 };
   MachineBasicBlock *RetBB = nullptr;
   if (isSGPR) {
     switch (Opc) {
@@ -5915,267 +5945,450 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
     }
     }
   } else {
-    // TODO: Implement DPP Strategy and switch based on immediate strategy
-    // operand. For now, for all the cases (default, Iterative and DPP we use
-    // iterative approach by default.)
-
-    // To reduce the VGPR using iterative approach, we need to iterate
-    // over all the active lanes. Lowering consists of ComputeLoop,
-    // which iterate over only active lanes. We use copy of EXEC register
-    // as induction variable and every active lane modifies it using bitset0
-    // so that we will get the next active lane for next iteration.
     MachineBasicBlock::iterator I = BB.end();
     Register SrcReg = MI.getOperand(1).getReg();
     bool is32BitOpc = is32bitWaveReduceOperation(Opc);
     bool isFPOp = isFloatingPointWaveReduceOperation(Opc);
-
-    // Create Control flow for loop
-    // Split MI's Machine Basic block into For loop
-    auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
-
     // Create virtual registers required for lowering.
     const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
     const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
-    Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
-    Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
-    Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
-    Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
-    Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
-    Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-    Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
-
+    const TargetRegisterClass *SrcRegClass = MRI.getRegClass(SrcReg);
     bool IsWave32 = ST.isWave32();
     unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
     unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-
-    // Create initial values of induction variable from Exec, Accumulator and
-    // insert branch instr to newly created ComputeBlock
-    BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
-    if (is32BitOpc) {
-      uint32_t IdentityValue = getIdentityValueFor32BitWaveReduction(Opc);
-      BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
-          .addImm(IdentityValue);
-    } else {
-      uint64_t IdentityValue =
-          MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
-              ? 0x0 // +0.0 for double sub reduction
-              : getIdentityValueFor64BitWaveReduction(Opc);
-      BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
-          .addImm(IdentityValue);
-    }
-    // clang-format off
+    if (Stratergy == WAVE_REDUCE_STRATEGY::ITERATIVE ||
+        !ST.hasDPP()) { // If target doesn't support DPP operations, default to
+                        // iterative stratergy
+
+      // To reduce the VGPR using iterative approach, we need to iterate
+      // over all the active lanes. Lowering consists of ComputeLoop,
+      // which iterate over only active lanes. We use copy of EXEC register
+      // as induction variable and every active lane modifies it using bitset0
+      // so that we will get the next active lane for next iteration.
+
+      // Create Control flow for loop
+      // Split MI's Machine Basic block into For loop
+      auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
+
+      Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
+      Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
+      Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
+      Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
+      Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
+      Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+      Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
+
+      // Create initial values of induction variable from Exec, Accumulator and
+      // insert branch instr to newly created ComputeBlock
+      BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
+      if (is32BitOpc) {
+        uint32_t IdentityValue = getIdentityValueFor32BitWaveReduction(Opc);
+        BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
+            .addImm(IdentityValue);
+      } else {
+        uint64_t IdentityValue =
+            MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
+                ? 0x0 // +0.0 for double sub reduction
+                : getIdentityValueFor64BitWaveReduction(Opc);
+        BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO),
+                IdentityValReg)
+            .addImm(IdentityValue);
+      }
+      // clang-format off
     BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
         .addMBB(ComputeLoop);
-    // clang-format on
-
-    // Start constructing ComputeLoop
-    I = ComputeLoop->begin();
-    auto Accumulator =
-        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
-            .addReg(IdentityValReg)
-            .addMBB(&BB);
-    auto ActiveBits =
-        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
-            .addReg(LoopIterator)
-            .addMBB(&BB);
-
-    I = ComputeLoop->end();
-    MachineInstr *NewAccumulator;
-    // Perform the computations
-    unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
-    BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
-        .addReg(ActiveBitsReg);
-    if (is32BitOpc) {
-      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
-              LaneValueReg)
-          .addReg(SrcReg)
-          .addReg(FF1Reg);
-      if (isFPOp) {
-        Register LaneValVreg =
-            MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
-        Register DstVreg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
-        // Get the Lane Value in VGPR to avoid the Constant Bus Restriction
-        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_MOV_B32_e32),
-                LaneValVreg)
-            .addReg(LaneValueReg);
-        BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
-            .addImm(0) // src0 modifier
-            .addReg(Accumulator->getOperand(0).getReg())
-            .addImm(0) // src1 modifier
-            .addReg(LaneValVreg)
-            .addImm(0)  // clamp
-            .addImm(0); // omod
-        NewAccumulator = BuildMI(*ComputeLoop, I, DL,
-                                 TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
-                             .addReg(DstVreg);
+      // clang-format on
+
+      // Start constructing ComputeLoop
+      I = ComputeLoop->begin();
+      auto Accumulator =
+          BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
+              .addReg(IdentityValReg)
+              .addMBB(&BB);
+      auto ActiveBits =
+          BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
+              .addReg(LoopIterator)
+              .addMBB(&BB);
+
+      I = ComputeLoop->end();
+      MachineInstr *NewAccumulator;
+      // Perform the computations
+      unsigned SFFOpc =
+          IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
+      BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
+          .addReg(ActiveBitsReg);
+      if (is32BitOpc) {
+        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
+                LaneValueReg)
+            .addReg(SrcReg)
+            .addReg(FF1Reg);
+        if (isFPOp) {
+          Register LaneValVreg =
+              MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
+          Register DstVreg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
+          // Get the Lane Value in VGPR to avoid the Constant Bus Restriction
+          BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_MOV_B32_e32),
+                  LaneValVreg)
+              .addReg(LaneValueReg);
+          BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
+              .addImm(0) // src0 modifier
+              .addReg(Accumulator->getOperand(0).getReg())
+              .addImm(0) // src1 modifier
+              .addReg(LaneValVreg)
+              .addImm(0)  // clamp
+              .addImm(0); // omod
+          NewAccumulator =
+              BuildMI(*ComputeLoop, I, DL,
+                      TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
+                  .addReg(DstVreg);
+        } else {
+          NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
+                               .addReg(Accumulator->getOperand(0).getReg())
+                               .addReg(LaneValueReg);
+        }
       } else {
-        NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
-                             .addReg(Accumulator->getOperand(0).getReg())
-                             .addReg(LaneValueReg);
-      }
-    } else {
-      Register LaneValueLoReg =
-          MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
-      Register LaneValueHiReg =
-          MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
-      Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
-      const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
-      const TargetRegisterClass *SrcSubRC =
-          TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
-      MachineOperand Op1L = TII->buildExtractSubRegOrImm(
-          MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
-      MachineOperand Op1H = TII->buildExtractSubRegOrImm(
-          MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
-      // lane value input should be in an sgpr
-      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
-              LaneValueLoReg)
-          .add(Op1L)
-          .addReg(FF1Reg);
-      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
-              LaneValueHiReg)
-          .add(Op1H)
-          .addReg(FF1Reg);
-      auto LaneValue = BuildMI(*ComputeLoop, I, DL,
-                               TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
-                           .addReg(LaneValueLoReg)
-                           .addImm(AMDGPU::sub0)
-                           .addReg(LaneValueHiReg)
-                           .addImm(AMDGPU::sub1);
-      switch (Opc) {
-      case AMDGPU::S_OR_B64:
-      case AMDGPU::S_AND_B64:
-      case AMDGPU::S_XOR_B64: {
-        NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
-                             .addReg(Accumulator->getOperand(0).getReg())
-                             .addReg(LaneValue->getOperand(0).getReg())
-                             .setOperandDead(3); // Dead scc
-        break;
-      }
-      case AMDGPU::V_CMP_GT_I64_e64:
-      case AMDGPU::V_CMP_GT_U64_e64:
-      case AMDGPU::V_CMP_LT_I64_e64:
-      case AMDGPU::V_CMP_LT_U64_e64: {
-        Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
-        Register ComparisonResultReg =
-            MRI.createVirtualRegister(WaveMaskRegClass);
-        int SrcIdx =
-            AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src);
-        const TargetRegisterClass *VregClass =
-            TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), SrcIdx));
-        const TargetRegisterClass *VSubRegClass =
-            TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
-        Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
-        MachineOperand SrcReg0Sub0 =
-            TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
-                                         VregClass, AMDGPU::sub0, VSubRegClass);
-        MachineOperand SrcReg0Sub1 =
-            TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
-                                         VregClass, AMDGPU::sub1, VSubRegClass);
-        BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
-                AccumulatorVReg)
-            .add(SrcReg0Sub0)
-            .addImm(AMDGPU::sub0)
-            .add(SrcReg0Sub1)
-            .addImm(AMDGPU::sub1);
-        BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
-            .addReg(LaneValue->getOperand(0).getReg())
-            .addReg(AccumulatorVReg);
-
-        unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
-        BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
-            .addReg(LaneMaskReg)
-            .addReg(ActiveBitsReg);
-
-        NewAccumulator = BuildMI(*ComputeLoop, I, DL,
-                                 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
-                             .addReg(LaneValue->getOperand(0).getReg())
-                             .addReg(Accumulator->getOperand(0).getReg());
-        break;
-      }
-      case AMDGPU::V_MIN_F64_e64:
-      case AMDGPU::V_MIN_NUM_F64_e64:
-      case AMDGPU::V_MAX_F64_e64:
-      case AMDGPU::V_MAX_NUM_F64_e64:
-      case AMDGPU::V_ADD_F64_e64:
-      case AMDGPU::V_ADD_F64_pseudo_e64: {
-        int SrcIdx =
-            AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src);
-        const TargetRegisterClass *VregRC =
-            TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), SrcIdx));
-        const TargetRegisterClass *VregSubRC =
-            TRI->getSubRegisterClass(VregRC, AMDGPU::sub0);
-        Register AccumulatorVReg = MRI.createVirtualRegister(VregRC);
-        Register DstVreg = MRI.createVirtualRegister(VregRC);
-        Register LaneValLo =
+        Register LaneValueLoReg =
             MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
-        Register LaneValHi =
+        Register LaneValueHiReg =
             MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
-        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::COPY), AccumulatorVReg)
-            .addReg(Accumulator->getOperand(0).getReg());
-        unsigned Modifier =
-            MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
-                ? SISrcMods::NEG
-                : SISrcMods::NONE;
-        auto DstVregInst = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
-                               .addImm(Modifier) // src0 modifiers
+        Register LaneValReg =
+            MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+        const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+        const TargetRegisterClass *SrcSubRC =
+            TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
+        MachineOperand Op1L = TII->buildExtractSubRegOrImm(
+            MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
+        MachineOperand Op1H = TII->buildExtractSubRegOrImm(
+            MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
+        // lane value input should be in an sgpr
+        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
+                LaneValueLoReg)
+            .add(Op1L)
+            .addReg(FF1Reg);
+        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
+                LaneValueHiReg)
+            .add(Op1H)
+            .addReg(FF1Reg);
+        auto LaneValue =
+            BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
+                    LaneValReg)
+                .addReg(LaneValueLoReg)
+                .addImm(AMDGPU::sub0)
+                .addReg(LaneValueHiReg)
+                .addImm(AMDGPU::sub1);
+        switch (Opc) {
+        case AMDGPU::S_OR_B64:
+        case AMDGPU::S_AND_B64:
+        case AMDGPU::S_XOR_B64: {
+          NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
+                               .addReg(Accumulator->getOperand(0).getReg())
                                .addReg(LaneValue->getOperand(0).getReg())
-                               .addImm(SISrcMods::NONE) // src1 modifiers
-                               .addReg(AccumulatorVReg)
-                               .addImm(SISrcMods::NONE)  // clamp
-                               .addImm(SISrcMods::NONE); // omod
-        auto ReadLaneLo =
-            BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
-                    LaneValLo);
-        auto ReadLaneHi =
-            BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
-                    LaneValHi);
-        MachineBasicBlock::iterator Iters = *ReadLaneLo;
-        MachineOperand Op1L =
-            TII->buildExtractSubRegOrImm(Iters, MRI, DstVregInst->getOperand(0),
-                                         VregRC, AMDGPU::sub0, VregSubRC);
-        MachineOperand Op1H =
-            TII->buildExtractSubRegOrImm(Iters, MRI, DstVregInst->getOperand(0),
-                                         VregRC, AMDGPU::sub1, VregSubRC);
-        ReadLaneLo.add(Op1L);
-        ReadLaneHi.add(Op1H);
-        NewAccumulator = BuildMI(*ComputeLoop, I, DL,
-                                 TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
-                             .addReg(LaneValLo)
-                             .addImm(AMDGPU::sub0)
-                             .addReg(LaneValHi)
-                             .addImm(AMDGPU::sub1);
-        break;
+                               .setOperandDead(3); // Dead scc
+          break;
+        }
+        case AMDGPU::V_CMP_GT_I64_e64:
+        case AMDGPU::V_CMP_GT_U64_e64:
+        case AMDGPU::V_CMP_LT_I64_e64:
+        case AMDGPU::V_CMP_LT_U64_e64: {
+          Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
+          Register ComparisonResultReg =
+              MRI.createVirtualRegister(WaveMaskRegClass);
+          int SrcIdx =
+              AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src);
+          const TargetRegisterClass *VregClass =
+              TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), SrcIdx));
+          const TargetRegisterClass *VSubRegClass =
+              TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
+          Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
+          MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
+              MI, MRI, Accumulator->getOperand(0), VregClass, AMDGPU::sub0,
+              VSubRegClass);
+          MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
+              MI, MRI, Accumulator->getOperand(0), VregClass, AMDGPU::sub1,
+              VSubRegClass);
+          BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
+                  AccumulatorVReg)
+              .add(SrcReg0Sub0)
+              .addImm(AMDGPU::sub0)
+              .add(SrcReg0Sub1)
+              .addImm(AMDGPU::sub1);
+          BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
+              .addReg(LaneValue->getOperand(0).getReg())
+              .addReg(AccumulatorVReg);
+
+          unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
+          BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
+              .addReg(LaneMaskReg)
+              .addReg(ActiveBitsReg);
+
+          NewAccumulator = BuildMI(*ComputeLoop, I, DL,
+                                   TII->get(AMDGPU::S_CSELECT_B64), DstReg)
+                               .addReg(LaneValue->getOperand(0).getReg())
+                               .addReg(Accumulator->getOperand(0).getReg());
+          break;
+        }
+        case AMDGPU::V_MIN_F64_e64:
+        case AMDGPU::V_MIN_NUM_F64_e64:
+        case AMDGPU::V_MAX_F64_e64:
+        case AMDGPU::V_MAX_NUM_F64_e64:
+        case AMDGPU::V_ADD_F64_e64:
+        case AMDGPU::V_ADD_F64_pseudo_e64: {
+          int SrcIdx =
+              AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src);
+          const TargetRegisterClass *VregRC =
+              TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), SrcIdx));
+          const TargetRegisterClass *VregSubRC =
+              TRI->getSubRegisterClass(VregRC, AMDGPU::sub0);
+          Register AccumulatorVReg = MRI.createVirtualRegister(VregRC);
+          Register DstVreg = MRI.createVirtualRegister(VregRC);
+          Register LaneValLo =
+              MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+          Register LaneValHi =
+              MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+          BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::COPY), AccumulatorVReg)
+              .addReg(Accumulator->getOperand(0).getReg());
+          unsigned Modifier =
+              MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
+                  ? SISrcMods::NEG
+                  : SISrcMods::NONE;
+          auto DstVregInst =
+              BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
+                  .addImm(Modifier) // src0 modifiers
+                  .addReg(LaneValue->getOperand(0).getReg())
+                  .addImm(SISrcMods::NONE) // src1 modifiers
+                  .addReg(AccumulatorVReg)
+                  .addImm(SISrcMods::NONE)  // clamp
+                  .addImm(SISrcMods::NONE); // omod
+          auto ReadLaneLo =
+              BuildMI(*ComputeLoop, I, DL,
+                      TII->get(AMDGPU::V_READFIRSTLANE_B32), LaneValLo);
+          auto ReadLaneHi =
+              BuildMI(*ComputeLoop, I, DL,
+                      TII->get(AMDGPU::V_READFIRSTLANE_B32), LaneValHi);
+          MachineBasicBlock::iterator Iters = *ReadLaneLo;
+          MachineOperand Op1L = TII->buildExtractSubRegOrImm(
+              Iters, MRI, DstVregInst->getOperand(0), VregRC, AMDGPU::sub0,
+              VregSubRC);
+          MachineOperand Op1H = TII->buildExtractSubRegOrImm(
+              Iters, MRI, DstVregInst->getOperand(0), VregRC, AMDGPU::sub1,
+              VregSubRC);
+          ReadLaneLo.add(Op1L);
+          ReadLaneHi.add(Op1H);
+          NewAccumulator = BuildMI(*ComputeLoop, I, DL,
+                                   TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
+                               .addReg(LaneValLo)
+                               .addImm(AMDGPU::sub0)
+                               .addReg(LaneValHi)
+                               .addImm(AMDGPU::sub1);
+          break;
+        }
+        case AMDGPU::S_ADD_U64_PSEUDO:
+        case AMDGPU::S_SUB_U64_PSEUDO: {
+          NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
+                               .addReg(Accumulator->getOperand(0).getReg())
+                               .addReg(LaneValue->getOperand(0).getReg());
+          ComputeLoop =
+              Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
+          break;
+        }
+        }
       }
-      case AMDGPU::S_ADD_U64_PSEUDO:
-      case AMDGPU::S_SUB_U64_PSEUDO: {
-        NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
-                             .addReg(Accumulator->getOperand(0).getReg())
-                             .addReg(LaneValue->getOperand(0).getReg());
-        ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
-        break;
+      // Manipulate the iterator to get the next active lane
+      unsigned BITSETOpc =
+          IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
+      BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
+          .addReg(FF1Reg)
+          .addReg(ActiveBitsReg);
+
+      // Add phi nodes
+      Accumulator.addReg(DstReg).addMBB(ComputeLoop);
+      ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
+
+      // Creating branching
+      unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
+      BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
+          .addReg(NewActiveBitsReg)
+          .addImm(0);
+      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
+          .addMBB(ComputeLoop);
+
+      RetBB = ComputeEnd;
+    } else {
+      assert(ST.hasDPP() && "Sub Target does not support DPP Operations");
+
+      Register SrcWithIdentity = MRI.createVirtualRegister(SrcRegClass);
+      Register IdentityVGPR = MRI.createVirtualRegister(SrcRegClass);
+      Register IdentitySGPR = MRI.createVirtualRegister(DstRegClass);
+      Register DPPRowShr1 = MRI.createVirtualRegister(SrcRegClass);
+      Register DPPRowShr2 = MRI.createVirtualRegister(SrcRegClass);
+      Register DPPRowShr4 = MRI.createVirtualRegister(SrcRegClass);
+      Register DPPRowShr8 = MRI.createVirtualRegister(SrcRegClass);
+      Register RowBcast15 = MRI.createVirtualRegister(SrcRegClass);
+      Register ReducedValSGPR = MRI.createVirtualRegister(DstRegClass);
+      Register NegatedReducedVal = MRI.createVirtualRegister(DstRegClass);
+      Register RowBcast31 = MRI.createVirtualRegister(SrcRegClass);
+      Register FinalDPPResult;
+      Register UndefExec = MRI.createVirtualRegister(
+          IsWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
+                   : &AMDGPU::SReg_64_XEXECRegClass);
+      BuildMI(BB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), UndefExec);
+
+      uint32_t IdentityValue = getIdentityValueFor32BitWaveReduction(Opc);
+      BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), IdentitySGPR)
+          .addImm(IdentityValue);
+      BuildMI(BB, MI, DL, TII->get(AMDGPU::COPY), IdentityVGPR)
+          .addReg(IdentitySGPR);
+
+      // Set inactive lanes to the identity value.
+      BuildMI(BB, MI, DL, TII->get(AMDGPU::V_SET_INACTIVE_B32), SrcWithIdentity)
+          .addImm(0)            // src0 modifiers
+          .addReg(SrcReg)       // src0
+          .addImm(0)            // src1 modifiers
+          .addReg(IdentityVGPR) // identity value for inactive lanes
+          .addReg(UndefExec);   // bool i1
+
+      unsigned DPPOpc = getDPPOpcForWaveReduction(Opc, ST);
+      auto GetVALUOpc = [](unsigned Opc) -> unsigned {
+        switch (Opc) {
+        case AMDGPU::S_MIN_U32:
+          return AMDGPU::V_MIN_U32_e64;
+        case AMDGPU::S_MIN_I32:
+          return AMDGPU::V_MIN_I32_e64;
+        case AMDGPU::S_MAX_U32:
+          return AMDGPU::V_MAX_U32_e64;
+        case AMDGPU::S_MAX_I32:
+          return AMDGPU::V_MAX_I32_e64;
+        case AMDGPU::S_ADD_I32:
+          return AMDGPU::V_ADD_I32_e64;
+        case AMDGPU::S_SUB_I32:
+          return AMDGPU::V_SUB_I32_e64;
+        case AMDGPU::S_AND_B32:
+          return AMDGPU::V_AND_B32_e64;
+        case AMDGPU::S_OR_B32:
+          return AMDGPU::V_OR_B32_e64;
+        case AMDGPU::S_XOR_B32:
+          return AMDGPU::V_XOR_B32_e64;
+        default:
+          return Opc;
+        }
+      };
+      auto BuildDPPMachineInstr = [&](Register Dst, Register Src,
+                                      unsigned DPPCtrl, unsigned RowMask,
+                                      unsigned BankMask, unsigned BoundCtrl) {
+        BuildMI(BB, MI, DL, TII->get(DPPOpc), Dst)
+            .addReg(Src)        // old
+            .addReg(Src)        // src0
+            .addReg(Src)        // src1
+            .addImm(DPPCtrl)    // dpp-ctrl
+            .addImm(RowMask)    // row-mask
+            .addImm(BankMask)   // bank-mask
+            .addImm(BoundCtrl); // bound-control
+      };
+      // DPP reduction
+      BuildDPPMachineInstr(DPPRowShr1, SrcWithIdentity,
+                           AMDGPU::DPP::ROW_SHR_FIRST, 0xf, 0xf, 0);
+
+      BuildDPPMachineInstr(DPPRowShr2, DPPRowShr1,
+                           (AMDGPU::DPP::ROW_SHR_FIRST + 1), 0xf, 0xf, 0);
+
+      BuildDPPMachineInstr(DPPRowShr4, DPPRowShr2,
+                           (AMDGPU::DPP::ROW_SHR_FIRST + 3), 0xf, 0xf, 0);
+
+      BuildDPPMachineInstr(DPPRowShr8, DPPRowShr4,
+                           (AMDGPU::DPP::ROW_SHR_FIRST + 7), 0xf, 0xf, 0);
+
+      if (ST.hasDPPBroadcasts()) {
+        BuildDPPMachineInstr(RowBcast15, DPPRowShr8, AMDGPU::DPP::BCAST15, 0xf,
+                             0xf, 0);
+      } else {
+        // magic constant: 0x1E0
+        // To Set BIT_MODE : bit 15 = 0
+        // XOR mask : bit [14:10] = 0
+        // OR mask : bit [9:5] = 15
+        // AND mask : bit [4:0] = 0
+        Register SwizzledValue =
+            MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::DS_SWIZZLE_B32), SwizzledValue)
+            .addReg(DPPRowShr8) // addr
+            .addImm(0x1E0)      // swizzle offset (i16)
+            .addImm(0x0);       // gds (i1)
+        auto ClampInstr =
+            BuildMI(BB, MI, DL, TII->get(GetVALUOpc(Opc)), RowBcast15)
+                .addReg(DPPRowShr8)
+                .addReg(SwizzledValue);
+        if (TII->hasIntClamp(*ClampInstr) || TII->hasFPClamp(*ClampInstr))
+          ClampInstr.addImm(0);
       }
+      FinalDPPResult = RowBcast15;
+      if (!IsWave32) {
+        if (ST.hasDPPBroadcasts()) {
+          BuildDPPMachineInstr(RowBcast31, RowBcast15, AMDGPU::DPP::BCAST31,
+                               0xf, 0xf, 0);
+        } else {
+          Register ShiftedThreadID =
+              MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+          Register PermuteByteOffset =
+              MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+          Register PermutedValue =
+              MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+          Register Lane32Offset = MRI.createVirtualRegister(DstRegClass);
+          Register WordSizeConst = MRI.createVirtualRegister(DstRegClass);
+          Register ThreadIDRegLo =
+              MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+          Register ThreadIDReg =
+              MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+          // Get the thread ID.
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
+                  ThreadIDRegLo)
+              .addImm(-1)
+              .addImm(0);
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
+                  ThreadIDReg)
+              .addImm(-1)
+              .addReg(ThreadIDRegLo);
+          // shift each lane over by 32 positions, so value in 31st lane is
+          // present in 63rd lane.
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), Lane32Offset)
+              .addImm(0x20);
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), ShiftedThreadID)
+              .addReg(ThreadIDReg)
+              .addReg(Lane32Offset)
+              .addImm(0); // clamp
+          // multiply by reg size.
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), WordSizeConst)
+              .addImm(0x4);
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MUL_LO_U32_e64),
+                  PermuteByteOffset)
+              .addReg(WordSizeConst)
+              .addReg(ShiftedThreadID);
+          // Permute the lanes
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::DS_PERMUTE_B32), PermutedValue)
+              .addReg(PermuteByteOffset) // addr
+              .addReg(RowBcast15)        // data
+              .addImm(0);                // offset
+          auto ClampInstr =
+              BuildMI(BB, MI, DL, TII->get(GetVALUOpc(Opc)), RowBcast31)
+                  .addReg(DPPRowShr8)
+                  .addReg(PermutedValue);
+          if (TII->hasIntClamp(*ClampInstr) || TII->hasFPClamp(*ClampInstr))
+            ClampInstr.addImm(0);
+        }
+        FinalDPPResult = RowBcast31;
       }
+      // The final reduced value is in the last lane.
+      BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), ReducedValSGPR)
+          .addReg(FinalDPPResult)
+          .addImm(IsWave32 ? 31 : 63);
+      if (Opc == AMDGPU::S_SUB_I32)
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedReducedVal)
+            .addImm(0)
+            .addReg(ReducedValSGPR);
+      // Mark the final result as a whole-wave-mode calculation.
+      BuildMI(BB, MI, DL, TII->get(AMDGPU::STRICT_WWM), DstReg)
+          .addReg(Opc == AMDGPU::S_SUB_I32 ? NegatedReducedVal
+                                           : ReducedValSGPR);
+      RetBB = &BB;
     }
-    // Manipulate the iterator to get the next active lane
-    unsigned BITSETOpc =
-        IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
-    BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
-        .addReg(FF1Reg)
-        .addReg(ActiveBitsReg);
-
-    // Add phi nodes
-    Accumulator.addReg(DstReg).addMBB(ComputeLoop);
-    ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
-
-    // Creating branching
-    unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
-    BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
-        .addReg(NewActiveBitsReg)
-        .addImm(0);
-    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
-        .addMBB(ComputeLoop);
-
-    RetBB = ComputeEnd;
   }
   MI.eraseFromParent();
   return RetBB;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 80bc12762425c..c789a0742b5a2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -354,7 +354,7 @@ multiclass
   let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, UseNamedOperandTable = 1, Uses = [EXEC] in {
     def !toupper(Op) #"_PSEUDO_" #DataType
         : VPseudoInstSI<(outs RetReg : $sdst),
-                        (ins Reg : $src, VSrc_b32 : $strategy),
+                        (ins Reg : $src, i32imm : $strategy),
                         [(set ty : $sdst, (!cast<AMDGPUWaveReduce>("int_amdgcn_wave_reduce_" #Op) ty : $src, i32 : $strategy))]> {}
   }
 }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll
index 236fcc95f223f..d0aee6fe13135 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll
@@ -133,9 +133,9 @@ define amdgpu_cs_chain void @test_alloca_var_uniform(i32 inreg %count) {
 ; GFX942-NEXT:    s_mov_b32 s33, s32
 ; GFX942-NEXT:    s_add_i32 s32, s32, 16
 ; GFX942-NEXT:    s_and_b32 s0, s0, -16
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_lshl_b32 s0, s0, 6
 ; GFX942-NEXT:    s_mov_b32 s1, s32
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_add_i32 s32, s1, s0
 ; GFX942-NEXT:    scratch_store_dword off, v0, s1
 ; GFX942-NEXT:    s_endpgm
@@ -152,54 +152,74 @@ define amdgpu_cs_chain void @test_alloca_var(i32 %count) {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshl_add_u32 v0, v8, 2, 15
-; GFX12-NEXT:    s_mov_b64 s[0:1], exec
-; GFX12-NEXT:    s_mov_b32 s2, 0
+; GFX12-NEXT:    v_lshl_add_u32 v3, v8, 2, 15
 ; GFX12-NEXT:    s_mov_b32 s33, s32
 ; GFX12-NEXT:    s_add_co_i32 s32, s32, 16
-; GFX12-NEXT:    v_and_b32_e32 v1, -16, v0
-; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT:    s_ctz_i32_b64 s3, s[0:1]
-; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT:    v_readlane_b32 s4, v1, s3
-; GFX12-NEXT:    s_bitset0_b64 s[0:1], s3
-; GFX12-NEXT:    s_max_u32 s2, s2, s4
-; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX12-NEXT:    s_cbranch_scc1 .LBB3_1
-; GFX12-NEXT:  ; %bb.2:
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_and_b32_e32 v3, -16, v3
+; GFX12-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, v3, s[0:1]
+; GFX12-NEXT:    v_mbcnt_lo_u32_b32 v2, -1, 0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX12-NEXT:    v_mbcnt_hi_u32_b32 v2, -1, v2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX12-NEXT:    v_add_nc_u32_e32 v2, 32, v2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX12-NEXT:    v_mul_lo_u32 v2, 4, v2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX12-NEXT:    ds_swizzle_b32 v1, v0 offset:swizzle(BROADCAST,32,15)
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    v_max_u32_e32 v1, v0, v1
+; GFX12-NEXT:    ds_permute_b32 v1, v2, v1
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    v_max_u32_e32 v0, v0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_readlane_b32 s2, v0, 63
+; GFX12-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX12-NEXT:    s_mov_b32 s0, s32
+; GFX12-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT:    v_lshl_add_u32 v1, s2, 6, s0
-; GFX12-NEXT:    scratch_store_b32 off, v0, s0
-; GFX12-NEXT:    v_readfirstlane_b32 s32, v1
+; GFX12-NEXT:    v_lshl_add_u32 v3, s2, 6, s0
+; GFX12-NEXT:    scratch_store_b32 off, v4, s0
+; GFX12-NEXT:    v_readfirstlane_b32 s32, v3
 ; GFX12-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: test_alloca_var:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_lshl_add_u32 v0, v8, 2, 15
-; GFX942-NEXT:    v_and_b32_e32 v1, -16, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    s_mov_b64 s[0:1], exec
-; GFX942-NEXT:    s_mov_b32 s2, 0
+; GFX942-NEXT:    v_lshl_add_u32 v1, v8, 2, 15
 ; GFX942-NEXT:    s_mov_b32 s33, s32
 ; GFX942-NEXT:    s_add_i32 s32, s32, 16
-; GFX942-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT:    s_ff1_i32_b64 s3, s[0:1]
-; GFX942-NEXT:    v_readlane_b32 s4, v1, s3
-; GFX942-NEXT:    s_bitset0_b64 s[0:1], s3
-; GFX942-NEXT:    s_max_u32 s2, s2, s4
-; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX942-NEXT:    s_cbranch_scc1 .LBB3_1
-; GFX942-NEXT:  ; %bb.2:
+; GFX942-NEXT:    v_and_b32_e32 v1, -16, v1
+; GFX942-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942-NEXT:    v_cndmask_b32_e64 v0, 0, v1, s[0:1]
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_max_u32_dpp v0, v0, v0 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_max_u32_dpp v0, v0, v0 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_readlane_b32 s2, v0, 63
+; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    s_mov_b32 s0, s32
 ; GFX942-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX942-NEXT:    v_lshl_add_u32 v1, s2, 6, v1
-; GFX942-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_readfirstlane_b32 s32, v1
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    scratch_store_dword off, v1, s0
 ; GFX942-NEXT:    s_endpgm
   %v = alloca i32, i32 %count, align 4, addrspace(5)
   store i32 0, ptr addrspace(5) %v, align 4
@@ -302,8 +322,8 @@ define amdgpu_cs_chain void @test_alloca_and_call_var_uniform(i32 inreg %count)
 ; GFX942-NEXT:    s_add_u32 s0, s0, foo at gotpcrel32@lo+4
 ; GFX942-NEXT:    s_addc_u32 s1, s1, foo at gotpcrel32@hi+12
 ; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_mov_b32 s3, s32
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_add_i32 s32, s3, s2
 ; GFX942-NEXT:    scratch_store_dword off, v0, s3
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
@@ -323,35 +343,50 @@ define amdgpu_cs_chain void @test_alloca_and_call_var(i32 %count) {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshl_add_u32 v0, v8, 2, 15
-; GFX12-NEXT:    s_mov_b64 s[0:1], exec
-; GFX12-NEXT:    s_mov_b32 s2, 0
+; GFX12-NEXT:    v_lshl_add_u32 v3, v8, 2, 15
 ; GFX12-NEXT:    s_mov_b32 s33, s32
 ; GFX12-NEXT:    s_add_co_i32 s32, s32, 16
-; GFX12-NEXT:    v_and_b32_e32 v1, -16, v0
-; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_and_b32_e32 v3, -16, v3
+; GFX12-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT:    s_ctz_i32_b64 s3, s[0:1]
+; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, v3, s[0:1]
+; GFX12-NEXT:    v_mbcnt_lo_u32_b32 v2, -1, 0
+; GFX12-NEXT:    s_getpc_b64 s[2:3]
 ; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT:    v_readlane_b32 s4, v1, s3
-; GFX12-NEXT:    s_bitset0_b64 s[0:1], s3
-; GFX12-NEXT:    s_max_u32 s2, s2, s4
+; GFX12-NEXT:    s_sext_i32_i16 s3, s3
+; GFX12-NEXT:    s_add_co_u32 s2, s2, foo at gotpcrel32@lo+12
 ; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX12-NEXT:    s_cbranch_scc1 .LBB6_1
-; GFX12-NEXT:  ; %bb.2:
-; GFX12-NEXT:    s_getpc_b64 s[0:1]
+; GFX12-NEXT:    s_add_co_ci_u32 s3, s3, foo at gotpcrel32@hi+24
+; GFX12-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX12-NEXT:    v_mbcnt_hi_u32_b32 v2, -1, v2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX12-NEXT:    v_add_nc_u32_e32 v2, 32, v2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX12-NEXT:    v_mul_lo_u32 v2, 4, v2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX12-NEXT:    ds_swizzle_b32 v1, v0 offset:swizzle(BROADCAST,32,15)
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    v_max_u32_e32 v1, v0, v1
+; GFX12-NEXT:    ds_permute_b32 v1, v2, v1
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    v_max_u32_e32 v0, v0, v1
+; GFX12-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX12-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_readlane_b32 s4, v0, 63
 ; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT:    s_sext_i32_i16 s1, s1
-; GFX12-NEXT:    s_add_co_u32 s0, s0, foo at gotpcrel32@lo+12
+; GFX12-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX12-NEXT:    s_mov_b32 s2, s32
+; GFX12-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, foo at gotpcrel32@hi+24
-; GFX12-NEXT:    s_mov_b32 s3, s32
-; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX12-NEXT:    v_lshl_add_u32 v1, s2, 6, s3
-; GFX12-NEXT:    scratch_store_b32 off, v0, s3
-; GFX12-NEXT:    v_readfirstlane_b32 s32, v1
+; GFX12-NEXT:    v_lshl_add_u32 v3, s4, 6, s2
+; GFX12-NEXT:    scratch_store_b32 off, v4, s2
+; GFX12-NEXT:    v_readfirstlane_b32 s32, v3
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-NEXT:    s_swappc_b64 s[30:31], s[0:1]
@@ -360,21 +395,27 @@ define amdgpu_cs_chain void @test_alloca_and_call_var(i32 %count) {
 ; GFX942-LABEL: test_alloca_and_call_var:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_lshl_add_u32 v0, v8, 2, 15
-; GFX942-NEXT:    v_and_b32_e32 v1, -16, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    s_mov_b64 s[0:1], exec
-; GFX942-NEXT:    s_mov_b32 s2, 0
+; GFX942-NEXT:    v_lshl_add_u32 v1, v8, 2, 15
 ; GFX942-NEXT:    s_mov_b32 s33, s32
 ; GFX942-NEXT:    s_add_i32 s32, s32, 16
-; GFX942-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT:    s_ff1_i32_b64 s3, s[0:1]
-; GFX942-NEXT:    v_readlane_b32 s4, v1, s3
-; GFX942-NEXT:    s_bitset0_b64 s[0:1], s3
-; GFX942-NEXT:    s_max_u32 s2, s2, s4
-; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX942-NEXT:    s_cbranch_scc1 .LBB6_1
-; GFX942-NEXT:  ; %bb.2:
+; GFX942-NEXT:    v_and_b32_e32 v1, -16, v1
+; GFX942-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942-NEXT:    v_cndmask_b32_e64 v0, 0, v1, s[0:1]
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_max_u32_dpp v0, v0, v0 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_max_u32_dpp v0, v0, v0 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_readlane_b32 s2, v0, 63
+; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    s_getpc_b64 s[0:1]
 ; GFX942-NEXT:    s_add_u32 s0, s0, foo at gotpcrel32@lo+4
 ; GFX942-NEXT:    s_addc_u32 s1, s1, foo at gotpcrel32@hi+12
@@ -382,8 +423,10 @@ define amdgpu_cs_chain void @test_alloca_and_call_var(i32 %count) {
 ; GFX942-NEXT:    s_mov_b32 s3, s32
 ; GFX942-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX942-NEXT:    v_lshl_add_u32 v1, s2, 6, v1
-; GFX942-NEXT:    scratch_store_dword off, v0, s3
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_readfirstlane_b32 s32, v1
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    scratch_store_dword off, v1, s3
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX942-NEXT:    s_endpgm
@@ -467,13 +510,13 @@ define amdgpu_cs_chain void @test_call_and_alloca_var_uniform(i32 inreg %count)
 ; GFX12-NEXT:    s_mov_b32 s4, s32
 ; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-NEXT:    s_lshl_b32 s0, s0, 6
-; GFX12-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-NEXT:    s_add_co_i32 s32, s4, s0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-; GFX12-NEXT:    scratch_store_b32 off, v40, s4
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    scratch_store_b32 off, v0, s4
 ; GFX12-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: test_call_and_alloca_var_uniform:
@@ -489,11 +532,11 @@ define amdgpu_cs_chain void @test_call_and_alloca_var_uniform(i32 inreg %count)
 ; GFX942-NEXT:    s_addc_u32 s1, s1, foo at gotpcrel32@hi+12
 ; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX942-NEXT:    s_mov_b32 s4, s32
-; GFX942-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX942-NEXT:    s_add_i32 s32, s4, s2
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX942-NEXT:    scratch_store_dword off, v40, s4
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    scratch_store_dword off, v0, s4
 ; GFX942-NEXT:    s_endpgm
   %v = alloca i32, i32 %count, align 4, addrspace(5)
   call amdgpu_gfx void @foo()
@@ -509,71 +552,93 @@ define amdgpu_cs_chain void @test_call_and_alloca_var(i32 %count) {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshl_add_u32 v0, v8, 2, 15
-; GFX12-NEXT:    v_mov_b32_e32 v40, 0
-; GFX12-NEXT:    s_mov_b64 s[0:1], exec
-; GFX12-NEXT:    s_mov_b32 s2, 0
+; GFX12-NEXT:    v_lshl_add_u32 v3, v8, 2, 15
 ; GFX12-NEXT:    s_mov_b32 s33, s32
-; GFX12-NEXT:    v_and_b32_e32 v0, -16, v0
 ; GFX12-NEXT:    s_add_co_i32 s32, s32, 16
-; GFX12-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_and_b32_e32 v3, -16, v3
+; GFX12-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT:    s_ctz_i32_b64 s3, s[0:1]
+; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, v3, s[0:1]
+; GFX12-NEXT:    v_mbcnt_lo_u32_b32 v2, -1, 0
+; GFX12-NEXT:    s_getpc_b64 s[2:3]
 ; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT:    v_readlane_b32 s4, v0, s3
-; GFX12-NEXT:    s_bitset0_b64 s[0:1], s3
-; GFX12-NEXT:    s_max_u32 s2, s2, s4
+; GFX12-NEXT:    s_sext_i32_i16 s3, s3
+; GFX12-NEXT:    s_add_co_u32 s2, s2, foo at gotpcrel32@lo+12
 ; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX12-NEXT:    s_cbranch_scc1 .LBB9_1
-; GFX12-NEXT:  ; %bb.2:
-; GFX12-NEXT:    s_getpc_b64 s[0:1]
+; GFX12-NEXT:    s_add_co_ci_u32 s3, s3, foo at gotpcrel32@hi+24
+; GFX12-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX12-NEXT:    v_mbcnt_hi_u32_b32 v2, -1, v2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX12-NEXT:    v_add_nc_u32_e32 v2, 32, v2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX12-NEXT:    v_mul_lo_u32 v2, 4, v2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX12-NEXT:    ds_swizzle_b32 v1, v0 offset:swizzle(BROADCAST,32,15)
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    v_max_u32_e32 v1, v0, v1
+; GFX12-NEXT:    ds_permute_b32 v1, v2, v1
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    v_max_u32_e32 v0, v0, v1
+; GFX12-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX12-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_readlane_b32 s4, v0, 63
 ; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT:    s_sext_i32_i16 s1, s1
-; GFX12-NEXT:    s_add_co_u32 s0, s0, foo at gotpcrel32@lo+12
+; GFX12-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX12-NEXT:    s_mov_b32 s5, s32
 ; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, foo at gotpcrel32@hi+24
-; GFX12-NEXT:    s_mov_b32 s4, s32
-; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX12-NEXT:    v_lshl_add_u32 v0, s2, 6, s4
+; GFX12-NEXT:    v_lshl_add_u32 v3, s4, 6, s5
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX12-NEXT:    v_readfirstlane_b32 s32, v3
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX12-NEXT:    scratch_store_b32 off, v40, s4
+; GFX12-NEXT:    v_mov_b32_e32 v3, 0
+; GFX12-NEXT:    scratch_store_b32 off, v3, s5
 ; GFX12-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: test_call_and_alloca_var:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_lshl_add_u32 v0, v8, 2, 15
-; GFX942-NEXT:    v_and_b32_e32 v0, -16, v0
-; GFX942-NEXT:    v_mov_b32_e32 v40, 0
-; GFX942-NEXT:    s_mov_b64 s[0:1], exec
-; GFX942-NEXT:    s_mov_b32 s2, 0
+; GFX942-NEXT:    v_lshl_add_u32 v1, v8, 2, 15
 ; GFX942-NEXT:    s_mov_b32 s33, s32
 ; GFX942-NEXT:    s_add_i32 s32, s32, 16
-; GFX942-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT:    s_ff1_i32_b64 s3, s[0:1]
-; GFX942-NEXT:    v_readlane_b32 s4, v0, s3
-; GFX942-NEXT:    s_bitset0_b64 s[0:1], s3
-; GFX942-NEXT:    s_max_u32 s2, s2, s4
-; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX942-NEXT:    s_cbranch_scc1 .LBB9_1
-; GFX942-NEXT:  ; %bb.2:
+; GFX942-NEXT:    v_and_b32_e32 v1, -16, v1
+; GFX942-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942-NEXT:    v_cndmask_b32_e64 v0, 0, v1, s[0:1]
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_max_u32_dpp v0, v0, v0 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_max_u32_dpp v0, v0, v0 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_readlane_b32 s2, v0, 63
+; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX942-NEXT:    s_getpc_b64 s[0:1]
 ; GFX942-NEXT:    s_add_u32 s0, s0, foo at gotpcrel32@lo+4
 ; GFX942-NEXT:    s_addc_u32 s1, s1, foo at gotpcrel32@hi+12
 ; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX942-NEXT:    s_mov_b32 s4, s32
-; GFX942-NEXT:    v_mov_b32_e32 v0, s4
-; GFX942-NEXT:    v_lshl_add_u32 v0, s2, 6, v0
+; GFX942-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NEXT:    v_lshl_add_u32 v1, s2, 6, v1
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX942-NEXT:    v_readfirstlane_b32 s32, v1
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX942-NEXT:    scratch_store_dword off, v40, s4
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    scratch_store_dword off, v1, s4
 ; GFX942-NEXT:    s_endpgm
   %v = alloca i32, i32 %count, align 4, addrspace(5)
   call amdgpu_gfx void @foo()
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
index d19a260db3550..95c1dd4d35f86 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
@@ -248,20 +248,26 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent() {
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_add_u32 s0, s0, s17
 ; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
-; GFX9-SDAG-NEXT:    s_addc_u32 s1, s1, 0
-; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
-; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
-; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
 ; GFX9-SDAG-NEXT:    s_mov_b32 s33, 0
 ; GFX9-SDAG-NEXT:    s_movk_i32 s32, 0x400
-; GFX9-SDAG-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
-; GFX9-SDAG-NEXT:    v_readlane_b32 s8, v0, s7
-; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s7
-; GFX9-SDAG-NEXT:    s_max_u32 s6, s6, s8
-; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB3_1
-; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX9-SDAG-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[4:5]
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    v_readlane_b32 s6, v1, 63
+; GFX9-SDAG-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
@@ -273,54 +279,64 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent() {
 ;
 ; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent:
 ; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_movk_i32 s32, 0x400
 ; GFX9-GISEL-NEXT:    s_add_u32 s0, s0, s17
 ; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, 0
 ; GFX9-GISEL-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, s32
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
-; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
-; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s33, 0
-; GFX9-GISEL-NEXT:    s_movk_i32 s32, 0x400
-; GFX9-GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
-; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
-; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
-; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
-; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
-; GFX9-GISEL-NEXT:  ; %bb.2:
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
-; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s6, 6
+; GFX9-GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[4:5]
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    v_readlane_b32 s7, v1, 63
+; GFX9-GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-GISEL-NEXT:    s_lshl_b32 s4, s7, 6
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
-; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s6, s4
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
-; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
 ; GFX11-SDAG-NEXT:    s_mov_b32 s33, 0
 ; GFX11-SDAG-NEXT:    s_mov_b32 s32, 16
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
-; GFX11-SDAG-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
-; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
-; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB3_1
-; GFX11-SDAG-NEXT:  ; %bb.2:
-; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x7b
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
-; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_max_u32_e32 v1, v1, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s1, v1, 31
+; GFX11-SDAG-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x7b
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s1, 5, s0
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v3, s0 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
 ; GFX11-SDAG-NEXT:    s_endpgm
@@ -328,28 +344,31 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent() {
 ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s33, 0
 ; GFX11-GISEL-NEXT:    s_mov_b32 s32, 16
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, s32
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
-; GFX11-GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
-; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
-; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
-; GFX11-GISEL-NEXT:  ; %bb.2:
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_max_u32_e32 v1, v1, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s2, v1, 31
+; GFX11-GISEL-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
-; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
-; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_lshl_b32 s1, s2, 5
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s0, s1
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s0 dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-GISEL-NEXT:    s_endpgm
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -366,24 +385,30 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_over_aligned
 ; GFX9-SDAG-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX9-SDAG-NEXT:    s_add_i32 s4, s32, 0x1fff
 ; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, 0
 ; GFX9-SDAG-NEXT:    s_and_b32 s6, s4, 0xffffe000
 ; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
-; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
-; GFX9-SDAG-NEXT:    s_mov_b32 s7, 0
-; GFX9-SDAG-NEXT:    s_mov_b32 s33, 0
-; GFX9-SDAG-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
-; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s8, s[4:5]
-; GFX9-SDAG-NEXT:    v_readlane_b32 s9, v0, s8
-; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s8
-; GFX9-SDAG-NEXT:    s_max_u32 s7, s7, s9
-; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB4_1
-; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[4:5]
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    v_readlane_b32 s7, v1, 63
+; GFX9-SDAG-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, s7, 6, v0
-; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1bc
-; GFX9-SDAG-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v2, s7, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v2
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 0x1bc
+; GFX9-SDAG-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
@@ -391,85 +416,98 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_over_aligned
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_add_u32 s0, s0, s17
 ; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
-; GFX9-GISEL-NEXT:    s_addc_u32 s1, s1, 0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
-; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
-; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
 ; GFX9-GISEL-NEXT:    s_mov_b32 s33, 0
 ; GFX9-GISEL-NEXT:    s_movk_i32 s32, 0x2000
-; GFX9-GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
-; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
-; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
-; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
-; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
-; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
-; GFX9-GISEL-NEXT:  ; %bb.2:
+; GFX9-GISEL-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[4:5]
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    v_readlane_b32 s6, v1, 63
+; GFX9-GISEL-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-GISEL-NEXT:    s_add_u32 s5, s32, 0x1fff
 ; GFX9-GISEL-NEXT:    s_and_b32 s5, s5, 0xffffe000
 ; GFX9-GISEL-NEXT:    s_lshl_b32 s4, s6, 6
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x1bc
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX9-GISEL-NEXT:    s_add_u32 s32, s5, s4
-; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent_over_aligned:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, 0
 ; GFX11-SDAG-NEXT:    s_movk_i32 s32, 0x80
-; GFX11-SDAG-NEXT:    s_mov_b32 s2, exec_lo
-; GFX11-SDAG-NEXT:    s_add_i32 s0, s32, 0xfff
-; GFX11-SDAG-NEXT:    s_mov_b32 s1, 0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
-; GFX11-SDAG-NEXT:    s_and_b32 s0, s0, 0xfffff000
-; GFX11-SDAG-NEXT:    s_mov_b32 s33, 0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
-; GFX11-SDAG-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX11-SDAG-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT:    v_readlane_b32 s4, v0, s3
-; GFX11-SDAG-NEXT:    s_bitset0_b32 s2, s3
-; GFX11-SDAG-NEXT:    s_max_u32 s1, s1, s4
-; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB4_1
-; GFX11-SDAG-NEXT:  ; %bb.2:
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_max_u32_e32 v1, v1, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s1, v1, 31
+; GFX11-SDAG-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-SDAG-NEXT:    s_add_i32 s0, s32, 0xfff
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x1bc
+; GFX11-SDAG-NEXT:    s_and_b32 s0, s0, 0xfffff000
 ; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s1, 5, s0
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1bc
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s0 dlc
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v3, s0 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent_over_aligned:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
 ; GFX11-GISEL-NEXT:    s_mov_b32 s33, 0
 ; GFX11-GISEL-NEXT:    s_movk_i32 s32, 0x80
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
-; GFX11-GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
-; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
-; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
-; GFX11-GISEL-NEXT:  ; %bb.2:
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_max_u32_e32 v1, v1, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s1, v1, 31
+; GFX11-GISEL-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x1bc
-; GFX11-GISEL-NEXT:    s_add_u32 s1, s32, 0xfff
-; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
-; GFX11-GISEL-NEXT:    s_and_b32 s1, s1, 0xfffff000
+; GFX11-GISEL-NEXT:    s_add_u32 s0, s32, 0xfff
+; GFX11-GISEL-NEXT:    s_lshl_b32 s1, s1, 5
+; GFX11-GISEL-NEXT:    s_and_b32 s0, s0, 0xfffff000
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
-; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s0, s1
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s0 dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-GISEL-NEXT:    s_endpgm
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -482,20 +520,26 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_under_aligne
 ; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent_under_aligned:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_add_u32 s0, s0, s17
-; GFX9-SDAG-NEXT:    s_addc_u32 s1, s1, 0
-; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
-; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
 ; GFX9-SDAG-NEXT:    s_mov_b32 s33, 0
 ; GFX9-SDAG-NEXT:    s_movk_i32 s32, 0x400
-; GFX9-SDAG-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
-; GFX9-SDAG-NEXT:    v_readlane_b32 s8, v0, s7
-; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s7
-; GFX9-SDAG-NEXT:    s_max_u32 s6, s6, s8
-; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB5_1
-; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX9-SDAG-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[4:5]
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    v_readlane_b32 s6, v1, 63
+; GFX9-SDAG-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
@@ -507,52 +551,63 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_under_aligne
 ;
 ; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent_under_aligned:
 ; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_movk_i32 s32, 0x400
 ; GFX9-GISEL-NEXT:    s_add_u32 s0, s0, s17
 ; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 4, 15
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, 0
 ; GFX9-GISEL-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, s32
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
-; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
-; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s33, 0
-; GFX9-GISEL-NEXT:    s_movk_i32 s32, 0x400
-; GFX9-GISEL-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
-; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
-; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
-; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
-; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB5_1
-; GFX9-GISEL-NEXT:  ; %bb.2:
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
-; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s6, 6
+; GFX9-GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[4:5]
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    v_readlane_b32 s7, v1, 63
+; GFX9-GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-GISEL-NEXT:    s_lshl_b32 s4, s7, 6
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
-; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s6, s4
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent_under_aligned:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
-; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
 ; GFX11-SDAG-NEXT:    s_mov_b32 s33, 0
 ; GFX11-SDAG-NEXT:    s_mov_b32 s32, 16
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX11-SDAG-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
-; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
-; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB5_1
-; GFX11-SDAG-NEXT:  ; %bb.2:
-; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x29a
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
-; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_max_u32_e32 v1, v1, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s1, v1, 31
+; GFX11-SDAG-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x29a
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s1, 5, s0
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v3, s0 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
 ; GFX11-SDAG-NEXT:    s_endpgm
@@ -560,28 +615,31 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_under_aligne
 ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent_under_aligned:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s33, 0
 ; GFX11-GISEL-NEXT:    s_mov_b32 s32, 16
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, s32
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 4, 15
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
-; GFX11-GISEL-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
-; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
-; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB5_1
-; GFX11-GISEL-NEXT:  ; %bb.2:
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_max_u32_e32 v1, v1, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s2, v1, 31
+; GFX11-GISEL-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
-; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
-; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_lshl_b32 s1, s2, 5
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s0, s1
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s0 dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-GISEL-NEXT:    s_endpgm
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -596,42 +654,48 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 %
 ; GFX9-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX9-SDAG-NEXT:    s_add_u32 s0, s0, s17
 ; GFX9-SDAG-NEXT:    s_addc_u32 s1, s1, 0
-; GFX9-SDAG-NEXT:    s_mov_b32 s8, 0
 ; GFX9-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX9-SDAG-NEXT:    s_movk_i32 s32, 0x2000
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX9-SDAG-NEXT:    s_movk_i32 s32, 0x2000
-; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB6_2
 ; GFX9-SDAG-NEXT:  ; %bb.1: ; %bb.0
 ; GFX9-SDAG-NEXT:    s_lshl2_add_u32 s5, s5, 15
 ; GFX9-SDAG-NEXT:    s_add_i32 s6, s32, 0xfff
 ; GFX9-SDAG-NEXT:    s_and_b32 s5, s5, -16
-; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
-; GFX9-SDAG-NEXT:    s_and_b32 s9, s6, 0xfffff000
+; GFX9-SDAG-NEXT:    s_and_b32 s8, s6, 0xfffff000
 ; GFX9-SDAG-NEXT:    s_lshl_b32 s5, s5, 6
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    s_add_i32 s32, s8, s5
 ; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
-; GFX9-SDAG-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9-SDAG-NEXT:    s_add_i32 s32, s9, s5
-; GFX9-SDAG-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
-; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s5, s[6:7]
-; GFX9-SDAG-NEXT:    v_readlane_b32 s10, v0, s5
-; GFX9-SDAG-NEXT:    s_bitset0_b64 s[6:7], s5
-; GFX9-SDAG-NEXT:    s_max_u32 s8, s8, s10
-; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB6_2
-; GFX9-SDAG-NEXT:  ; %bb.3:
-; GFX9-SDAG-NEXT:    s_mov_b32 s5, s32
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s5
-; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s8, 6, v0
+; GFX9-SDAG-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[6:7]
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    v_readlane_b32 s5, v1, 63
+; GFX9-SDAG-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, s32
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s5, 6, v0
 ; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 3
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-SDAG-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, s8
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 4
-; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s5
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s6
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:  .LBB6_4: ; %bb.1
+; GFX9-SDAG-NEXT:  .LBB6_2: ; %bb.1
 ; GFX9-SDAG-NEXT:    s_lshl2_add_u32 s4, s4, 15
 ; GFX9-SDAG-NEXT:    s_and_b32 s4, s4, -16
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 1
@@ -650,42 +714,48 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 %
 ; GFX9-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX9-GISEL-NEXT:    s_add_u32 s0, s0, s17
 ; GFX9-GISEL-NEXT:    s_addc_u32 s1, s1, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s8, 0
 ; GFX9-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX9-GISEL-NEXT:    s_movk_i32 s32, 0x2000
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX9-GISEL-NEXT:    s_movk_i32 s32, 0x2000
-; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB6_2
 ; GFX9-GISEL-NEXT:  ; %bb.1: ; %bb.0
 ; GFX9-GISEL-NEXT:    s_lshl2_add_u32 s5, s5, 15
 ; GFX9-GISEL-NEXT:    s_and_b32 s5, s5, -16
-; GFX9-GISEL-NEXT:    s_lshl_b32 s6, s5, 6
-; GFX9-GISEL-NEXT:    s_add_u32 s5, s32, 0xfff
-; GFX9-GISEL-NEXT:    s_and_b32 s5, s5, 0xfffff000
+; GFX9-GISEL-NEXT:    s_add_u32 s6, s32, 0xfff
+; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s5, 6
+; GFX9-GISEL-NEXT:    s_and_b32 s8, s6, 0xfffff000
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s8, s5
 ; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
-; GFX9-GISEL-NEXT:    s_add_u32 s32, s5, s6
+; GFX9-GISEL-NEXT:    s_mov_b32 s5, s32
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
-; GFX9-GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9-GISEL-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
-; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s9, s[6:7]
-; GFX9-GISEL-NEXT:    v_readlane_b32 s10, v0, s9
-; GFX9-GISEL-NEXT:    s_bitset0_b64 s[6:7], s9
-; GFX9-GISEL-NEXT:    s_max_u32 s8, s8, s10
-; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB6_2
-; GFX9-GISEL-NEXT:  ; %bb.3:
-; GFX9-GISEL-NEXT:    s_mov_b32 s6, s32
+; GFX9-GISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[6:7]
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    v_readlane_b32 s9, v1, 63
+; GFX9-GISEL-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 3
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-GISEL-NEXT:    s_lshl_b32 s7, s8, 6
-; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s8
+; GFX9-GISEL-NEXT:    s_lshl_b32 s6, s9, 6
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-GISEL-NEXT:    s_add_u32 s32, s6, s7
-; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s5, s6
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:  .LBB6_4: ; %bb.1
+; GFX9-GISEL-NEXT:  .LBB6_2: ; %bb.1
 ; GFX9-GISEL-NEXT:    s_lshl2_add_u32 s4, s4, 15
 ; GFX9-GISEL-NEXT:    s_mov_b32 s5, s32
 ; GFX9-GISEL-NEXT:    s_and_b32 s4, s4, -16
@@ -694,60 +764,66 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 %
 ; GFX9-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], s33
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 2
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX9-GISEL-NEXT:    s_add_u32 s32, s5, s4
-; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_multiple_allocas:
 ; GFX11-SDAG:       ; %bb.0: ; %entry
 ; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-SDAG-NEXT:    s_mov_b32 s2, 0
 ; GFX11-SDAG-NEXT:    s_mov_b32 s33, 0
 ; GFX11-SDAG-NEXT:    s_movk_i32 s32, 0x80
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB6_2
 ; GFX11-SDAG-NEXT:  ; %bb.1: ; %bb.0
 ; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-SDAG-NEXT:    s_lshl2_add_u32 s1, s1, 15
-; GFX11-SDAG-NEXT:    s_add_i32 s3, s32, 0x7ff
-; GFX11-SDAG-NEXT:    s_and_b32 s4, s1, -16
-; GFX11-SDAG-NEXT:    s_and_b32 s1, s3, 0xfffff800
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
-; GFX11-SDAG-NEXT:    s_lshl_b32 s3, s4, 5
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    s_add_i32 s32, s1, s3
-; GFX11-SDAG-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
-; GFX11-SDAG-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
-; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s4, s3
+; GFX11-SDAG-NEXT:    s_or_saveexec_b32 s2, -1
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT:    v_readlane_b32 s5, v0, s4
-; GFX11-SDAG-NEXT:    s_bitset0_b32 s3, s4
-; GFX11-SDAG-NEXT:    s_max_u32 s2, s2, s5
-; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB6_2
-; GFX11-SDAG-NEXT:  ; %bb.3:
-; GFX11-SDAG-NEXT:    s_mov_b32 s3, s32
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v1, 3 :: v_dual_mov_b32 v2, 4
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s2, 5, s3
-; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s2
+; GFX11-SDAG-NEXT:    s_lshl2_add_u32 s1, s1, 15
+; GFX11-SDAG-NEXT:    s_add_i32 s3, s32, 0x7ff
+; GFX11-SDAG-NEXT:    s_and_b32 s1, s1, -16
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_max_u32_e32 v1, v1, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s4, v1, 31
+; GFX11-SDAG-NEXT:    s_mov_b32 exec_lo, s2
+; GFX11-SDAG-NEXT:    s_and_b32 s2, s3, 0xfffff800
+; GFX11-SDAG-NEXT:    s_lshl_b32 s1, s1, 5
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s2, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v3, s2 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b32 off, v2, s3 dlc
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v4, s1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s4, 5, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT:  .LBB6_4: ; %bb.1
+; GFX11-SDAG-NEXT:  .LBB6_2: ; %bb.1
 ; GFX11-SDAG-NEXT:    s_lshl2_add_u32 s0, s0, 15
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v3, 2
 ; GFX11-SDAG-NEXT:    s_and_b32 s0, s0, -16
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
 ; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 5
 ; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s33 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v3, s1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    s_add_i32 s32, s1, s0
 ; GFX11-SDAG-NEXT:    s_endpgm
@@ -755,50 +831,55 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 %
 ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_multiple_allocas:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
 ; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-GISEL-NEXT:    s_mov_b32 s2, 0
 ; GFX11-GISEL-NEXT:    s_mov_b32 s33, 0
 ; GFX11-GISEL-NEXT:    s_movk_i32 s32, 0x80
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB6_2
 ; GFX11-GISEL-NEXT:  ; %bb.1: ; %bb.0
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT:    s_lshl2_add_u32 s1, s1, 15
-; GFX11-GISEL-NEXT:    s_add_u32 s3, s32, 0x7ff
-; GFX11-GISEL-NEXT:    s_and_b32 s1, s1, -16
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    s_lshl_b32 s4, s1, 5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
-; GFX11-GISEL-NEXT:    s_and_b32 s1, s3, 0xfffff800
-; GFX11-GISEL-NEXT:    s_mov_b32 s3, exec_lo
-; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s4
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
-; GFX11-GISEL-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
-; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s4, s3
+; GFX11-GISEL-NEXT:    s_or_saveexec_b32 s2, -1
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    v_readlane_b32 s5, v0, s4
-; GFX11-GISEL-NEXT:    s_bitset0_b32 s3, s4
-; GFX11-GISEL-NEXT:    s_max_u32 s2, s2, s5
-; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB6_2
-; GFX11-GISEL-NEXT:  ; %bb.3:
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4
-; GFX11-GISEL-NEXT:    s_mov_b32 s3, s32
-; GFX11-GISEL-NEXT:    s_lshl_b32 s2, s2, 5
-; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s2
+; GFX11-GISEL-NEXT:    s_lshl2_add_u32 s1, s1, 15
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_max_u32_e32 v1, v1, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX11-GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX11-GISEL-NEXT:    s_add_u32 s2, s32, 0x7ff
+; GFX11-GISEL-NEXT:    s_and_b32 s1, s1, -16
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v3, 4
+; GFX11-GISEL-NEXT:    s_and_b32 s2, s2, 0xfffff800
+; GFX11-GISEL-NEXT:    s_lshl_b32 s1, s1, 5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s2, s1
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s2 dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-GISEL-NEXT:    scratch_store_b32 off, v1, s3 dlc
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s2, s3, 5
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v3, s1 dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-GISEL-NEXT:    s_add_u32 s32, s3, s2
-; GFX11-GISEL-NEXT:  .LBB6_4: ; %bb.1
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s2
+; GFX11-GISEL-NEXT:  .LBB6_2: ; %bb.1
 ; GFX11-GISEL-NEXT:    s_lshl2_add_u32 s0, s0, 15
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v3, 2
 ; GFX11-GISEL-NEXT:    s_and_b32 s0, s0, -16
 ; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
 ; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
 ; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s33 dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-GISEL-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v3, s1 dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
 ; GFX11-GISEL-NEXT:    s_endpgm
@@ -830,20 +911,26 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i
 ; GFX9-SDAG-NEXT:    s_movk_i32 s32, 0x1000
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0
-; GFX9-SDAG-NEXT:    s_cbranch_scc0 .LBB7_6
+; GFX9-SDAG-NEXT:    s_cbranch_scc0 .LBB7_4
 ; GFX9-SDAG-NEXT:  ; %bb.1: ; %bb.1
 ; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
-; GFX9-SDAG-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9-SDAG-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
-; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s8, s[6:7]
-; GFX9-SDAG-NEXT:    v_readlane_b32 s9, v0, s8
-; GFX9-SDAG-NEXT:    s_bitset0_b64 s[6:7], s8
-; GFX9-SDAG-NEXT:    s_max_u32 s4, s4, s9
-; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB7_2
-; GFX9-SDAG-NEXT:  ; %bb.3:
+; GFX9-SDAG-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[6:7]
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX9-SDAG-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9-SDAG-NEXT:    s_mov_b32 s6, s32
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s4, 6, v0
@@ -851,71 +938,77 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s6
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    s_cbranch_execnz .LBB7_5
-; GFX9-SDAG-NEXT:  .LBB7_4: ; %bb.0
+; GFX9-SDAG-NEXT:    s_cbranch_execnz .LBB7_3
+; GFX9-SDAG-NEXT:  .LBB7_2: ; %bb.0
 ; GFX9-SDAG-NEXT:    s_add_i32 s4, s32, 0xfff
 ; GFX9-SDAG-NEXT:    s_lshl2_add_u32 s5, s5, 15
 ; GFX9-SDAG-NEXT:    s_and_b32 s4, s4, 0xfffff000
 ; GFX9-SDAG-NEXT:    s_and_b32 s5, s5, -16
 ; GFX9-SDAG-NEXT:    s_lshl_b32 s5, s5, 6
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 2
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-SDAG-NEXT:    s_add_i32 s32, s4, s5
-; GFX9-SDAG-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:  .LBB7_5: ; %bb.2
+; GFX9-SDAG-NEXT:  .LBB7_3: ; %bb.2
 ; GFX9-SDAG-NEXT:    s_endpgm
-; GFX9-SDAG-NEXT:  .LBB7_6:
-; GFX9-SDAG-NEXT:    s_branch .LBB7_4
+; GFX9-SDAG-NEXT:  .LBB7_4:
+; GFX9-SDAG-NEXT:    s_branch .LBB7_2
 ;
 ; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_control_flow:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
 ; GFX9-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX9-GISEL-NEXT:    s_add_u32 s0, s0, s17
 ; GFX9-GISEL-NEXT:    s_addc_u32 s1, s1, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s8, 0
 ; GFX9-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX9-GISEL-NEXT:    s_movk_i32 s32, 0x1000
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-GISEL-NEXT:    s_mov_b32 s4, 1
-; GFX9-GISEL-NEXT:    s_movk_i32 s32, 0x1000
-; GFX9-GISEL-NEXT:    s_cbranch_scc0 .LBB7_4
+; GFX9-GISEL-NEXT:    s_cbranch_scc0 .LBB7_2
 ; GFX9-GISEL-NEXT:  ; %bb.1: ; %bb.1
 ; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_mov_b32 s8, s32
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
-; GFX9-GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9-GISEL-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
-; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s4, s[6:7]
-; GFX9-GISEL-NEXT:    v_readlane_b32 s9, v0, s4
-; GFX9-GISEL-NEXT:    s_bitset0_b64 s[6:7], s4
-; GFX9-GISEL-NEXT:    s_max_u32 s8, s8, s9
-; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB7_2
-; GFX9-GISEL-NEXT:  ; %bb.3:
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
-; GFX9-GISEL-NEXT:    s_lshl_b32 s6, s8, 6
+; GFX9-GISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[6:7]
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX9-GISEL-NEXT:    s_nop 0
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    v_readlane_b32 s9, v1, 63
+; GFX9-GISEL-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-GISEL-NEXT:    s_lshl_b32 s6, s9, 6
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 1
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s6
-; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s8
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s8, s6
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:  .LBB7_4: ; %Flow
+; GFX9-GISEL-NEXT:  .LBB7_2: ; %Flow
 ; GFX9-GISEL-NEXT:    s_xor_b32 s4, s4, 1
 ; GFX9-GISEL-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB7_6
-; GFX9-GISEL-NEXT:  ; %bb.5: ; %bb.0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB7_4
+; GFX9-GISEL-NEXT:  ; %bb.3: ; %bb.0
 ; GFX9-GISEL-NEXT:    s_lshl2_add_u32 s4, s5, 15
 ; GFX9-GISEL-NEXT:    s_add_u32 s5, s32, 0xfff
 ; GFX9-GISEL-NEXT:    s_and_b32 s4, s4, -16
 ; GFX9-GISEL-NEXT:    s_and_b32 s5, s5, 0xfffff000
 ; GFX9-GISEL-NEXT:    s_lshl_b32 s4, s4, 6
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 2
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX9-GISEL-NEXT:    s_add_u32 s32, s5, s4
-; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:  .LBB7_6: ; %bb.2
+; GFX9-GISEL-NEXT:  .LBB7_4: ; %bb.2
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_control_flow:
@@ -925,31 +1018,35 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i
 ; GFX11-SDAG-NEXT:    s_mov_b32 s32, 64
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
-; GFX11-SDAG-NEXT:    s_cbranch_scc0 .LBB7_6
+; GFX11-SDAG-NEXT:    s_cbranch_scc0 .LBB7_4
 ; GFX11-SDAG-NEXT:  ; %bb.1: ; %bb.1
 ; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-SDAG-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
-; GFX11-SDAG-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
-; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX11-SDAG-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT:    v_readlane_b32 s4, v0, s3
-; GFX11-SDAG-NEXT:    s_bitset0_b32 s2, s3
-; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s4
-; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB7_2
-; GFX11-SDAG-NEXT:  ; %bb.3:
-; GFX11-SDAG-NEXT:    s_mov_b32 s2, s32
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 1
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s2
-; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s2 dlc
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_max_u32_e32 v1, v1, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s2, v1, 31
+; GFX11-SDAG-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-SDAG-NEXT:    s_mov_b32 s3, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 1
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s2, 5, s3
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v3, s3 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT:    s_cbranch_execnz .LBB7_5
-; GFX11-SDAG-NEXT:  .LBB7_4: ; %bb.0
+; GFX11-SDAG-NEXT:    s_cbranch_execnz .LBB7_3
+; GFX11-SDAG-NEXT:  .LBB7_2: ; %bb.0
 ; GFX11-SDAG-NEXT:    s_lshl2_add_u32 s1, s1, 15
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 2
 ; GFX11-SDAG-NEXT:    s_add_i32 s0, s32, 0x7ff
@@ -959,50 +1056,53 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i
 ; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s0 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    s_add_i32 s32, s0, s1
-; GFX11-SDAG-NEXT:  .LBB7_5: ; %bb.2
+; GFX11-SDAG-NEXT:  .LBB7_3: ; %bb.2
 ; GFX11-SDAG-NEXT:    s_endpgm
-; GFX11-SDAG-NEXT:  .LBB7_6:
-; GFX11-SDAG-NEXT:    s_branch .LBB7_4
+; GFX11-SDAG-NEXT:  .LBB7_4:
+; GFX11-SDAG-NEXT:    s_branch .LBB7_2
 ;
 ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_control_flow:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
 ; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-GISEL-NEXT:    s_mov_b32 s2, 0
 ; GFX11-GISEL-NEXT:    s_mov_b32 s33, 0
 ; GFX11-GISEL-NEXT:    s_mov_b32 s32, 64
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX11-GISEL-NEXT:    s_mov_b32 s0, 1
-; GFX11-GISEL-NEXT:    s_cbranch_scc0 .LBB7_4
+; GFX11-GISEL-NEXT:    s_cbranch_scc0 .LBB7_2
 ; GFX11-GISEL-NEXT:  ; %bb.1: ; %bb.1
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s2, s32
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
-; GFX11-GISEL-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
-; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s3, s0
+; GFX11-GISEL-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    v_readlane_b32 s4, v0, s3
-; GFX11-GISEL-NEXT:    s_bitset0_b32 s0, s3
-; GFX11-GISEL-NEXT:    s_max_u32 s2, s2, s4
-; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB7_2
-; GFX11-GISEL-NEXT:  ; %bb.3:
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_max_u32_e32 v1, v1, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX11-GISEL-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 1
-; GFX11-GISEL-NEXT:    s_mov_b32 s3, s32
-; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s2, 5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    s_add_u32 s32, s3, s0
+; GFX11-GISEL-NEXT:    s_lshl_b32 s3, s3, 5
 ; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s3 dlc
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s2, s3
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s2 dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-GISEL-NEXT:  .LBB7_4: ; %Flow
+; GFX11-GISEL-NEXT:  .LBB7_2: ; %Flow
 ; GFX11-GISEL-NEXT:    s_xor_b32 s0, s0, 1
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB7_6
-; GFX11-GISEL-NEXT:  ; %bb.5: ; %bb.0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB7_4
+; GFX11-GISEL-NEXT:  ; %bb.3: ; %bb.0
 ; GFX11-GISEL-NEXT:    s_lshl2_add_u32 s0, s1, 15
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 2
 ; GFX11-GISEL-NEXT:    s_add_u32 s1, s32, 0x7ff
@@ -1012,7 +1112,7 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i
 ; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
-; GFX11-GISEL-NEXT:  .LBB7_6: ; %bb.2
+; GFX11-GISEL-NEXT:  .LBB7_4: ; %bb.2
 ; GFX11-GISEL-NEXT:    s_endpgm
 entry:
   %cond = icmp eq i32 %n, 0
@@ -1034,21 +1134,30 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) {
 ; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_uniform:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
-; GFX9-SDAG-NEXT:    s_mov_b32 s9, s33
-; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
-; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
-; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s7, s33
 ; GFX9-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX9-SDAG-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-SDAG-NEXT:    buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-SDAG-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-SDAG-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
-; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
-; GFX9-SDAG-NEXT:    v_readlane_b32 s8, v0, s7
-; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s7
-; GFX9-SDAG-NEXT:    s_max_u32 s6, s6, s8
-; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB8_1
-; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-SDAG-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[4:5]
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    v_readlane_b32 s6, v1, 63
+; GFX9-SDAG-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
@@ -1057,95 +1166,143 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) {
 ; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_mov_b32 s32, s33
-; GFX9-SDAG-NEXT:    s_mov_b32 s33, s9
+; GFX9-SDAG-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-SDAG-NEXT:    buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-SDAG-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s7
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_uniform:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
-; GFX9-GISEL-NEXT:    s_mov_b32 s9, s33
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
-; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
-; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s8, s33
 ; GFX9-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX9-GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-GISEL-NEXT:    buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-GISEL-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
-; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
-; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
-; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
-; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
-; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
-; GFX9-GISEL-NEXT:  ; %bb.2:
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
-; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s6, 6
-; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, s32
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[4:5]
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    v_readlane_b32 s7, v1, 63
+; GFX9-GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-GISEL-NEXT:    s_lshl_b32 s4, s7, 6
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s6, s4
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_mov_b32 s32, s33
-; GFX9-GISEL-NEXT:    s_mov_b32 s33, s9
+; GFX9-GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-GISEL-NEXT:    buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s8
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_uniform:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
-; GFX11-SDAG-NEXT:    s_mov_b32 s4, s33
-; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
-; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s2, s33
 ; GFX11-SDAG-NEXT:    s_mov_b32 s33, s32
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-SDAG-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-SDAG-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s33
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v2, s33 offset:4
+; GFX11-SDAG-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-SDAG-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-SDAG-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
-; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
-; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB8_1
-; GFX11-SDAG-NEXT:  ; %bb.2:
-; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x7b
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
-; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_max_u32_e32 v1, v1, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s1, v1, 31
+; GFX11-SDAG-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x7b
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s1, 5, s0
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v3, s0 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
 ; GFX11-SDAG-NEXT:    s_mov_b32 s32, s33
-; GFX11-SDAG-NEXT:    s_mov_b32 s33, s4
+; GFX11-SDAG-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-SDAG-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
+; GFX11-SDAG-NEXT:    scratch_load_b32 v1, off, s33
+; GFX11-SDAG-NEXT:    scratch_load_b32 v2, off, s33 offset:4
+; GFX11-SDAG-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s2
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_uniform:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
-; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s3, s33
 ; GFX11-GISEL-NEXT:    s_mov_b32 s33, s32
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-GISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v1, s33
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v2, s33 offset:4
+; GFX11-GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, s32
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
-; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
-; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
-; GFX11-GISEL-NEXT:  ; %bb.2:
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
-; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
-; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
-; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_max_u32_e32 v1, v1, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s2, v1, 31
+; GFX11-GISEL-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX11-GISEL-NEXT:    s_lshl_b32 s1, s2, 5
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s0, s1
 ; GFX11-GISEL-NEXT:    s_mov_b32 s32, s33
-; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s0 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-GISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
+; GFX11-GISEL-NEXT:    scratch_load_b32 v1, off, s33
+; GFX11-GISEL-NEXT:    scratch_load_b32 v2, off, s33 offset:4
+; GFX11-GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s3
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, i32 %n, addrspace(5)
   store volatile i32 123, ptr addrspace(5) %alloca
@@ -1156,138 +1313,198 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
 ; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    s_mov_b32 s10, s33
+; GFX9-SDAG-NEXT:    s_mov_b32 s8, s33
 ; GFX9-SDAG-NEXT:    s_add_i32 s33, s32, 0x1fc0
-; GFX9-SDAG-NEXT:    s_mov_b32 s11, s34
+; GFX9-SDAG-NEXT:    s_and_b32 s33, s33, 0xffffe000
+; GFX9-SDAG-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-SDAG-NEXT:    buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-SDAG-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s34
 ; GFX9-SDAG-NEXT:    s_mov_b32 s34, s32
 ; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x4000
 ; GFX9-SDAG-NEXT:    s_add_i32 s4, s32, 0x1fff
 ; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX9-SDAG-NEXT:    s_and_b32 s6, s4, 0xffffe000
 ; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
-; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
-; GFX9-SDAG-NEXT:    s_mov_b32 s7, 0
-; GFX9-SDAG-NEXT:    s_and_b32 s33, s33, 0xffffe000
-; GFX9-SDAG-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
-; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s8, s[4:5]
-; GFX9-SDAG-NEXT:    v_readlane_b32 s9, v0, s8
-; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s8
-; GFX9-SDAG-NEXT:    s_max_u32 s7, s7, s9
-; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB9_1
-; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[4:5]
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    v_readlane_b32 s7, v1, 63
+; GFX9-SDAG-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, s7, 6, v0
-; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 10
-; GFX9-SDAG-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v2, s7, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v2
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 10
+; GFX9-SDAG-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_mov_b32 s32, s34
-; GFX9-SDAG-NEXT:    s_mov_b32 s34, s11
-; GFX9-SDAG-NEXT:    s_mov_b32 s33, s10
+; GFX9-SDAG-NEXT:    s_mov_b32 s34, s9
+; GFX9-SDAG-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-SDAG-NEXT:    buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-SDAG-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s8
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
-; GFX9-GISEL-NEXT:    s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT:    s_mov_b32 s7, s33
 ; GFX9-GISEL-NEXT:    s_add_i32 s33, s32, 0x1fc0
-; GFX9-GISEL-NEXT:    s_mov_b32 s10, s34
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
-; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
-; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
 ; GFX9-GISEL-NEXT:    s_and_b32 s33, s33, 0xffffe000
+; GFX9-GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-GISEL-NEXT:    buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_mov_b32 s8, s34
 ; GFX9-GISEL-NEXT:    s_mov_b32 s34, s32
 ; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x4000
-; GFX9-GISEL-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
-; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
-; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
-; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
-; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
-; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB9_1
-; GFX9-GISEL-NEXT:  ; %bb.2:
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[4:5]
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    v_readlane_b32 s6, v1, 63
+; GFX9-GISEL-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-GISEL-NEXT:    s_add_u32 s5, s32, 0x1fff
 ; GFX9-GISEL-NEXT:    s_lshl_b32 s4, s6, 6
 ; GFX9-GISEL-NEXT:    s_and_b32 s5, s5, 0xffffe000
 ; GFX9-GISEL-NEXT:    s_add_u32 s32, s5, s4
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 10
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_mov_b32 s32, s34
-; GFX9-GISEL-NEXT:    s_mov_b32 s34, s10
-; GFX9-GISEL-NEXT:    s_mov_b32 s33, s9
+; GFX9-GISEL-NEXT:    s_mov_b32 s34, s8
+; GFX9-GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-GISEL-NEXT:    buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s7
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
-; GFX11-SDAG-NEXT:    s_mov_b32 s5, s33
+; GFX11-SDAG-NEXT:    s_mov_b32 s2, s33
 ; GFX11-SDAG-NEXT:    s_add_i32 s33, s32, 0x7f
-; GFX11-SDAG-NEXT:    s_mov_b32 s6, s34
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_and_b32 s33, s33, 0xffffff80
+; GFX11-SDAG-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-SDAG-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s33
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v2, s33 offset:4
+; GFX11-SDAG-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    s_mov_b32 s3, s34
 ; GFX11-SDAG-NEXT:    s_mov_b32 s34, s32
 ; GFX11-SDAG-NEXT:    s_addk_i32 s32, 0x100
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-SDAG-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_max_u32_e32 v1, v1, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s1, v1, 31
+; GFX11-SDAG-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-SDAG-NEXT:    s_add_i32 s0, s32, 0xfff
-; GFX11-SDAG-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 10
 ; GFX11-SDAG-NEXT:    s_and_b32 s0, s0, 0xfffff000
-; GFX11-SDAG-NEXT:    s_mov_b32 s1, 0
-; GFX11-SDAG-NEXT:    s_and_b32 s33, s33, 0xffffff80
-; GFX11-SDAG-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s3, s2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT:    v_readlane_b32 s4, v0, s3
-; GFX11-SDAG-NEXT:    s_bitset0_b32 s2, s3
-; GFX11-SDAG-NEXT:    s_max_u32 s1, s1, s4
-; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB9_1
-; GFX11-SDAG-NEXT:  ; %bb.2:
 ; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s1, 5, s0
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 10
-; GFX11-SDAG-NEXT:    s_mov_b32 s33, s5
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s0 dlc
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v3, s0 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
 ; GFX11-SDAG-NEXT:    s_mov_b32 s32, s34
-; GFX11-SDAG-NEXT:    s_mov_b32 s34, s6
+; GFX11-SDAG-NEXT:    s_mov_b32 s34, s3
+; GFX11-SDAG-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-SDAG-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
+; GFX11-SDAG-NEXT:    scratch_load_b32 v1, off, s33
+; GFX11-SDAG-NEXT:    scratch_load_b32 v2, off, s33 offset:4
+; GFX11-SDAG-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s2
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
-; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
+; GFX11-GISEL-NEXT:    s_mov_b32 s2, s33
 ; GFX11-GISEL-NEXT:    s_add_i32 s33, s32, 0x7f
-; GFX11-GISEL-NEXT:    s_mov_b32 s5, s34
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    s_and_b32 s33, s33, 0xffffff80
+; GFX11-GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-GISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v1, s33
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v2, s33 offset:4
+; GFX11-GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_mov_b32 s3, s34
 ; GFX11-GISEL-NEXT:    s_mov_b32 s34, s32
 ; GFX11-GISEL-NEXT:    s_addk_i32 s32, 0x100
-; GFX11-GISEL-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
-; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
-; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB9_1
-; GFX11-GISEL-NEXT:  ; %bb.2:
-; GFX11-GISEL-NEXT:    s_add_u32 s1, s32, 0xfff
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_max_u32_e32 v1, v1, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s1, v1, 31
+; GFX11-GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-GISEL-NEXT:    s_add_u32 s0, s32, 0xfff
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 10
-; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
-; GFX11-GISEL-NEXT:    s_and_b32 s1, s1, 0xfffff000
-; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
-; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
-; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_and_b32 s0, s0, 0xfffff000
+; GFX11-GISEL-NEXT:    s_lshl_b32 s1, s1, 5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s0, s1
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s0 dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-GISEL-NEXT:    s_mov_b32 s32, s34
-; GFX11-GISEL-NEXT:    s_mov_b32 s34, s5
+; GFX11-GISEL-NEXT:    s_mov_b32 s34, s3
+; GFX11-GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-GISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
+; GFX11-GISEL-NEXT:    scratch_load_b32 v1, off, s33
+; GFX11-GISEL-NEXT:    scratch_load_b32 v2, off, s33 offset:4
+; GFX11-GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s2
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, i32 %n, align 128, addrspace(5)
   store volatile i32 10, ptr addrspace(5) %alloca
@@ -1298,21 +1515,30 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) {
 ; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_uniform_under_aligned:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
-; GFX9-SDAG-NEXT:    s_mov_b32 s9, s33
-; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
-; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
-; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s7, s33
 ; GFX9-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX9-SDAG-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-SDAG-NEXT:    buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-SDAG-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-SDAG-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
-; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
-; GFX9-SDAG-NEXT:    v_readlane_b32 s8, v0, s7
-; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s7
-; GFX9-SDAG-NEXT:    s_max_u32 s6, s6, s8
-; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB10_1
-; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-SDAG-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[4:5]
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    v_readlane_b32 s6, v1, 63
+; GFX9-SDAG-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
@@ -1321,95 +1547,143 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) {
 ; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_mov_b32 s32, s33
-; GFX9-SDAG-NEXT:    s_mov_b32 s33, s9
+; GFX9-SDAG-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-SDAG-NEXT:    buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-SDAG-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s7
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_under_aligned:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
-; GFX9-GISEL-NEXT:    s_mov_b32 s9, s33
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
-; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
-; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s8, s33
 ; GFX9-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX9-GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-GISEL-NEXT:    buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-GISEL-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-GISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
-; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
-; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
-; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
-; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
-; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB10_1
-; GFX9-GISEL-NEXT:  ; %bb.2:
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
-; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s6, 6
-; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, s32
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[4:5]
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    v_readlane_b32 s7, v1, 63
+; GFX9-GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-GISEL-NEXT:    s_lshl_b32 s4, s7, 6
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s6, s4
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 22
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_mov_b32 s32, s33
-; GFX9-GISEL-NEXT:    s_mov_b32 s33, s9
+; GFX9-GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-GISEL-NEXT:    buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s8
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_uniform_under_aligned:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
-; GFX11-SDAG-NEXT:    s_mov_b32 s4, s33
-; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
-; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s2, s33
 ; GFX11-SDAG-NEXT:    s_mov_b32 s33, s32
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-SDAG-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-SDAG-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s33
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v2, s33 offset:4
+; GFX11-SDAG-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-SDAG-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-SDAG-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
-; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
-; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB10_1
-; GFX11-SDAG-NEXT:  ; %bb.2:
-; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 22
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
-; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_max_u32_e32 v1, v1, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s1, v1, 31
+; GFX11-SDAG-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 22
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s1, 5, s0
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v3, s0 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
 ; GFX11-SDAG-NEXT:    s_mov_b32 s32, s33
-; GFX11-SDAG-NEXT:    s_mov_b32 s33, s4
+; GFX11-SDAG-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-SDAG-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
+; GFX11-SDAG-NEXT:    scratch_load_b32 v1, off, s33
+; GFX11-SDAG-NEXT:    scratch_load_b32 v2, off, s33 offset:4
+; GFX11-SDAG-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s2
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_under_aligned:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
-; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s3, s33
 ; GFX11-GISEL-NEXT:    s_mov_b32 s33, s32
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-GISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v1, s33
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v2, s33 offset:4
+; GFX11-GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-GISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, s32
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
-; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
-; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB10_1
-; GFX11-GISEL-NEXT:  ; %bb.2:
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_max_u32_e32 v1, v1, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s2, v1, 31
+; GFX11-GISEL-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 22
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
-; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
-; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
-; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_lshl_b32 s1, s2, 5
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s0, s1
 ; GFX11-GISEL-NEXT:    s_mov_b32 s32, s33
-; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s0 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-GISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
+; GFX11-GISEL-NEXT:    scratch_load_b32 v1, off, s33
+; GFX11-GISEL-NEXT:    scratch_load_b32 v2, off, s33 offset:4
+; GFX11-GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s3
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, i32 %n, align 2, addrspace(5)
   store volatile i32 22, ptr addrspace(5) %alloca
@@ -1420,124 +1694,180 @@ define void @test_dynamic_stackalloc_device_divergent() {
 ; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_divergent:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
-; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
-; GFX9-SDAG-NEXT:    s_mov_b32 s9, s33
-; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
-; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
-; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s7, s33
 ; GFX9-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX9-SDAG-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-SDAG-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v1, 0x3ff, v31
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, v1, 2, 15
 ; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-SDAG-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
-; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
-; GFX9-SDAG-NEXT:    v_readlane_b32 s8, v0, s7
-; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s7
-; GFX9-SDAG-NEXT:    s_max_u32 s6, s6, s8
-; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB11_1
-; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v1, 0x1ff0, v1
+; GFX9-SDAG-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, v1, s[4:5]
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v0, v0, v0 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v0, v0, v0 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    v_readlane_b32 s6, v0, 63
+; GFX9-SDAG-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
-; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x7b
-; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, s6, 6, v1
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0x7b
+; GFX9-SDAG-NEXT:    buffer_store_dword v1, off, s[0:3], s4
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_mov_b32 s32, s33
-; GFX9-SDAG-NEXT:    s_mov_b32 s33, s9
+; GFX9-SDAG-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-SDAG-NEXT:    buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-SDAG-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s7
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_divergent:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
-; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
-; GFX9-GISEL-NEXT:    s_mov_b32 s9, s33
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
-; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
-; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s8, s33
 ; GFX9-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX9-GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, 0x3ff, v31
 ; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-GISEL-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
-; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
-; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
-; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
-; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
-; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB11_1
-; GFX9-GISEL-NEXT:  ; %bb.2:
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
-; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s6, 6
-; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v1, v1, 2, 15
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, s32
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, -16, v1
+; GFX9-GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v0, v0, v0 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v0, v0, v0 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    v_readlane_b32 s7, v0, 63
+; GFX9-GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-GISEL-NEXT:    s_lshl_b32 s4, s7, 6
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s6, s4
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0x7b
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-GISEL-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_mov_b32 s32, s33
-; GFX9-GISEL-NEXT:    s_mov_b32 s33, s9
+; GFX9-GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-GISEL-NEXT:    buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s8
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_divergent:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
-; GFX11-SDAG-NEXT:    s_mov_b32 s4, s33
-; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
-; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s2, s33
 ; GFX11-SDAG-NEXT:    s_mov_b32 s33, s32
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-SDAG-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s33
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s33 offset:4
+; GFX11-SDAG-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v2, 0x3ff, v31
 ; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
-; GFX11-SDAG-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v2, v2, 2, 15
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v2, 0x1ff0, v2
+; GFX11-SDAG-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
-; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
-; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB11_1
-; GFX11-SDAG-NEXT:  ; %bb.2:
-; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x7b
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
-; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, v2, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    ds_swizzle_b32 v1, v0 offset:swizzle(BROADCAST,32,15)
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_max_u32_e32 v0, v0, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s1, v0, 31
+; GFX11-SDAG-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x7b
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v2, s1, 5, s0
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v3, s0 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v2
 ; GFX11-SDAG-NEXT:    s_mov_b32 s32, s33
-; GFX11-SDAG-NEXT:    s_mov_b32 s33, s4
+; GFX11-SDAG-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-SDAG-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
+; GFX11-SDAG-NEXT:    scratch_load_b32 v0, off, s33
+; GFX11-SDAG-NEXT:    scratch_load_b32 v1, off, s33 offset:4
+; GFX11-SDAG-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s2
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
-; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s3, s33
 ; GFX11-GISEL-NEXT:    s_mov_b32 s33, s32
-; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-GISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s33
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v1, s33 offset:4
+; GFX11-GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0x3ff, v31
 ; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, s32
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v2, v2, 2, 15
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
-; GFX11-GISEL-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, -16, v2
+; GFX11-GISEL-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
-; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
-; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB11_1
-; GFX11-GISEL-NEXT:  ; %bb.2:
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
-; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
-; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
-; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, v2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    ds_swizzle_b32 v1, v0 offset:swizzle(BROADCAST,32,15)
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_max_u32_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s2, v0, 31
+; GFX11-GISEL-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7b
+; GFX11-GISEL-NEXT:    s_lshl_b32 s1, s2, 5
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s0, s1
 ; GFX11-GISEL-NEXT:    s_mov_b32 s32, s33
-; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v2, s0 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-GISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
+; GFX11-GISEL-NEXT:    scratch_load_b32 v0, off, s33
+; GFX11-GISEL-NEXT:    scratch_load_b32 v1, off, s33 offset:4
+; GFX11-GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s3
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
   %alloca = alloca i32, i32 %idx, addrspace(5)
@@ -1549,142 +1879,202 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() {
 ; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_over_aligned:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    s_mov_b32 s10, s33
+; GFX9-SDAG-NEXT:    s_mov_b32 s8, s33
 ; GFX9-SDAG-NEXT:    s_add_i32 s33, s32, 0x1fc0
-; GFX9-SDAG-NEXT:    s_mov_b32 s11, s34
+; GFX9-SDAG-NEXT:    s_and_b32 s33, s33, 0xffffe000
+; GFX9-SDAG-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-SDAG-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s34
 ; GFX9-SDAG-NEXT:    s_mov_b32 s34, s32
 ; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x4000
-; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v1, 0x3ff, v31
 ; GFX9-SDAG-NEXT:    s_add_i32 s4, s32, 0x1fff
-; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, v1, 2, 15
 ; GFX9-SDAG-NEXT:    s_and_b32 s6, s4, 0xffffe000
-; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
-; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
-; GFX9-SDAG-NEXT:    s_mov_b32 s7, 0
-; GFX9-SDAG-NEXT:    s_and_b32 s33, s33, 0xffffe000
-; GFX9-SDAG-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
-; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s8, s[4:5]
-; GFX9-SDAG-NEXT:    v_readlane_b32 s9, v0, s8
-; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s8
-; GFX9-SDAG-NEXT:    s_max_u32 s7, s7, s9
-; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB12_1
-; GFX9-SDAG-NEXT:  ; %bb.2:
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, s7, 6, v0
-; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1bc
-; GFX9-SDAG-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v1, 0x1ff0, v1
+; GFX9-SDAG-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, v1, s[4:5]
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v0, v0, v0 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v0, v0, v0 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    v_readlane_b32 s7, v0, 63
+; GFX9-SDAG-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v2, s7, 6, v1
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v2
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 0x1bc
+; GFX9-SDAG-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_mov_b32 s32, s34
-; GFX9-SDAG-NEXT:    s_mov_b32 s34, s11
-; GFX9-SDAG-NEXT:    s_mov_b32 s33, s10
+; GFX9-SDAG-NEXT:    s_mov_b32 s34, s9
+; GFX9-SDAG-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-SDAG-NEXT:    buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-SDAG-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s8
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_over_aligned:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
-; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
-; GFX9-GISEL-NEXT:    s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT:    s_mov_b32 s7, s33
 ; GFX9-GISEL-NEXT:    s_add_i32 s33, s32, 0x1fc0
-; GFX9-GISEL-NEXT:    s_mov_b32 s10, s34
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
-; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
-; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
 ; GFX9-GISEL-NEXT:    s_and_b32 s33, s33, 0xffffe000
+; GFX9-GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, 0x3ff, v31
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v1, v1, 2, 15
+; GFX9-GISEL-NEXT:    s_mov_b32 s8, s34
 ; GFX9-GISEL-NEXT:    s_mov_b32 s34, s32
 ; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x4000
-; GFX9-GISEL-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
-; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
-; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
-; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
-; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
-; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB12_1
-; GFX9-GISEL-NEXT:  ; %bb.2:
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, -16, v1
+; GFX9-GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v0, v0, v0 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v0, v0, v0 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    v_readlane_b32 s6, v0, 63
+; GFX9-GISEL-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-GISEL-NEXT:    s_add_u32 s5, s32, 0x1fff
 ; GFX9-GISEL-NEXT:    s_lshl_b32 s4, s6, 6
 ; GFX9-GISEL-NEXT:    s_and_b32 s5, s5, 0xffffe000
 ; GFX9-GISEL-NEXT:    s_add_u32 s32, s5, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x1bc
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1bc
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-GISEL-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_mov_b32 s32, s34
-; GFX9-GISEL-NEXT:    s_mov_b32 s34, s10
-; GFX9-GISEL-NEXT:    s_mov_b32 s33, s9
+; GFX9-GISEL-NEXT:    s_mov_b32 s34, s8
+; GFX9-GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-GISEL-NEXT:    buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s7
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_over_aligned:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
-; GFX11-SDAG-NEXT:    s_mov_b32 s5, s33
+; GFX11-SDAG-NEXT:    s_mov_b32 s2, s33
 ; GFX11-SDAG-NEXT:    s_add_i32 s33, s32, 0x7f
-; GFX11-SDAG-NEXT:    s_mov_b32 s6, s34
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_and_b32 s33, s33, 0xffffff80
+; GFX11-SDAG-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-SDAG-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s33
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s33 offset:4
+; GFX11-SDAG-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v2, 0x3ff, v31
+; GFX11-SDAG-NEXT:    s_mov_b32 s3, s34
 ; GFX11-SDAG-NEXT:    s_mov_b32 s34, s32
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX11-SDAG-NEXT:    s_addk_i32 s32, 0x100
-; GFX11-SDAG-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v2, v2, 2, 15
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v2, 0x1ff0, v2
+; GFX11-SDAG-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, v2, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    ds_swizzle_b32 v1, v0 offset:swizzle(BROADCAST,32,15)
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_max_u32_e32 v0, v0, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s1, v0, 31
+; GFX11-SDAG-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-SDAG-NEXT:    s_add_i32 s0, s32, 0xfff
-; GFX11-SDAG-NEXT:    s_mov_b32 s1, 0
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x1bc
 ; GFX11-SDAG-NEXT:    s_and_b32 s0, s0, 0xfffff000
-; GFX11-SDAG-NEXT:    s_and_b32 s33, s33, 0xffffff80
-; GFX11-SDAG-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s3, s2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT:    v_readlane_b32 s4, v0, s3
-; GFX11-SDAG-NEXT:    s_bitset0_b32 s2, s3
-; GFX11-SDAG-NEXT:    s_max_u32 s1, s1, s4
-; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB12_1
-; GFX11-SDAG-NEXT:  ; %bb.2:
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s1, 5, s0
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1bc
-; GFX11-SDAG-NEXT:    s_mov_b32 s33, s5
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s0 dlc
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v2, s1, 5, s0
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v3, s0 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v2
 ; GFX11-SDAG-NEXT:    s_mov_b32 s32, s34
-; GFX11-SDAG-NEXT:    s_mov_b32 s34, s6
+; GFX11-SDAG-NEXT:    s_mov_b32 s34, s3
+; GFX11-SDAG-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-SDAG-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
+; GFX11-SDAG-NEXT:    scratch_load_b32 v0, off, s33
+; GFX11-SDAG-NEXT:    scratch_load_b32 v1, off, s33 offset:4
+; GFX11-SDAG-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s2
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_over_aligned:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
-; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
+; GFX11-GISEL-NEXT:    s_mov_b32 s2, s33
 ; GFX11-GISEL-NEXT:    s_add_i32 s33, s32, 0x7f
-; GFX11-GISEL-NEXT:    s_mov_b32 s5, s34
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
-; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    s_and_b32 s33, s33, 0xffffff80
+; GFX11-GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-GISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s33
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v1, s33 offset:4
+; GFX11-GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0x3ff, v31
+; GFX11-GISEL-NEXT:    s_mov_b32 s3, s34
 ; GFX11-GISEL-NEXT:    s_mov_b32 s34, s32
 ; GFX11-GISEL-NEXT:    s_addk_i32 s32, 0x100
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
-; GFX11-GISEL-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v2, v2, 2, 15
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, -16, v2
+; GFX11-GISEL-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
-; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
-; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB12_1
-; GFX11-GISEL-NEXT:  ; %bb.2:
-; GFX11-GISEL-NEXT:    s_add_u32 s1, s32, 0xfff
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x1bc
-; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
-; GFX11-GISEL-NEXT:    s_and_b32 s1, s1, 0xfffff000
-; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
-; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
-; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, v2, s0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    ds_swizzle_b32 v1, v0 offset:swizzle(BROADCAST,32,15)
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_max_u32_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s1, v0, 31
+; GFX11-GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-GISEL-NEXT:    s_add_u32 s0, s32, 0xfff
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x1bc
+; GFX11-GISEL-NEXT:    s_and_b32 s0, s0, 0xfffff000
+; GFX11-GISEL-NEXT:    s_lshl_b32 s1, s1, 5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s0, s1
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v2, s0 dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-GISEL-NEXT:    s_mov_b32 s32, s34
-; GFX11-GISEL-NEXT:    s_mov_b32 s34, s5
+; GFX11-GISEL-NEXT:    s_mov_b32 s34, s3
+; GFX11-GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-GISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
+; GFX11-GISEL-NEXT:    scratch_load_b32 v0, off, s33
+; GFX11-GISEL-NEXT:    scratch_load_b32 v1, off, s33 offset:4
+; GFX11-GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s2
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
   %alloca = alloca i32, i32 %idx, align 128, addrspace(5)
@@ -1696,124 +2086,180 @@ define void @test_dynamic_stackalloc_device_divergent_under_aligned() {
 ; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_under_aligned:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
-; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
-; GFX9-SDAG-NEXT:    s_mov_b32 s9, s33
-; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
-; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
-; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s7, s33
 ; GFX9-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX9-SDAG-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-SDAG-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v1, 0x3ff, v31
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, v1, 2, 15
 ; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-SDAG-NEXT:  .LBB13_1: ; =>This Inner Loop Header: Depth=1
-; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
-; GFX9-SDAG-NEXT:    v_readlane_b32 s8, v0, s7
-; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s7
-; GFX9-SDAG-NEXT:    s_max_u32 s6, s6, s8
-; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB13_1
-; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v1, 0x1ff0, v1
+; GFX9-SDAG-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, v1, s[4:5]
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v0, v0, v0 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v0, v0, v0 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    v_readlane_b32 s6, v0, 63
+; GFX9-SDAG-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
-; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x29a
-; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, s6, 6, v1
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0x29a
+; GFX9-SDAG-NEXT:    buffer_store_dword v1, off, s[0:3], s4
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_mov_b32 s32, s33
-; GFX9-SDAG-NEXT:    s_mov_b32 s33, s9
+; GFX9-SDAG-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-SDAG-NEXT:    buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-SDAG-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s7
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_under_aligned:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
-; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
-; GFX9-GISEL-NEXT:    s_mov_b32 s9, s33
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
-; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
-; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s8, s33
 ; GFX9-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX9-GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, 0x3ff, v31
 ; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-GISEL-NEXT:  .LBB13_1: ; =>This Inner Loop Header: Depth=1
-; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
-; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
-; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
-; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
-; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB13_1
-; GFX9-GISEL-NEXT:  ; %bb.2:
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
-; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s6, 6
-; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v1, v1, 2, 15
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, s32
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, -16, v1
+; GFX9-GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v0, v0, v0 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v0, v0, v0 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    v_readlane_b32 s7, v0, 63
+; GFX9-GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-GISEL-NEXT:    s_lshl_b32 s4, s7, 6
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s6, s4
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0x29a
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-GISEL-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_mov_b32 s32, s33
-; GFX9-GISEL-NEXT:    s_mov_b32 s33, s9
+; GFX9-GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-GISEL-NEXT:    buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s8
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_under_aligned:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
-; GFX11-SDAG-NEXT:    s_mov_b32 s4, s33
-; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
-; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s2, s33
 ; GFX11-SDAG-NEXT:    s_mov_b32 s33, s32
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-SDAG-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s33
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s33 offset:4
+; GFX11-SDAG-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v2, 0x3ff, v31
 ; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
-; GFX11-SDAG-NEXT:  .LBB13_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v2, v2, 2, 15
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v2, 0x1ff0, v2
+; GFX11-SDAG-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
-; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
-; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB13_1
-; GFX11-SDAG-NEXT:  ; %bb.2:
-; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x29a
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
-; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, v2, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    ds_swizzle_b32 v1, v0 offset:swizzle(BROADCAST,32,15)
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_max_u32_e32 v0, v0, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s1, v0, 31
+; GFX11-SDAG-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x29a
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v2, s1, 5, s0
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v3, s0 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v2
 ; GFX11-SDAG-NEXT:    s_mov_b32 s32, s33
-; GFX11-SDAG-NEXT:    s_mov_b32 s33, s4
+; GFX11-SDAG-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-SDAG-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
+; GFX11-SDAG-NEXT:    scratch_load_b32 v0, off, s33
+; GFX11-SDAG-NEXT:    scratch_load_b32 v1, off, s33 offset:4
+; GFX11-SDAG-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s2
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_under_aligned:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
-; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s3, s33
 ; GFX11-GISEL-NEXT:    s_mov_b32 s33, s32
-; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-GISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s33
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v1, s33 offset:4
+; GFX11-GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, 0x3ff, v31
 ; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, s32
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v2, v2, 2, 15
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
-; GFX11-GISEL-NEXT:  .LBB13_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, -16, v2
+; GFX11-GISEL-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
-; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
-; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB13_1
-; GFX11-GISEL-NEXT:  ; %bb.2:
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
-; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
-; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
-; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, v2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    ds_swizzle_b32 v1, v0 offset:swizzle(BROADCAST,32,15)
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_max_u32_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s2, v0, 31
+; GFX11-GISEL-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x29a
+; GFX11-GISEL-NEXT:    s_lshl_b32 s1, s2, 5
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s0, s1
 ; GFX11-GISEL-NEXT:    s_mov_b32 s32, s33
-; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v2, s0 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-GISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
+; GFX11-GISEL-NEXT:    scratch_load_b32 v0, off, s33
+; GFX11-GISEL-NEXT:    scratch_load_b32 v1, off, s33 offset:4
+; GFX11-GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s3
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
   %alloca = alloca i32, i32 %idx, align 2, addrspace(5)
@@ -1825,74 +2271,94 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
 ; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_multiple_allocas:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    s_mov_b32 s13, s33
+; GFX9-SDAG-NEXT:    s_mov_b32 s10, s33
 ; GFX9-SDAG-NEXT:    s_add_i32 s33, s32, 0xfc0
-; GFX9-SDAG-NEXT:    s_mov_b32 s14, s34
-; GFX9-SDAG-NEXT:    s_mov_b32 s8, 0
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT:    s_and_b32 s33, s33, 0xfffff000
+; GFX9-SDAG-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-SDAG-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:64 ; 4-byte Folded Spill
+; GFX9-SDAG-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-SDAG-NEXT:    s_mov_b32 s11, s34
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT:    s_mov_b32 s34, s32
 ; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x3000
 ; GFX9-SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX9-SDAG-NEXT:    s_cbranch_execz .LBB14_6
+; GFX9-SDAG-NEXT:    s_cbranch_execz .LBB14_2
 ; GFX9-SDAG-NEXT:  ; %bb.1: ; %bb.0
 ; GFX9-SDAG-NEXT:    s_add_i32 s6, s32, 0xfff
 ; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, v1, 2, 15
-; GFX9-SDAG-NEXT:    s_and_b32 s9, s6, 0xfffff000
+; GFX9-SDAG-NEXT:    s_and_b32 s8, s6, 0xfffff000
 ; GFX9-SDAG-NEXT:    v_and_b32_e32 v1, -16, v1
-; GFX9-SDAG-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9-SDAG-NEXT:    s_mov_b32 s10, 0
-; GFX9-SDAG-NEXT:  .LBB14_2: ; =>This Inner Loop Header: Depth=1
-; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s11, s[6:7]
-; GFX9-SDAG-NEXT:    v_readlane_b32 s12, v1, s11
-; GFX9-SDAG-NEXT:    s_bitset0_b64 s[6:7], s11
-; GFX9-SDAG-NEXT:    s_max_u32 s10, s10, s12
-; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB14_2
-; GFX9-SDAG-NEXT:  ; %bb.3:
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, s10, 6, v1
-; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
-; GFX9-SDAG-NEXT:    v_and_b32_e32 v1, 0x3ff, v31
-; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, v1, 2, 15
-; GFX9-SDAG-NEXT:    v_and_b32_e32 v1, 0x1ff0, v1
-; GFX9-SDAG-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9-SDAG-NEXT:    s_mov_b32 s10, 0
-; GFX9-SDAG-NEXT:  .LBB14_4: ; =>This Inner Loop Header: Depth=1
-; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s11, s[6:7]
-; GFX9-SDAG-NEXT:    v_readlane_b32 s12, v1, s11
-; GFX9-SDAG-NEXT:    s_bitset0_b64 s[6:7], s11
-; GFX9-SDAG-NEXT:    s_max_u32 s10, s10, s12
-; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB14_4
-; GFX9-SDAG-NEXT:  ; %bb.5:
+; GFX9-SDAG-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, v1, s[6:7]
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    v_readlane_b32 s9, v2, 63
+; GFX9-SDAG-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s8
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v3, s9, 6, v1
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v3
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v3, 0x3ff, v31
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v3, v3, 2, 15
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v3, 0x1ff0, v3
+; GFX9-SDAG-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, v3, s[6:7]
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    v_readlane_b32 s8, v2, 63
+; GFX9-SDAG-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9-SDAG-NEXT:    s_mov_b32 s6, s32
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, s10, 6, v1
-; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 3
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, s9
-; GFX9-SDAG-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, s6
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v3, s8, 6, v3
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v3
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 3
+; GFX9-SDAG-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX9-SDAG-NEXT:    buffer_store_dword v1, off, s[0:3], s6
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:  .LBB14_6: ; %bb.1
+; GFX9-SDAG-NEXT:  .LBB14_2: ; %bb.1
 ; GFX9-SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
-; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
-; GFX9-SDAG-NEXT:  .LBB14_7: ; =>This Inner Loop Header: Depth=1
-; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s6, s[4:5]
-; GFX9-SDAG-NEXT:    v_readlane_b32 s7, v0, s6
-; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s6
-; GFX9-SDAG-NEXT:    s_max_u32 s8, s8, s7
-; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB14_7
-; GFX9-SDAG-NEXT:  ; %bb.8:
+; GFX9-SDAG-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[4:5]
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    v_readlane_b32 s6, v2, 63
+; GFX9-SDAG-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s8, 6, v0
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
 ; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s33
@@ -1901,255 +2367,339 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
 ; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_mov_b32 s32, s34
-; GFX9-SDAG-NEXT:    s_mov_b32 s34, s14
-; GFX9-SDAG-NEXT:    s_mov_b32 s33, s13
+; GFX9-SDAG-NEXT:    s_mov_b32 s34, s11
+; GFX9-SDAG-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-SDAG-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload
+; GFX9-SDAG-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s10
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_multiple_allocas:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s13, s33
+; GFX9-GISEL-NEXT:    s_mov_b32 s11, s33
 ; GFX9-GISEL-NEXT:    s_add_i32 s33, s32, 0xfc0
-; GFX9-GISEL-NEXT:    s_mov_b32 s14, s34
-; GFX9-GISEL-NEXT:    s_mov_b32 s8, 0
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    s_and_b32 s33, s33, 0xfffff000
+; GFX9-GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-GISEL-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:64 ; 4-byte Folded Spill
+; GFX9-GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-GISEL-NEXT:    s_mov_b32 s12, s34
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    s_mov_b32 s34, s32
 ; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x3000
 ; GFX9-GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX9-GISEL-NEXT:    s_cbranch_execz .LBB14_6
+; GFX9-GISEL-NEXT:    s_cbranch_execz .LBB14_2
 ; GFX9-GISEL-NEXT:  ; %bb.1: ; %bb.0
 ; GFX9-GISEL-NEXT:    v_lshl_add_u32 v1, v1, 2, 15
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0x3ff, v31
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v3, 0x3ff, v31
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, -16, v1
-; GFX9-GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9-GISEL-NEXT:    s_mov_b32 s9, 0
-; GFX9-GISEL-NEXT:  .LBB14_2: ; =>This Inner Loop Header: Depth=1
-; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
-; GFX9-GISEL-NEXT:    v_readlane_b32 s11, v1, s10
-; GFX9-GISEL-NEXT:    s_bitset0_b64 s[6:7], s10
-; GFX9-GISEL-NEXT:    s_max_u32 s9, s9, s11
-; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB14_2
-; GFX9-GISEL-NEXT:  ; %bb.3:
+; GFX9-GISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v1, s[6:7]
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v2, 63
+; GFX9-GISEL-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9-GISEL-NEXT:    s_add_u32 s7, s32, 0xfff
-; GFX9-GISEL-NEXT:    s_lshl_b32 s6, s9, 6
-; GFX9-GISEL-NEXT:    s_and_b32 s9, s7, 0xfffff000
-; GFX9-GISEL-NEXT:    v_lshl_add_u32 v1, v2, 2, 15
-; GFX9-GISEL-NEXT:    s_add_u32 s32, s9, s6
+; GFX9-GISEL-NEXT:    s_lshl_b32 s6, s8, 6
+; GFX9-GISEL-NEXT:    s_and_b32 s8, s7, 0xfffff000
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s8, s6
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v1, v3, 2, 15
+; GFX9-GISEL-NEXT:    s_mov_b32 s9, s32
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, -16, v1
-; GFX9-GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9-GISEL-NEXT:    s_mov_b32 s10, 0
-; GFX9-GISEL-NEXT:  .LBB14_4: ; =>This Inner Loop Header: Depth=1
-; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s11, s[6:7]
-; GFX9-GISEL-NEXT:    v_readlane_b32 s12, v1, s11
-; GFX9-GISEL-NEXT:    s_bitset0_b64 s[6:7], s11
-; GFX9-GISEL-NEXT:    s_max_u32 s10, s10, s12
-; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB14_4
-; GFX9-GISEL-NEXT:  ; %bb.5:
-; GFX9-GISEL-NEXT:    s_mov_b32 s6, s32
+; GFX9-GISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v1, s[6:7]
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    v_readlane_b32 s10, v2, 63
+; GFX9-GISEL-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 3
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s9
-; GFX9-GISEL-NEXT:    s_lshl_b32 s7, s10, 6
-; GFX9-GISEL-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s8
+; GFX9-GISEL-NEXT:    s_lshl_b32 s6, s10, 6
+; GFX9-GISEL-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s6
-; GFX9-GISEL-NEXT:    s_add_u32 s32, s6, s7
-; GFX9-GISEL-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s9
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s9, s6
+; GFX9-GISEL-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:  .LBB14_6: ; %bb.1
+; GFX9-GISEL-NEXT:  .LBB14_2: ; %bb.1
 ; GFX9-GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, s32
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
-; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
-; GFX9-GISEL-NEXT:  .LBB14_7: ; =>This Inner Loop Header: Depth=1
-; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s6, s[4:5]
-; GFX9-GISEL-NEXT:    v_readlane_b32 s7, v0, s6
-; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s6
-; GFX9-GISEL-NEXT:    s_max_u32 s8, s8, s7
-; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB14_7
-; GFX9-GISEL-NEXT:  ; %bb.8:
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
-; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s8, 6
+; GFX9-GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[4:5]
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    v_readlane_b32 s7, v2, 63
+; GFX9-GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-GISEL-NEXT:    s_lshl_b32 s4, s7, 6
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 1
-; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s6, s4
 ; GFX9-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], s33
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 2
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_mov_b32 s32, s34
-; GFX9-GISEL-NEXT:    s_mov_b32 s34, s14
-; GFX9-GISEL-NEXT:    s_mov_b32 s33, s13
+; GFX9-GISEL-NEXT:    s_mov_b32 s34, s12
+; GFX9-GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-GISEL-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload
+; GFX9-GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s11
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_multiple_allocas:
 ; GFX11-SDAG:       ; %bb.0: ; %entry
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    s_mov_b32 s7, s33
+; GFX11-SDAG-NEXT:    s_mov_b32 s5, s33
 ; GFX11-SDAG-NEXT:    s_add_i32 s33, s32, 63
-; GFX11-SDAG-NEXT:    s_mov_b32 s8, s34
-; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
-; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-SDAG-NEXT:    s_and_not1_b32 s33, s33, 63
+; GFX11-SDAG-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-SDAG-NEXT:    s_clause 0x3 ; 16-byte Folded Spill
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v2, s33 offset:64
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v3, s33 offset:68
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v4, s33 offset:72
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v5, s33 offset:76
+; GFX11-SDAG-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-SDAG-NEXT:    s_mov_b32 s6, s34
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX11-SDAG-NEXT:    s_mov_b32 s34, s32
 ; GFX11-SDAG-NEXT:    s_addk_i32 s32, 0xc0
 ; GFX11-SDAG-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX11-SDAG-NEXT:    s_cbranch_execz .LBB14_6
+; GFX11-SDAG-NEXT:    s_cbranch_execz .LBB14_2
 ; GFX11-SDAG-NEXT:  ; %bb.1: ; %bb.0
 ; GFX11-SDAG-NEXT:    v_lshl_add_u32 v1, v1, 2, 15
-; GFX11-SDAG-NEXT:    s_add_i32 s2, s32, 0x7ff
-; GFX11-SDAG-NEXT:    s_mov_b32 s4, exec_lo
-; GFX11-SDAG-NEXT:    s_and_b32 s2, s2, 0xfffff800
-; GFX11-SDAG-NEXT:    s_mov_b32 s3, 0
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v6, 0x3ff, v31
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, -16, v1
-; GFX11-SDAG-NEXT:  .LBB14_2: ; =>This Inner Loop Header: Depth=1
-; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s5, s4
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT:    v_readlane_b32 s6, v1, s5
-; GFX11-SDAG-NEXT:    s_bitset0_b32 s4, s5
-; GFX11-SDAG-NEXT:    s_max_u32 s3, s3, s6
-; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB14_2
-; GFX11-SDAG-NEXT:  ; %bb.3:
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, 0x3ff, v31
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v2, s3, 5, s2
-; GFX11-SDAG-NEXT:    s_mov_b32 s4, exec_lo
-; GFX11-SDAG-NEXT:    s_mov_b32 s3, 0
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v6, v6, 2, 15
+; GFX11-SDAG-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, v1, s1
+; GFX11-SDAG-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, 0x1ff0, v6
+; GFX11-SDAG-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v3, 0, v1, s1
+; GFX11-SDAG-NEXT:    s_add_i32 s3, s32, 0x7ff
+; GFX11-SDAG-NEXT:    s_and_b32 s3, s3, 0xfffff800
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v1, v1, 2, 15
-; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v2
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    ds_swizzle_b32 v4, v2 offset:swizzle(BROADCAST,32,15)
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    ds_swizzle_b32 v5, v3 offset:swizzle(BROADCAST,32,15)
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX11-SDAG-NEXT:    v_max_u32_e32 v2, v2, v4
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s2, v2, 31
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_max_u32_e32 v2, v3, v5
+; GFX11-SDAG-NEXT:    v_readlane_b32 s4, v2, 31
+; GFX11-SDAG-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v1, s2, 5, s3
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v6, 3 :: v_dual_mov_b32 v7, 4
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, 0x1ff0, v1
-; GFX11-SDAG-NEXT:  .LBB14_4: ; =>This Inner Loop Header: Depth=1
-; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s5, s4
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT:    v_readlane_b32 s6, v1, s5
-; GFX11-SDAG-NEXT:    s_bitset0_b32 s4, s5
-; GFX11-SDAG-NEXT:    s_max_u32 s3, s3, s6
-; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB14_4
-; GFX11-SDAG-NEXT:  ; %bb.5:
-; GFX11-SDAG-NEXT:    s_mov_b32 s4, s32
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v1, s3, 5, s4
-; GFX11-SDAG-NEXT:    scratch_store_b32 off, v2, s2 dlc
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v6, s3 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b32 off, v3, s4 dlc
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v7, s1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v1, s4, 5, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
-; GFX11-SDAG-NEXT:  .LBB14_6: ; %bb.1
-; GFX11-SDAG-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-SDAG-NEXT:  .LBB14_2: ; %bb.1
+; GFX11-SDAG-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
-; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
-; GFX11-SDAG-NEXT:  .LBB14_7: ; =>This Inner Loop Header: Depth=1
-; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
-; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
-; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB14_7
-; GFX11-SDAG-NEXT:  ; %bb.8:
-; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    ds_swizzle_b32 v3, v2 offset:swizzle(BROADCAST,32,15)
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_max_u32_e32 v2, v2, v3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s1, v2, 31
+; GFX11-SDAG-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, s32
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v6, 2
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s1, 5, s0
 ; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s33 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b32 off, v2, s1 dlc
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v6, s0 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    s_mov_b32 s33, s7
 ; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
 ; GFX11-SDAG-NEXT:    s_mov_b32 s32, s34
-; GFX11-SDAG-NEXT:    s_mov_b32 s34, s8
+; GFX11-SDAG-NEXT:    s_mov_b32 s34, s6
+; GFX11-SDAG-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-SDAG-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
+; GFX11-SDAG-NEXT:    scratch_load_b32 v2, off, s33 offset:64
+; GFX11-SDAG-NEXT:    scratch_load_b32 v3, off, s33 offset:68
+; GFX11-SDAG-NEXT:    scratch_load_b32 v4, off, s33 offset:72
+; GFX11-SDAG-NEXT:    scratch_load_b32 v5, off, s33 offset:76
+; GFX11-SDAG-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s5
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_multiple_allocas:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s7, s33
+; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
 ; GFX11-GISEL-NEXT:    s_add_i32 s33, s32, 63
-; GFX11-GISEL-NEXT:    s_mov_b32 s8, s34
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    s_and_not1_b32 s33, s33, 63
+; GFX11-GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-GISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Spill
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v2, s33 offset:64
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v3, s33 offset:68
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v4, s33 offset:72
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v5, s33 offset:76
+; GFX11-GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-GISEL-NEXT:    s_mov_b32 s5, s34
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX11-GISEL-NEXT:    s_mov_b32 s34, s32
 ; GFX11-GISEL-NEXT:    s_addk_i32 s32, 0xc0
 ; GFX11-GISEL-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX11-GISEL-NEXT:    s_cbranch_execz .LBB14_6
+; GFX11-GISEL-NEXT:    s_cbranch_execz .LBB14_2
 ; GFX11-GISEL-NEXT:  ; %bb.1: ; %bb.0
-; GFX11-GISEL-NEXT:    v_lshl_add_u32 v2, v1, 2, 15
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0x3ff, v31
-; GFX11-GISEL-NEXT:    s_mov_b32 s3, exec_lo
-; GFX11-GISEL-NEXT:    s_mov_b32 s2, 0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, -16, v2
-; GFX11-GISEL-NEXT:  .LBB14_2: ; =>This Inner Loop Header: Depth=1
-; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s4, s3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    v_readlane_b32 s5, v2, s4
-; GFX11-GISEL-NEXT:    s_bitset0_b32 s3, s4
-; GFX11-GISEL-NEXT:    s_max_u32 s2, s2, s5
-; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB14_2
-; GFX11-GISEL-NEXT:  ; %bb.3:
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, 0x3ff, v31
 ; GFX11-GISEL-NEXT:    v_lshl_add_u32 v1, v1, 2, 15
-; GFX11-GISEL-NEXT:    s_lshl_b32 s5, s2, 5
-; GFX11-GISEL-NEXT:    s_add_u32 s2, s32, 0x7ff
-; GFX11-GISEL-NEXT:    s_mov_b32 s4, exec_lo
-; GFX11-GISEL-NEXT:    s_and_b32 s2, s2, 0xfffff800
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v6, v6, 2, 15
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, -16, v1
-; GFX11-GISEL-NEXT:    s_mov_b32 s3, 0
-; GFX11-GISEL-NEXT:    s_add_u32 s32, s2, s5
-; GFX11-GISEL-NEXT:  .LBB14_4: ; =>This Inner Loop Header: Depth=1
-; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s5, s4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    v_readlane_b32 s6, v1, s5
-; GFX11-GISEL-NEXT:    s_bitset0_b32 s4, s5
-; GFX11-GISEL-NEXT:    s_max_u32 s3, s3, s6
-; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB14_4
-; GFX11-GISEL-NEXT:  ; %bb.5:
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 3 :: v_dual_mov_b32 v2, 4
-; GFX11-GISEL-NEXT:    s_mov_b32 s4, s32
-; GFX11-GISEL-NEXT:    s_lshl_b32 s3, s3, 5
-; GFX11-GISEL-NEXT:    scratch_store_b32 off, v1, s2 dlc
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v6, -16, v6
+; GFX11-GISEL-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v1, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, v6, s1
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    ds_swizzle_b32 v4, v2 offset:swizzle(BROADCAST,32,15)
+; GFX11-GISEL-NEXT:    ds_swizzle_b32 v5, v3 offset:swizzle(BROADCAST,32,15)
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX11-GISEL-NEXT:    v_max_u32_e32 v2, v2, v4
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_max_u32_e32 v3, v3, v5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s2, v2, 31
+; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v3, 31
+; GFX11-GISEL-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-GISEL-NEXT:    s_add_u32 s1, s32, 0x7ff
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 3 :: v_dual_mov_b32 v6, 4
+; GFX11-GISEL-NEXT:    s_and_b32 s1, s1, 0xfffff800
+; GFX11-GISEL-NEXT:    s_lshl_b32 s2, s2, 5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s2
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v1, s1 dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-GISEL-NEXT:    scratch_store_b32 off, v2, s4 dlc
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s2, s3, 5
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v6, s1 dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-GISEL-NEXT:    s_add_u32 s32, s4, s3
-; GFX11-GISEL-NEXT:  .LBB14_6: ; %bb.1
-; GFX11-GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s2
+; GFX11-GISEL-NEXT:  .LBB14_2: ; %bb.1
+; GFX11-GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, s32
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
-; GFX11-GISEL-NEXT:  .LBB14_7: ; =>This Inner Loop Header: Depth=1
-; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
-; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
-; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB14_7
-; GFX11-GISEL-NEXT:  ; %bb.8:
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    ds_swizzle_b32 v3, v2 offset:swizzle(BROADCAST,32,15)
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_max_u32_e32 v2, v2, v3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s2, v2, 31
+; GFX11-GISEL-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
-; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    s_lshl_b32 s1, s2, 5
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s0, s1
 ; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s33 dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-GISEL-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v1, s0 dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
 ; GFX11-GISEL-NEXT:    s_mov_b32 s32, s34
-; GFX11-GISEL-NEXT:    s_mov_b32 s34, s8
-; GFX11-GISEL-NEXT:    s_mov_b32 s33, s7
+; GFX11-GISEL-NEXT:    s_mov_b32 s34, s5
+; GFX11-GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-GISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
+; GFX11-GISEL-NEXT:    scratch_load_b32 v2, off, s33 offset:64
+; GFX11-GISEL-NEXT:    scratch_load_b32 v3, off, s33 offset:68
+; GFX11-GISEL-NEXT:    scratch_load_b32 v4, off, s33 offset:72
+; GFX11-GISEL-NEXT:    scratch_load_b32 v5, off, s33 offset:76
+; GFX11-GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %cond = icmp eq i32 %n, 0
@@ -2173,55 +2723,70 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
 ; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_control_flow:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    s_mov_b32 s12, s33
+; GFX9-SDAG-NEXT:    s_mov_b32 s10, s33
 ; GFX9-SDAG-NEXT:    s_add_i32 s33, s32, 0xfc0
-; GFX9-SDAG-NEXT:    s_mov_b32 s13, s34
-; GFX9-SDAG-NEXT:    s_mov_b32 s8, 0
-; GFX9-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT:    s_and_b32 s33, s33, 0xfffff000
+; GFX9-SDAG-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-SDAG-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-SDAG-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-SDAG-NEXT:    s_mov_b32 s11, s34
+; GFX9-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT:    s_mov_b32 s34, s32
 ; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x2000
 ; GFX9-SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-SDAG-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9-SDAG-NEXT:    s_cbranch_execz .LBB15_4
+; GFX9-SDAG-NEXT:    s_cbranch_execz .LBB15_2
 ; GFX9-SDAG-NEXT:  ; %bb.1: ; %bb.1
 ; GFX9-SDAG-NEXT:    s_add_i32 s6, s32, 0xfff
 ; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v1, 2, 15
-; GFX9-SDAG-NEXT:    s_and_b32 s9, s6, 0xfffff000
+; GFX9-SDAG-NEXT:    s_and_b32 s8, s6, 0xfffff000
 ; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
-; GFX9-SDAG-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9-SDAG-NEXT:  .LBB15_2: ; =>This Inner Loop Header: Depth=1
-; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s10, s[6:7]
-; GFX9-SDAG-NEXT:    v_readlane_b32 s11, v0, s10
-; GFX9-SDAG-NEXT:    s_bitset0_b64 s[6:7], s10
-; GFX9-SDAG-NEXT:    s_max_u32 s8, s8, s11
-; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB15_2
-; GFX9-SDAG-NEXT:  ; %bb.3:
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s9
-; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, s8, 6, v0
+; GFX9-SDAG-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[6:7]
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    v_readlane_b32 s9, v2, 63
+; GFX9-SDAG-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, s9, 6, v0
 ; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 2
 ; GFX9-SDAG-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    ; implicit-def: $vgpr31
-; GFX9-SDAG-NEXT:  .LBB15_4: ; %Flow
+; GFX9-SDAG-NEXT:  .LBB15_2: ; %Flow
 ; GFX9-SDAG-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-SDAG-NEXT:    s_cbranch_execz .LBB15_8
-; GFX9-SDAG-NEXT:  ; %bb.5: ; %bb.0
+; GFX9-SDAG-NEXT:    s_cbranch_execz .LBB15_4
+; GFX9-SDAG-NEXT:  ; %bb.3: ; %bb.0
 ; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
 ; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
-; GFX9-SDAG-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9-SDAG-NEXT:    s_mov_b32 s8, 0
-; GFX9-SDAG-NEXT:  .LBB15_6: ; =>This Inner Loop Header: Depth=1
-; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s9, s[6:7]
-; GFX9-SDAG-NEXT:    v_readlane_b32 s10, v0, s9
-; GFX9-SDAG-NEXT:    s_bitset0_b64 s[6:7], s9
-; GFX9-SDAG-NEXT:    s_max_u32 s8, s8, s10
-; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB15_6
-; GFX9-SDAG-NEXT:  ; %bb.7:
+; GFX9-SDAG-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[6:7]
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    v_readlane_b32 s8, v2, 63
+; GFX9-SDAG-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9-SDAG-NEXT:    s_mov_b32 s6, s32
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s8, 6, v0
@@ -2229,39 +2794,52 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s6
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:  .LBB15_8: ; %bb.2
+; GFX9-SDAG-NEXT:  .LBB15_4: ; %bb.2
 ; GFX9-SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-SDAG-NEXT:    s_mov_b32 s32, s34
-; GFX9-SDAG-NEXT:    s_mov_b32 s34, s13
-; GFX9-SDAG-NEXT:    s_mov_b32 s33, s12
+; GFX9-SDAG-NEXT:    s_mov_b32 s34, s11
+; GFX9-SDAG-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-SDAG-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-SDAG-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s10
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_control_flow:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s11, s33
+; GFX9-GISEL-NEXT:    s_mov_b32 s10, s33
 ; GFX9-GISEL-NEXT:    s_add_i32 s33, s32, 0xfc0
-; GFX9-GISEL-NEXT:    s_mov_b32 s12, s34
-; GFX9-GISEL-NEXT:    s_mov_b32 s8, 0
-; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    s_and_b32 s33, s33, 0xfffff000
+; GFX9-GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-GISEL-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-GISEL-NEXT:    s_mov_b32 s11, s34
+; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    s_mov_b32 s34, s32
 ; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x2000
 ; GFX9-GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-GISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9-GISEL-NEXT:    s_cbranch_execz .LBB15_4
+; GFX9-GISEL-NEXT:    s_cbranch_execz .LBB15_2
 ; GFX9-GISEL-NEXT:  ; %bb.1: ; %bb.1
 ; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v1, 2, 15
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
-; GFX9-GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9-GISEL-NEXT:  .LBB15_2: ; =>This Inner Loop Header: Depth=1
-; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s9, s[6:7]
-; GFX9-GISEL-NEXT:    v_readlane_b32 s10, v0, s9
-; GFX9-GISEL-NEXT:    s_bitset0_b64 s[6:7], s9
-; GFX9-GISEL-NEXT:    s_max_u32 s8, s8, s10
-; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB15_2
-; GFX9-GISEL-NEXT:  ; %bb.3:
+; GFX9-GISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[6:7]
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v2, 63
+; GFX9-GISEL-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9-GISEL-NEXT:    s_add_u32 s7, s32, 0xfff
 ; GFX9-GISEL-NEXT:    s_and_b32 s7, s7, 0xfffff000
 ; GFX9-GISEL-NEXT:    s_lshl_b32 s6, s8, 6
@@ -2271,174 +2849,222 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
 ; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    ; implicit-def: $vgpr31
-; GFX9-GISEL-NEXT:  .LBB15_4: ; %Flow
+; GFX9-GISEL-NEXT:  .LBB15_2: ; %Flow
 ; GFX9-GISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-GISEL-NEXT:    s_cbranch_execz .LBB15_8
-; GFX9-GISEL-NEXT:  ; %bb.5: ; %bb.0
+; GFX9-GISEL-NEXT:    s_cbranch_execz .LBB15_4
+; GFX9-GISEL-NEXT:  ; %bb.3: ; %bb.0
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
 ; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_mov_b32 s8, s32
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
-; GFX9-GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9-GISEL-NEXT:    s_mov_b32 s8, 0
-; GFX9-GISEL-NEXT:  .LBB15_6: ; =>This Inner Loop Header: Depth=1
-; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s9, s[6:7]
-; GFX9-GISEL-NEXT:    v_readlane_b32 s10, v0, s9
-; GFX9-GISEL-NEXT:    s_bitset0_b64 s[6:7], s9
-; GFX9-GISEL-NEXT:    s_max_u32 s8, s8, s10
-; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB15_6
-; GFX9-GISEL-NEXT:  ; %bb.7:
-; GFX9-GISEL-NEXT:    s_mov_b32 s6, s32
-; GFX9-GISEL-NEXT:    s_lshl_b32 s7, s8, 6
+; GFX9-GISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[6:7]
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    v_readlane_b32 s9, v2, 63
+; GFX9-GISEL-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-GISEL-NEXT:    s_lshl_b32 s6, s9, 6
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 1
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-GISEL-NEXT:    s_add_u32 s32, s6, s7
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s8
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s8, s6
 ; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:  .LBB15_8: ; %bb.2
+; GFX9-GISEL-NEXT:  .LBB15_4: ; %bb.2
 ; GFX9-GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-GISEL-NEXT:    s_mov_b32 s32, s34
-; GFX9-GISEL-NEXT:    s_mov_b32 s34, s12
-; GFX9-GISEL-NEXT:    s_mov_b32 s33, s11
+; GFX9-GISEL-NEXT:    s_mov_b32 s34, s11
+; GFX9-GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-GISEL-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s10
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_control_flow:
 ; GFX11-SDAG:       ; %bb.0: ; %entry
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    s_mov_b32 s6, s33
+; GFX11-SDAG-NEXT:    s_mov_b32 s3, s33
 ; GFX11-SDAG-NEXT:    s_add_i32 s33, s32, 63
-; GFX11-SDAG-NEXT:    s_mov_b32 s7, s34
-; GFX11-SDAG-NEXT:    s_mov_b32 s1, 0
-; GFX11-SDAG-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-SDAG-NEXT:    s_and_not1_b32 s33, s33, 63
+; GFX11-SDAG-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-SDAG-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v2, s33
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v3, s33 offset:4
+; GFX11-SDAG-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-SDAG-NEXT:    s_mov_b32 s4, s34
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX11-SDAG-NEXT:    s_mov_b32 s34, s32
 ; GFX11-SDAG-NEXT:    s_addk_i32 s32, 0x80
 ; GFX11-SDAG-NEXT:    v_cmpx_ne_u32_e32 0, v0
 ; GFX11-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-SDAG-NEXT:    s_cbranch_execz .LBB15_4
+; GFX11-SDAG-NEXT:    s_cbranch_execz .LBB15_2
 ; GFX11-SDAG-NEXT:  ; %bb.1: ; %bb.1
 ; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v1, 2, 15
-; GFX11-SDAG-NEXT:    s_add_i32 s2, s32, 0x7ff
-; GFX11-SDAG-NEXT:    s_mov_b32 s3, exec_lo
-; GFX11-SDAG-NEXT:    s_and_b32 s2, s2, 0xfffff800
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
-; GFX11-SDAG-NEXT:  .LBB15_2: ; =>This Inner Loop Header: Depth=1
-; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s4, s3
+; GFX11-SDAG-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT:    v_readlane_b32 s5, v0, s4
-; GFX11-SDAG-NEXT:    s_bitset0_b32 s3, s4
-; GFX11-SDAG-NEXT:    s_max_u32 s1, s1, s5
-; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB15_2
-; GFX11-SDAG-NEXT:  ; %bb.3:
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s1, 5, s2
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    ds_swizzle_b32 v3, v2 offset:swizzle(BROADCAST,32,15)
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_max_u32_e32 v2, v2, v3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s2, v2, 31
+; GFX11-SDAG-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-SDAG-NEXT:    s_add_i32 s1, s32, 0x7ff
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 2
+; GFX11-SDAG-NEXT:    s_and_b32 s1, s1, 0xfffff800
 ; GFX11-SDAG-NEXT:    ; implicit-def: $vgpr31
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s2 dlc
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s2, 5, s1
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:  .LBB15_4: ; %Flow
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:  .LBB15_2: ; %Flow
 ; GFX11-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-SDAG-NEXT:    s_cbranch_execz .LBB15_8
-; GFX11-SDAG-NEXT:  ; %bb.5: ; %bb.0
+; GFX11-SDAG-NEXT:    s_cbranch_execz .LBB15_4
+; GFX11-SDAG-NEXT:  ; %bb.3: ; %bb.0
 ; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
-; GFX11-SDAG-NEXT:    s_mov_b32 s2, exec_lo
-; GFX11-SDAG-NEXT:    s_mov_b32 s1, 0
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
-; GFX11-SDAG-NEXT:  .LBB15_6: ; =>This Inner Loop Header: Depth=1
-; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX11-SDAG-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT:    v_readlane_b32 s4, v0, s3
-; GFX11-SDAG-NEXT:    s_bitset0_b32 s2, s3
-; GFX11-SDAG-NEXT:    s_max_u32 s1, s1, s4
-; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB15_6
-; GFX11-SDAG-NEXT:  ; %bb.7:
-; GFX11-SDAG-NEXT:    s_mov_b32 s2, s32
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    ds_swizzle_b32 v3, v2 offset:swizzle(BROADCAST,32,15)
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_max_u32_e32 v2, v2, v3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s2, v2, 31
+; GFX11-SDAG-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 1
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s1, 5, s2
-; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s2 dlc
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s2, 5, s1
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT:  .LBB15_8: ; %bb.2
+; GFX11-SDAG-NEXT:  .LBB15_4: ; %bb.2
 ; GFX11-SDAG-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-SDAG-NEXT:    s_mov_b32 s32, s34
-; GFX11-SDAG-NEXT:    s_mov_b32 s34, s7
-; GFX11-SDAG-NEXT:    s_mov_b32 s33, s6
+; GFX11-SDAG-NEXT:    s_mov_b32 s34, s4
+; GFX11-SDAG-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-SDAG-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
+; GFX11-SDAG-NEXT:    scratch_load_b32 v2, off, s33
+; GFX11-SDAG-NEXT:    scratch_load_b32 v3, off, s33 offset:4
+; GFX11-SDAG-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s3
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_control_flow:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s5, s33
+; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
 ; GFX11-GISEL-NEXT:    s_add_i32 s33, s32, 63
-; GFX11-GISEL-NEXT:    s_mov_b32 s6, s34
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    s_and_not1_b32 s33, s33, 63
+; GFX11-GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-GISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v2, s33
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v3, s33 offset:4
+; GFX11-GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-GISEL-NEXT:    s_mov_b32 s5, s34
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX11-GISEL-NEXT:    s_mov_b32 s34, s32
 ; GFX11-GISEL-NEXT:    s_addk_i32 s32, 0x80
 ; GFX11-GISEL-NEXT:    v_cmpx_ne_u32_e32 0, v0
 ; GFX11-GISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-GISEL-NEXT:    s_cbranch_execz .LBB15_4
+; GFX11-GISEL-NEXT:    s_cbranch_execz .LBB15_2
 ; GFX11-GISEL-NEXT:  ; %bb.1: ; %bb.1
 ; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v1, 2, 15
-; GFX11-GISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
-; GFX11-GISEL-NEXT:  .LBB15_2: ; =>This Inner Loop Header: Depth=1
-; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX11-GISEL-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    v_readlane_b32 s4, v0, s3
-; GFX11-GISEL-NEXT:    s_bitset0_b32 s2, s3
-; GFX11-GISEL-NEXT:    s_max_u32 s1, s1, s4
-; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB15_2
-; GFX11-GISEL-NEXT:  ; %bb.3:
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    ds_swizzle_b32 v3, v2 offset:swizzle(BROADCAST,32,15)
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_max_u32_e32 v2, v2, v3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s2, v2, 31
+; GFX11-GISEL-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 2
-; GFX11-GISEL-NEXT:    s_add_u32 s2, s32, 0x7ff
-; GFX11-GISEL-NEXT:    s_lshl_b32 s1, s1, 5
-; GFX11-GISEL-NEXT:    s_and_b32 s2, s2, 0xfffff800
+; GFX11-GISEL-NEXT:    s_add_u32 s1, s32, 0x7ff
+; GFX11-GISEL-NEXT:    s_lshl_b32 s2, s2, 5
+; GFX11-GISEL-NEXT:    s_and_b32 s1, s1, 0xfffff800
 ; GFX11-GISEL-NEXT:    ; implicit-def: $vgpr31
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    s_add_u32 s32, s2, s1
-; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s2 dlc
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s2
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-GISEL-NEXT:  .LBB15_4: ; %Flow
+; GFX11-GISEL-NEXT:  .LBB15_2: ; %Flow
 ; GFX11-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-GISEL-NEXT:    s_cbranch_execz .LBB15_8
-; GFX11-GISEL-NEXT:  ; %bb.5: ; %bb.0
+; GFX11-GISEL-NEXT:    s_cbranch_execz .LBB15_4
+; GFX11-GISEL-NEXT:  ; %bb.3: ; %bb.0
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
-; GFX11-GISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
-; GFX11-GISEL-NEXT:  .LBB15_6: ; =>This Inner Loop Header: Depth=1
-; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX11-GISEL-NEXT:    s_or_saveexec_b32 s2, -1
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    v_readlane_b32 s4, v0, s3
-; GFX11-GISEL-NEXT:    s_bitset0_b32 s2, s3
-; GFX11-GISEL-NEXT:    s_max_u32 s1, s1, s4
-; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB15_6
-; GFX11-GISEL-NEXT:  ; %bb.7:
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    ds_swizzle_b32 v3, v2 offset:swizzle(BROADCAST,32,15)
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_max_u32_e32 v2, v2, v3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v2, 31
+; GFX11-GISEL-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 1
-; GFX11-GISEL-NEXT:    s_mov_b32 s2, s32
-; GFX11-GISEL-NEXT:    s_lshl_b32 s1, s1, 5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    s_add_u32 s32, s2, s1
-; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s2 dlc
+; GFX11-GISEL-NEXT:    s_lshl_b32 s2, s3, 5
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s2
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-GISEL-NEXT:  .LBB15_8: ; %bb.2
+; GFX11-GISEL-NEXT:  .LBB15_4: ; %bb.2
 ; GFX11-GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-GISEL-NEXT:    s_mov_b32 s32, s34
-; GFX11-GISEL-NEXT:    s_mov_b32 s34, s6
-; GFX11-GISEL-NEXT:    s_mov_b32 s33, s5
+; GFX11-GISEL-NEXT:    s_mov_b32 s34, s5
+; GFX11-GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-GISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
+; GFX11-GISEL-NEXT:    scratch_load_b32 v2, off, s33
+; GFX11-GISEL-NEXT:    scratch_load_b32 v3, off, s33 offset:4
+; GFX11-GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %cond = icmp eq i32 %n, 0
@@ -2460,22 +3086,31 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16
 ; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i16:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_mov_b32 s7, s33
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX9-SDAG-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-SDAG-NEXT:    buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-SDAG-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
-; GFX9-SDAG-NEXT:    s_mov_b32 s9, s33
-; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x7fff0, v0
-; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
-; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
-; GFX9-SDAG-NEXT:    s_mov_b32 s33, s32
 ; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-SDAG-NEXT:  .LBB16_1: ; =>This Inner Loop Header: Depth=1
-; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
-; GFX9-SDAG-NEXT:    v_readlane_b32 s8, v0, s7
-; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s7
-; GFX9-SDAG-NEXT:    s_max_u32 s6, s6, s8
-; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB16_1
-; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x7fff0, v0
+; GFX9-SDAG-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[4:5]
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    v_readlane_b32 s6, v1, 63
+; GFX9-SDAG-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
@@ -2484,101 +3119,148 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16
 ; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_mov_b32 s32, s33
-; GFX9-SDAG-NEXT:    s_mov_b32 s33, s9
+; GFX9-SDAG-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-SDAG-NEXT:    buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-SDAG-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s7
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i16:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_mov_b32 s8, s33
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX9-GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-GISEL-NEXT:    buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-GISEL-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
-; GFX9-GISEL-NEXT:    s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, s32
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
-; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
-; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s33, s32
-; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-GISEL-NEXT:  .LBB16_1: ; =>This Inner Loop Header: Depth=1
-; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
-; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
-; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
-; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
-; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB16_1
-; GFX9-GISEL-NEXT:  ; %bb.2:
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
-; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s6, 6
-; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[4:5]
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    v_readlane_b32 s7, v1, 63
+; GFX9-GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-GISEL-NEXT:    s_lshl_b32 s4, s7, 6
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s6, s4
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_mov_b32 s32, s33
-; GFX9-GISEL-NEXT:    s_mov_b32 s33, s9
+; GFX9-GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-GISEL-NEXT:    buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s8
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i16:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b16_e32 v1.h, 0
-; GFX11-SDAG-NEXT:    v_mov_b16_e32 v1.l, v0.l
-; GFX11-SDAG-NEXT:    s_mov_b32 s4, s33
-; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
-; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s2, s33
 ; GFX11-SDAG-NEXT:    s_mov_b32 s33, s32
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v1, 2, 15
+; GFX11-SDAG-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-SDAG-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s33
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v2, s33 offset:4
+; GFX11-SDAG-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-SDAG-NEXT:    v_mov_b16_e32 v3.h, 0
+; GFX11-SDAG-NEXT:    v_mov_b16_e32 v3.l, v0.l
 ; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v3, 2, 15
 ; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x7fff0, v0
-; GFX11-SDAG-NEXT:  .LBB16_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
-; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
-; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB16_1
-; GFX11-SDAG-NEXT:  ; %bb.2:
-; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x29a
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
-; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_max_u32_e32 v1, v1, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s1, v1, 31
+; GFX11-SDAG-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x29a
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s1, 5, s0
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v3, s0 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
 ; GFX11-SDAG-NEXT:    s_mov_b32 s32, s33
-; GFX11-SDAG-NEXT:    s_mov_b32 s33, s4
+; GFX11-SDAG-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-SDAG-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
+; GFX11-SDAG-NEXT:    scratch_load_b32 v1, off, s33
+; GFX11-SDAG-NEXT:    scratch_load_b32 v2, off, s33 offset:4
+; GFX11-SDAG-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s2
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i16:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s3, s33
 ; GFX11-GISEL-NEXT:    s_mov_b32 s33, s32
-; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-GISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v1, s33
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v2, s33 offset:4
+; GFX11-GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, s32
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
-; GFX11-GISEL-NEXT:  .LBB16_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
-; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
-; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB16_1
-; GFX11-GISEL-NEXT:  ; %bb.2:
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_max_u32_e32 v1, v1, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s2, v1, 31
+; GFX11-GISEL-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
-; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
-; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
-; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_lshl_b32 s1, s2, 5
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s0, s1
 ; GFX11-GISEL-NEXT:    s_mov_b32 s32, s33
-; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s0 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-GISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
+; GFX11-GISEL-NEXT:    scratch_load_b32 v1, off, s33
+; GFX11-GISEL-NEXT:    scratch_load_b32 v2, off, s33 offset:4
+; GFX11-GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s3
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, i16 %n, align 2, addrspace(5)
   store volatile i32 666, ptr addrspace(5) %alloca
@@ -2589,21 +3271,30 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64
 ; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i64:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
-; GFX9-SDAG-NEXT:    s_mov_b32 s9, s33
-; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
-; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
-; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s7, s33
 ; GFX9-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX9-SDAG-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-SDAG-NEXT:    buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-SDAG-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-SDAG-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
-; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
-; GFX9-SDAG-NEXT:    v_readlane_b32 s8, v0, s7
-; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s7
-; GFX9-SDAG-NEXT:    s_max_u32 s6, s6, s8
-; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB17_1
-; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-SDAG-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[4:5]
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    s_nop 1
+; GFX9-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9-SDAG-NEXT:    v_readlane_b32 s6, v1, 63
+; GFX9-SDAG-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
@@ -2612,95 +3303,143 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64
 ; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_mov_b32 s32, s33
-; GFX9-SDAG-NEXT:    s_mov_b32 s33, s9
+; GFX9-SDAG-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-SDAG-NEXT:    buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-SDAG-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s7
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
-; GFX9-GISEL-NEXT:    s_mov_b32 s9, s33
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
-; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
-; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s8, s33
 ; GFX9-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX9-GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-GISEL-NEXT:    buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-GISEL-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-GISEL-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
-; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
-; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
-; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
-; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
-; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB17_1
-; GFX9-GISEL-NEXT:  ; %bb.2:
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
-; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s6, 6
-; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, s32
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[4:5]
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    s_nop 1
+; GFX9-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9-GISEL-NEXT:    v_readlane_b32 s7, v1, 63
+; GFX9-GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-GISEL-NEXT:    s_lshl_b32 s4, s7, 6
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s6, s4
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_mov_b32 s32, s33
-; GFX9-GISEL-NEXT:    s_mov_b32 s33, s9
+; GFX9-GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-GISEL-NEXT:    buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s8
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i64:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
-; GFX11-SDAG-NEXT:    s_mov_b32 s4, s33
-; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
-; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s2, s33
 ; GFX11-SDAG-NEXT:    s_mov_b32 s33, s32
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-SDAG-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-SDAG-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s33
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v2, s33 offset:4
+; GFX11-SDAG-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-SDAG-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-SDAG-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
-; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
-; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB17_1
-; GFX11-SDAG-NEXT:  ; %bb.2:
-; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x29a
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
-; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX11-SDAG-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_max_u32_e32 v1, v1, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s1, v1, 31
+; GFX11-SDAG-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x29a
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s1, 5, s0
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v3, s0 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
 ; GFX11-SDAG-NEXT:    s_mov_b32 s32, s33
-; GFX11-SDAG-NEXT:    s_mov_b32 s33, s4
+; GFX11-SDAG-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-SDAG-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
+; GFX11-SDAG-NEXT:    scratch_load_b32 v1, off, s33
+; GFX11-SDAG-NEXT:    scratch_load_b32 v2, off, s33 offset:4
+; GFX11-SDAG-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s2
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
-; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s3, s33
 ; GFX11-GISEL-NEXT:    s_mov_b32 s33, s32
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-GISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v1, s33
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v2, s33 offset:4
+; GFX11-GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
 ; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-GISEL-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, s32
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
-; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
-; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB17_1
-; GFX11-GISEL-NEXT:  ; %bb.2:
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX11-GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_max_u32_e32 v1, v1, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s2, v1, 31
+; GFX11-GISEL-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
-; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
-; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
-; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_lshl_b32 s1, s2, 5
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s0, s1
 ; GFX11-GISEL-NEXT:    s_mov_b32 s32, s33
-; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s0 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-GISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
+; GFX11-GISEL-NEXT:    scratch_load_b32 v1, off, s33
+; GFX11-GISEL-NEXT:    scratch_load_b32 v2, off, s33 offset:4
+; GFX11-GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s3
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, i64 %n, align 2, addrspace(5)
   store volatile i32 666, ptr addrspace(5) %alloca
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
index e58bf6280a1f2..a83881a71e6cb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
@@ -463,6 +463,911 @@ entry:
   ret void
 }
 
+define amdgpu_kernel void @uniform_value_dpp(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: uniform_value_dpp:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value_dpp:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mul_i32 s2, s6, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value_dpp:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value_dpp:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mul_i32 s2, s6, s2
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: uniform_value_dpp:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_clause 0x1
+; GFX1064DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: uniform_value_dpp:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_clause 0x1
+; GFX1064GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mul_i32 s2, s6, s2
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: uniform_value_dpp:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_clause 0x1
+; GFX1032DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s2, s2, s3
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: uniform_value_dpp:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_clause 0x1
+; GFX1032GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mul_i32 s2, s2, s3
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value_dpp:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_clause 0x1
+; GFX1164DAGISEL-NEXT:    s_load_b32 s6, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value_dpp:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_clause 0x1
+; GFX1164GISEL-NEXT:    s_load_b32 s6, s[4:5], 0x2c
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mul_i32 s2, s6, s2
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value_dpp:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_clause 0x1
+; GFX1132DAGISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s2, s2, s3
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value_dpp:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_clause 0x1
+; GFX1132GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mul_i32 s2, s2, s3
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+;
+; GFX12DAGISEL-LABEL: uniform_value_dpp:
+; GFX12DAGISEL:       ; %bb.0: ; %entry
+; GFX12DAGISEL-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12DAGISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX12DAGISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12DAGISEL-NEXT:    s_mul_i32 s2, s2, s3
+; GFX12DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12DAGISEL-NEXT:    s_endpgm
+entry:
+  %result = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %in, i32 2)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @divergent_value_dpp(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: divergent_value_dpp:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX8DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[2:3]
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8DAGISEL-NEXT:    flat_store_dword v[2:3], v0
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_value_dpp:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX8GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[2:3]
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dword v[2:3], v0
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_value_dpp:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[2:3]
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_value_dpp:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[2:3]
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_value_dpp:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1064DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_add_nc_i32 v2, v1, v2
+; GFX1064DAGISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1064DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_add_nc_i32 v1, v1, v2
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1064DAGISEL-NEXT:    global_store_dword v0, v4, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_value_dpp:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1064GISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1064GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1064GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1064GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1064GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_add_nc_i32 v2, v1, v2
+; GFX1064GISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_add_nc_i32 v1, v1, v2
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1064GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    global_store_dword v4, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_value_dpp:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1032DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    v_add_nc_i32 v1, v1, v2
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1032DAGISEL-NEXT:    global_store_dword v0, v3, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_value_dpp:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1032GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    v_add_nc_i32 v1, v1, v2
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032GISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_value_dpp:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1164DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_add_nc_i32 v2, v1, v2
+; GFX1164DAGISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_add_nc_i32 v1, v1, v2
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v4, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_value_dpp:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1164GISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_add_nc_i32 v2, v1, v2
+; GFX1164GISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_add_nc_i32 v1, v1, v2
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b32 v4, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_value_dpp:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_add_nc_i32 v1, v1, v2
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s3
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v3, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_value_dpp:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_add_nc_i32 v1, v1, v2
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v3, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+;
+; GFX12DAGISEL-LABEL: divergent_value_dpp:
+; GFX12DAGISEL:       ; %bb.0: ; %entry
+; GFX12DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX12DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX12DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX12DAGISEL-NEXT:    s_wait_dscnt 0x0
+; GFX12DAGISEL-NEXT:    v_add_nc_i32 v1, v1, v2
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12DAGISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX12DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX12DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s3
+; GFX12DAGISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12DAGISEL-NEXT:    global_store_b32 v0, v3, s[0:1]
+; GFX12DAGISEL-NEXT:    s_endpgm
+entry:
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %result = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %id.x, i32 2)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @default_stratergy(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: default_stratergy:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX8DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[2:3]
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8DAGISEL-NEXT:    flat_store_dword v[2:3], v0
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: default_stratergy:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX8GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[2:3]
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dword v[2:3], v0
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: default_stratergy:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[2:3]
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: default_stratergy:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[2:3]
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: default_stratergy:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1064DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_add_nc_i32 v2, v1, v2
+; GFX1064DAGISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1064DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_add_nc_i32 v1, v1, v2
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1064DAGISEL-NEXT:    global_store_dword v0, v4, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: default_stratergy:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1064GISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1064GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1064GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1064GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1064GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_add_nc_i32 v2, v1, v2
+; GFX1064GISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_add_nc_i32 v1, v1, v2
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1064GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    global_store_dword v4, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: default_stratergy:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1032DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    v_add_nc_i32 v1, v1, v2
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1032DAGISEL-NEXT:    global_store_dword v0, v3, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: default_stratergy:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1032GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    v_add_nc_i32 v1, v1, v2
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032GISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: default_stratergy:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1164DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_add_nc_i32 v2, v1, v2
+; GFX1164DAGISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_add_nc_i32 v1, v1, v2
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v4, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: default_stratergy:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1164GISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_add_nc_i32 v2, v1, v2
+; GFX1164GISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_add_nc_i32 v1, v1, v2
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b32 v4, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: default_stratergy:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_add_nc_i32 v1, v1, v2
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s3
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v3, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: default_stratergy:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_add_nc_i32 v1, v1, v2
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v3, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+;
+; GFX12DAGISEL-LABEL: default_stratergy:
+; GFX12DAGISEL:       ; %bb.0: ; %entry
+; GFX12DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX12DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX12DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX12DAGISEL-NEXT:    s_wait_dscnt 0x0
+; GFX12DAGISEL-NEXT:    v_add_nc_i32 v1, v1, v2
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12DAGISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX12DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX12DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s3
+; GFX12DAGISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12DAGISEL-NEXT:    global_store_b32 v0, v3, s[0:1]
+; GFX12DAGISEL-NEXT:    s_endpgm
+entry:
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %result = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %id.x, i32 0)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
 define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8DAGISEL-LABEL: divergent_cfg:
 ; GFX8DAGISEL:       ; %bb.0: ; %entry
@@ -470,7 +1375,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8DAGISEL-NEXT:    ; implicit-def: $sgpr2
 ; GFX8DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX8DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -478,24 +1383,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8DAGISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
-; GFX8DAGISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX8DAGISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX8DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX8DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX8DAGISEL-NEXT:    s_add_i32 s6, s6, s8
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX8DAGISEL-NEXT:  ; %bb.5:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX8DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX8DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX8DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -510,7 +1415,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX8GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -518,20 +1423,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mul_i32 s6, s6, s2
-; GFX8GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX8GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX8GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX8GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX8GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX8GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX8GISEL-NEXT:    s_add_i32 s6, s6, s8
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX8GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX8GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s6
@@ -547,7 +1452,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9DAGISEL-NEXT:    ; implicit-def: $sgpr2
 ; GFX9DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX9DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -555,24 +1460,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9DAGISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
-; GFX9DAGISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX9DAGISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX9DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX9DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX9DAGISEL-NEXT:    s_add_i32 s6, s6, s8
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX9DAGISEL-NEXT:  ; %bb.5:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX9DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX9DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -586,7 +1491,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX9GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -594,20 +1499,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mul_i32 s6, s6, s2
-; GFX9GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX9GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX9GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX9GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX9GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX9GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX9GISEL-NEXT:    s_add_i32 s6, s6, s8
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX9GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX9GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -622,7 +1527,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064DAGISEL-NEXT:    ; implicit-def: $sgpr2
 ; GFX1064DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1064DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -630,24 +1535,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
-; GFX1064DAGISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1064DAGISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1064DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX1064DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1064DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1064DAGISEL-NEXT:    s_add_i32 s6, s6, s8
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX1064DAGISEL-NEXT:  ; %bb.5:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1064DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX1064DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX1064DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -661,7 +1566,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -669,20 +1574,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mul_i32 s6, s6, s2
-; GFX1064GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1064GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1064GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1064GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1064GISEL-NEXT:    s_add_i32 s6, s6, s8
 ; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX1064GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1064GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -697,7 +1602,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032DAGISEL-NEXT:    ; implicit-def: $sgpr1
 ; GFX1032DAGISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032DAGISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1032DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032DAGISEL-NEXT:    s_load_dword s1, s[4:5], 0x2c
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
@@ -705,24 +1610,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
 ; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    s_mul_i32 s1, s1, s2
-; GFX1032DAGISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1032DAGISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s0, s0
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX1032DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s1, 0
-; GFX1032DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032DAGISEL-NEXT:    s_add_i32 s1, s1, s6
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX1032DAGISEL-NEXT:  ; %bb.5:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1032DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX1032DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX1032DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -736,7 +1641,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr0
 ; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032GISEL-NEXT:    s_load_dword s0, s[4:5], 0x2c
 ; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
@@ -744,20 +1649,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mul_i32 s0, s0, s2
-; GFX1032GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1032GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s1, s1
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1032GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX1032GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032GISEL-NEXT:    s_add_i32 s0, s0, s6
 ; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX1032GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1032GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1032GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -774,7 +1679,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
 ; GFX1164DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1164DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164DAGISEL-NEXT:    s_load_b32 s6, s[4:5], 0x2c
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -783,25 +1688,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
-; GFX1164DAGISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1164DAGISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX1164DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1164DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1164DAGISEL-NEXT:    s_add_i32 s6, s6, s8
 ; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX1164DAGISEL-NEXT:  ; %bb.5:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1164DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX1164DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX1164DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -817,7 +1722,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164GISEL-NEXT:    s_load_b32 s6, s[4:5], 0x2c
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -826,21 +1731,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mul_i32 s6, s6, s2
-; GFX1164GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1164GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[0:1], s[0:1]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1164GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1164GISEL-NEXT:    s_add_i32 s6, s6, s8
 ; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX1164GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1164GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -857,7 +1762,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
 ; GFX1132DAGISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1132DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132DAGISEL-NEXT:    s_load_b32 s1, s[4:5], 0x2c
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
@@ -866,25 +1771,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132DAGISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    s_mul_i32 s1, s1, s2
-; GFX1132DAGISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1132DAGISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s0, s0
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1132DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX1132DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s1, 0
-; GFX1132DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132DAGISEL-NEXT:    s_add_i32 s1, s1, s6
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX1132DAGISEL-NEXT:  ; %bb.5:
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1132DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX1132DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX1132DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -900,7 +1805,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1132GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x2c
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
@@ -909,21 +1814,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132GISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mul_i32 s0, s0, s2
-; GFX1132GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1132GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s1, s1
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX1132GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132GISEL-NEXT:    s_add_i32 s0, s0, s6
 ; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX1132GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1132GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1132GISEL-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
@@ -939,7 +1844,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
 ; GFX12DAGISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX12DAGISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX12DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX12DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX12DAGISEL-NEXT:    s_load_b32 s1, s[4:5], 0x2c
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
@@ -948,15 +1853,15 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX12DAGISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
 ; GFX12DAGISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12DAGISEL-NEXT:    s_mul_i32 s1, s1, s2
-; GFX12DAGISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX12DAGISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX12DAGISEL-NEXT:    s_or_saveexec_b32 s0, s0
 ; GFX12DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX12DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX12DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s1, 0
-; GFX12DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX12DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
@@ -964,10 +1869,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX12DAGISEL-NEXT:    s_add_co_i32 s1, s1, s6
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX12DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX12DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX12DAGISEL-NEXT:  ; %bb.5:
 ; GFX12DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX12DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX12DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX12DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -1215,7 +2120,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s8, s[6:7]
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s9, v2, s8
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s10, v3, s8
@@ -1223,7 +2128,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s8
 ; GFX8DAGISEL-NEXT:    s_addc_u32 s5, s5, s10
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX8DAGISEL-NEXT:  ; %bb.2:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1236,7 +2141,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s8, s[6:7]
 ; GFX8GISEL-NEXT:    v_readlane_b32 s9, v2, s8
 ; GFX8GISEL-NEXT:    v_readlane_b32 s10, v3, s8
@@ -1244,7 +2149,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8GISEL-NEXT:    s_bitset0_b64 s[6:7], s8
 ; GFX8GISEL-NEXT:    s_addc_u32 s5, s5, s10
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX8GISEL-NEXT:  ; %bb.2:
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1257,7 +2162,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s8, s[6:7]
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s9, v2, s8
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s10, v3, s8
@@ -1265,7 +2170,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s8
 ; GFX9DAGISEL-NEXT:    s_addc_u32 s5, s5, s10
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX9DAGISEL-NEXT:  ; %bb.2:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1278,7 +2183,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s8, s[6:7]
 ; GFX9GISEL-NEXT:    v_readlane_b32 s9, v2, s8
 ; GFX9GISEL-NEXT:    v_readlane_b32 s10, v3, s8
@@ -1286,7 +2191,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9GISEL-NEXT:    s_bitset0_b64 s[6:7], s8
 ; GFX9GISEL-NEXT:    s_addc_u32 s5, s5, s10
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX9GISEL-NEXT:  ; %bb.2:
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1299,7 +2204,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s8, s[6:7]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s9, v2, s8
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s10, v3, s8
@@ -1307,7 +2212,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064DAGISEL-NEXT:    s_add_u32 s4, s4, s9
 ; GFX1064DAGISEL-NEXT:    s_addc_u32 s5, s5, s10
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1064DAGISEL-NEXT:  ; %bb.2:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1319,7 +2224,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s8, s[6:7]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s9, v2, s8
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s10, v3, s8
@@ -1327,7 +2232,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064GISEL-NEXT:    s_add_u32 s4, s4, s9
 ; GFX1064GISEL-NEXT:    s_addc_u32 s5, s5, s10
 ; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1064GISEL-NEXT:  ; %bb.2:
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1339,7 +2244,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1032DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s8, v2, s7
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s9, v3, s7
@@ -1347,7 +2252,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032DAGISEL-NEXT:    s_add_u32 s4, s4, s8
 ; GFX1032DAGISEL-NEXT:    s_addc_u32 s5, s5, s9
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1032DAGISEL-NEXT:  ; %bb.2:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1359,7 +2264,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1032GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s8, v2, s7
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s9, v3, s7
@@ -1367,7 +2272,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032GISEL-NEXT:    s_add_u32 s4, s4, s8
 ; GFX1032GISEL-NEXT:    s_addc_u32 s5, s5, s9
 ; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1032GISEL-NEXT:  ; %bb.2:
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1379,7 +2284,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s4, s[2:3]
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s5, v2, s4
@@ -1388,7 +2293,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164DAGISEL-NEXT:    s_add_u32 s0, s0, s5
 ; GFX1164DAGISEL-NEXT:    s_addc_u32 s1, s1, s6
 ; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1164DAGISEL-NEXT:  ; %bb.2:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -1400,7 +2305,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s4, s[2:3]
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s5, v2, s4
@@ -1409,7 +2314,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164GISEL-NEXT:    s_add_u32 s0, s0, s5
 ; GFX1164GISEL-NEXT:    s_addc_u32 s1, s1, s6
 ; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1164GISEL-NEXT:  ; %bb.2:
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -1421,7 +2326,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX1132DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s4, v2, s3
@@ -1430,7 +2335,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132DAGISEL-NEXT:    s_add_u32 s0, s0, s4
 ; GFX1132DAGISEL-NEXT:    s_addc_u32 s1, s1, s5
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1132DAGISEL-NEXT:  ; %bb.2:
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1132DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -1441,7 +2346,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX1132GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s4, v2, s3
@@ -1450,7 +2355,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132GISEL-NEXT:    s_add_u32 s0, s0, s4
 ; GFX1132GISEL-NEXT:    s_addc_u32 s1, s1, s5
 ; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1132GISEL-NEXT:  ; %bb.2:
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1132GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -1465,7 +2370,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX12DAGISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12DAGISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX12DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
@@ -1475,7 +2380,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX12DAGISEL-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX12DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX12DAGISEL-NEXT:  ; %bb.2:
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
@@ -1496,7 +2401,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8DAGISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX8DAGISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX8DAGISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX8DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s7, s[6:7]
@@ -1505,7 +2410,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8DAGISEL-NEXT:    s_mul_hi_u32 s2, s2, s7
 ; GFX8DAGISEL-NEXT:    s_mul_i32 s3, s3, s7
 ; GFX8DAGISEL-NEXT:    s_add_u32 s7, s2, s3
-; GFX8DAGISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX8DAGISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[8:9]
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -1534,7 +2439,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX8GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX8GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s7, s[6:7]
@@ -1543,10 +2448,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8GISEL-NEXT:    s_mul_hi_u32 s2, s2, s7
 ; GFX8GISEL-NEXT:    s_mul_i32 s3, s3, s7
 ; GFX8GISEL-NEXT:    s_add_u32 s7, s2, s3
-; GFX8GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX8GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX8GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
@@ -1556,7 +2461,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8GISEL-NEXT:    s_mul_hi_u32 s4, s4, s7
 ; GFX8GISEL-NEXT:    s_mul_i32 s5, s5, s7
 ; GFX8GISEL-NEXT:    s_add_u32 s7, s4, s5
-; GFX8GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX8GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -1573,7 +2478,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9DAGISEL-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX9DAGISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX9DAGISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX9DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s5, s[4:5]
@@ -1582,7 +2487,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9DAGISEL-NEXT:    s_mul_hi_u32 s2, s2, s5
 ; GFX9DAGISEL-NEXT:    s_mul_i32 s3, s3, s5
 ; GFX9DAGISEL-NEXT:    s_add_u32 s5, s2, s3
-; GFX9DAGISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX9DAGISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[8:9]
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
@@ -1610,7 +2515,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX9GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX9GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s7, s[6:7]
@@ -1619,10 +2524,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9GISEL-NEXT:    s_mul_hi_u32 s2, s2, s7
 ; GFX9GISEL-NEXT:    s_mul_i32 s3, s3, s7
 ; GFX9GISEL-NEXT:    s_add_u32 s7, s2, s3
-; GFX9GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX9GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX9GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9GISEL-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
 ; GFX9GISEL-NEXT:    s_mov_b64 s[4:5], exec
@@ -1632,7 +2537,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9GISEL-NEXT:    s_mul_hi_u32 s5, s8, s4
 ; GFX9GISEL-NEXT:    s_mul_i32 s4, s9, s4
 ; GFX9GISEL-NEXT:    s_add_u32 s7, s5, s4
-; GFX9GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX9GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -1649,7 +2554,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064DAGISEL-NEXT:    ; implicit-def: $sgpr8_sgpr9
 ; GFX1064DAGISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064DAGISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1064DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s8, s[8:9]
@@ -1658,7 +2563,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064DAGISEL-NEXT:    s_mul_i32 s3, s3, s8
 ; GFX1064DAGISEL-NEXT:    s_mul_i32 s8, s2, s8
 ; GFX1064DAGISEL-NEXT:    s_add_u32 s9, s9, s3
-; GFX1064DAGISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1064DAGISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[4:5]
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s8
@@ -1686,7 +2591,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
@@ -1695,10 +2600,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064GISEL-NEXT:    s_mul_i32 s3, s3, s6
 ; GFX1064GISEL-NEXT:    s_mul_i32 s6, s2, s6
 ; GFX1064GISEL-NEXT:    s_add_u32 s7, s7, s3
-; GFX1064GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1064GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[4:5], exec
@@ -1708,7 +2613,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064GISEL-NEXT:    s_mul_i32 s7, s7, s4
 ; GFX1064GISEL-NEXT:    s_mul_i32 s6, s6, s4
 ; GFX1064GISEL-NEXT:    s_add_u32 s7, s5, s7
-; GFX1064GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX1064GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -1725,7 +2630,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032DAGISEL-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX1032DAGISEL-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; GFX1032DAGISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1032DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s4, exec_lo
 ; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s4, s4
@@ -1734,7 +2639,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032DAGISEL-NEXT:    s_mul_i32 s3, s3, s4
 ; GFX1032DAGISEL-NEXT:    s_mul_i32 s4, s2, s4
 ; GFX1032DAGISEL-NEXT:    s_add_u32 s5, s5, s3
-; GFX1032DAGISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1032DAGISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s2, s8
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
@@ -1762,7 +2667,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; GFX1032GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s6, s6
@@ -1771,10 +2676,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032GISEL-NEXT:    s_mul_i32 s3, s3, s6
 ; GFX1032GISEL-NEXT:    s_mul_i32 s6, s2, s6
 ; GFX1032GISEL-NEXT:    s_add_u32 s7, s7, s3
-; GFX1032GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1032GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s2, s8
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1032GISEL-NEXT:    s_mov_b32 s3, exec_lo
@@ -1784,7 +2689,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032GISEL-NEXT:    s_mul_i32 s5, s7, s3
 ; GFX1032GISEL-NEXT:    s_mul_i32 s6, s6, s3
 ; GFX1032GISEL-NEXT:    s_add_u32 s7, s4, s5
-; GFX1032GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX1032GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -1803,7 +2708,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
 ; GFX1164DAGISEL-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
-; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1164DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -1813,7 +2718,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164DAGISEL-NEXT:    s_mul_i32 s3, s3, s8
 ; GFX1164DAGISEL-NEXT:    s_mul_i32 s8, s2, s8
 ; GFX1164DAGISEL-NEXT:    s_add_u32 s9, s9, s3
-; GFX1164DAGISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1164DAGISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[6:7]
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s8
@@ -1845,7 +2750,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -1855,10 +2760,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164GISEL-NEXT:    s_mul_i32 s3, s3, s6
 ; GFX1164GISEL-NEXT:    s_mul_i32 s6, s2, s6
 ; GFX1164GISEL-NEXT:    s_add_u32 s7, s7, s3
-; GFX1164GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1164GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[2:3], s[8:9]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], exec
@@ -1869,7 +2774,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164GISEL-NEXT:    s_mul_i32 s5, s5, s6
 ; GFX1164GISEL-NEXT:    s_mul_i32 s6, s4, s6
 ; GFX1164GISEL-NEXT:    s_add_u32 s7, s7, s5
-; GFX1164GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX1164GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -1888,7 +2793,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
 ; GFX1132DAGISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1132DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -1898,7 +2803,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132DAGISEL-NEXT:    s_mul_i32 s3, s3, s6
 ; GFX1132DAGISEL-NEXT:    s_mul_i32 s6, s2, s6
 ; GFX1132DAGISEL-NEXT:    s_add_u32 s7, s7, s3
-; GFX1132DAGISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1132DAGISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s2, s8
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
@@ -1928,7 +2833,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1132GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132GISEL-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -1938,10 +2843,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132GISEL-NEXT:    s_mul_i32 s3, s3, s6
 ; GFX1132GISEL-NEXT:    s_mul_i32 s6, s2, s6
 ; GFX1132GISEL-NEXT:    s_add_u32 s7, s7, s3
-; GFX1132GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1132GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s2, s8
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX1132GISEL-NEXT:    s_mov_b32 s3, exec_lo
@@ -1952,7 +2857,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132GISEL-NEXT:    s_mul_i32 s5, s5, s3
 ; GFX1132GISEL-NEXT:    s_mul_i32 s6, s4, s3
 ; GFX1132GISEL-NEXT:    s_add_u32 s7, s7, s5
-; GFX1132GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX1132GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
@@ -1970,7 +2875,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
 ; GFX12DAGISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX12DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX12DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX12DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX12DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -1980,7 +2885,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX12DAGISEL-NEXT:    s_mul_i32 s3, s3, s6
 ; GFX12DAGISEL-NEXT:    s_mul_i32 s6, s2, s6
 ; GFX12DAGISEL-NEXT:    s_add_co_u32 s7, s7, s3
-; GFX12DAGISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX12DAGISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX12DAGISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12DAGISEL-NEXT:    s_or_saveexec_b32 s2, s8
 ; GFX12DAGISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll
index f39dd867f9580..38443f7004f72 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll
@@ -367,6 +367,784 @@ entry:
   ret void
 }
 
+define amdgpu_kernel void @uniform_value_dpp(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: uniform_value_dpp:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value_dpp:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value_dpp:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value_dpp:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX10DAGISEL-LABEL: uniform_value_dpp:
+; GFX10DAGISEL:       ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT:    s_clause 0x1
+; GFX10DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX10DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10DAGISEL-NEXT:    s_endpgm
+;
+; GFX10GISEL-LABEL: uniform_value_dpp:
+; GFX10GISEL:       ; %bb.0: ; %entry
+; GFX10GISEL-NEXT:    s_clause 0x1
+; GFX10GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX10GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value_dpp:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_clause 0x1
+; GFX1164DAGISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value_dpp:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_clause 0x1
+; GFX1164GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value_dpp:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_clause 0x1
+; GFX1132DAGISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value_dpp:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_clause 0x1
+; GFX1132GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 %in, i32 2)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @divergent_value_dpp(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: divergent_value_dpp:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX8DAGISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[2:3]
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8DAGISEL-NEXT:    flat_store_dword v[2:3], v0
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_value_dpp:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX8GISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[2:3]
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dword v[2:3], v0
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_value_dpp:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9DAGISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[2:3]
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_value_dpp:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9GISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[2:3]
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_value_dpp:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064DAGISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[0:1]
+; GFX1064DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1064DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1064DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1064DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1064DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_and_b32_e32 v2, v1, v2
+; GFX1064DAGISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1064DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_and_b32_e32 v1, v1, v2
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1064DAGISEL-NEXT:    global_store_dword v0, v4, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_value_dpp:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064GISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[0:1]
+; GFX1064GISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1064GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1064GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1064GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1064GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_and_b32_e32 v2, v1, v2
+; GFX1064GISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_and_b32_e32 v1, v1, v2
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1064GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    global_store_dword v4, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_value_dpp:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032DAGISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s0
+; GFX1032DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    v_and_b32_e32 v1, v1, v2
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1032DAGISEL-NEXT:    global_store_dword v0, v3, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_value_dpp:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032GISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s0
+; GFX1032GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    v_and_b32_e32 v1, v1, v2
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032GISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_value_dpp:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[0:1]
+; GFX1164DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v2, v1, v2
+; GFX1164DAGISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v1, v1, v2
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v4, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_value_dpp:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v2, v1, v2
+; GFX1164GISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v1, v1, v2
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b32 v4, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_value_dpp:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s0
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v1, v1, v2
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s3
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v3, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_value_dpp:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s0
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v1, v1, v2
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v3, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %result = call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 %id.x, i32 2)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @default_stratergy(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: default_stratergy:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX8DAGISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[2:3]
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8DAGISEL-NEXT:    flat_store_dword v[2:3], v0
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: default_stratergy:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX8GISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[2:3]
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dword v[2:3], v0
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: default_stratergy:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9DAGISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[2:3]
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: default_stratergy:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9GISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[2:3]
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: default_stratergy:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064DAGISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[0:1]
+; GFX1064DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1064DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1064DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1064DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1064DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_and_b32_e32 v2, v1, v2
+; GFX1064DAGISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1064DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_and_b32_e32 v1, v1, v2
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1064DAGISEL-NEXT:    global_store_dword v0, v4, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: default_stratergy:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064GISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[0:1]
+; GFX1064GISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1064GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1064GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1064GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1064GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_and_b32_e32 v2, v1, v2
+; GFX1064GISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_and_b32_e32 v1, v1, v2
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1064GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    global_store_dword v4, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: default_stratergy:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032DAGISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s0
+; GFX1032DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    v_and_b32_e32 v1, v1, v2
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1032DAGISEL-NEXT:    global_store_dword v0, v3, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: default_stratergy:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032GISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s0
+; GFX1032GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    v_and_b32_e32 v1, v1, v2
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032GISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: default_stratergy:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[0:1]
+; GFX1164DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v2, v1, v2
+; GFX1164DAGISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v1, v1, v2
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v4, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: default_stratergy:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v2, v1, v2
+; GFX1164GISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v1, v1, v2
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b32 v4, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: default_stratergy:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s0
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v1, v1, v2
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s3
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v3, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: default_stratergy:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s0
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v1, v1, v2
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v3, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %result = call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 %id.x, i32 0)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
 define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8DAGISEL-LABEL: divergent_cfg:
 ; GFX8DAGISEL:       ; %bb.0: ; %entry
@@ -382,20 +1160,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX8DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8DAGISEL-NEXT:    s_mov_b32 s6, -1
-; GFX8DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX8DAGISEL-NEXT:    s_and_b32 s6, s6, s8
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX8DAGISEL-NEXT:  ; %bb.5:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX8DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX8DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX8DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -410,26 +1188,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX8GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
 ; GFX8GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b32 s6, s2
-; GFX8GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX8GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX8GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8GISEL-NEXT:    s_mov_b32 s6, -1
-; GFX8GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX8GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX8GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX8GISEL-NEXT:    s_and_b32 s6, s6, s8
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX8GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX8GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s6
@@ -453,20 +1231,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX9DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9DAGISEL-NEXT:    s_mov_b32 s6, -1
-; GFX9DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX9DAGISEL-NEXT:    s_and_b32 s6, s6, s8
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX9DAGISEL-NEXT:  ; %bb.5:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX9DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX9DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -480,26 +1258,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX9GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
 ; GFX9GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b32 s6, s2
-; GFX9GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX9GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX9GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9GISEL-NEXT:    s_mov_b32 s6, -1
-; GFX9GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX9GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX9GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX9GISEL-NEXT:    s_and_b32 s6, s6, s8
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX9GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX9GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -522,20 +1300,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1064DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX1064DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064DAGISEL-NEXT:    s_mov_b32 s6, -1
-; GFX1064DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1064DAGISEL-NEXT:    s_and_b32 s6, s6, s8
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX1064DAGISEL-NEXT:  ; %bb.5:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1064DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX1064DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX1064DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -549,26 +1327,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
 ; GFX1064GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b32 s6, s2
-; GFX1064GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1064GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064GISEL-NEXT:    s_mov_b32 s6, -1
-; GFX1064GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1064GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1064GISEL-NEXT:    s_and_b32 s6, s6, s8
 ; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX1064GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1064GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -591,20 +1369,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX1032DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s1, -1
-; GFX1032DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032DAGISEL-NEXT:    s_and_b32 s1, s1, s6
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX1032DAGISEL-NEXT:  ; %bb.5:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1032DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX1032DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX1032DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -618,26 +1396,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr0
 ; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032GISEL-NEXT:    s_load_dword s0, s[4:5], 0x2c
 ; GFX1032GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b32 s0, s0
-; GFX1032GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1032GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s1, s1
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1032GISEL-NEXT:    s_mov_b32 s0, -1
-; GFX1032GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032GISEL-NEXT:    s_and_b32 s0, s0, s6
 ; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX1032GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1032GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1032GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -662,21 +1440,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX1164DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164DAGISEL-NEXT:    s_mov_b32 s6, -1
-; GFX1164DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1164DAGISEL-NEXT:    s_and_b32 s6, s6, s8
 ; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX1164DAGISEL-NEXT:  ; %bb.5:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1164DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX1164DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX1164DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -692,27 +1470,27 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
 ; GFX1164GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b32 s6, s2
-; GFX1164GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1164GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[0:1], s[0:1]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164GISEL-NEXT:    s_mov_b32 s6, -1
-; GFX1164GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1164GISEL-NEXT:    s_and_b32 s6, s6, s8
 ; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX1164GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1164GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -737,21 +1515,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1132DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX1132DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s1, -1
-; GFX1132DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132DAGISEL-NEXT:    s_and_b32 s1, s1, s6
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX1132DAGISEL-NEXT:  ; %bb.5:
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1132DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX1132DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX1132DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -767,27 +1545,27 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1132GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x2c
 ; GFX1132GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b32 s0, s0
-; GFX1132GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1132GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s1, s1
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132GISEL-NEXT:    s_mov_b32 s0, -1
-; GFX1132GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132GISEL-NEXT:    s_and_b32 s0, s0, s6
 ; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX1132GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1132GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1132GISEL-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
@@ -925,14 +1703,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX8DAGISEL-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX8DAGISEL-NEXT:  ; %bb.2:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -945,14 +1723,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX8GISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX8GISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX8GISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX8GISEL-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX8GISEL-NEXT:  ; %bb.2:
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -965,14 +1743,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX9DAGISEL-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX9DAGISEL-NEXT:  ; %bb.2:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -985,14 +1763,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX9GISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX9GISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX9GISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX9GISEL-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX9GISEL-NEXT:  ; %bb.2:
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1005,14 +1783,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX1064DAGISEL-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1064DAGISEL-NEXT:  ; %bb.2:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1024,14 +1802,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX1064GISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX1064GISEL-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
 ; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1064GISEL-NEXT:  ; %bb.2:
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1043,14 +1821,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1032DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s8, v2, s7
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s9, v3, s7
 ; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s6, s7
 ; GFX1032DAGISEL-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1032DAGISEL-NEXT:  ; %bb.2:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1062,14 +1840,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1032GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s8, v2, s7
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s9, v3, s7
 ; GFX1032GISEL-NEXT:    s_bitset0_b32 s6, s7
 ; GFX1032GISEL-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
 ; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1032GISEL-NEXT:  ; %bb.2:
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1081,7 +1859,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[0:1], -1
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s6, s[2:3]
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s4, v2, s6
@@ -1089,7 +1867,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s6
 ; GFX1164DAGISEL-NEXT:    s_and_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1164DAGISEL-NEXT:  ; %bb.2:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -1101,7 +1879,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[0:1], -1
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s6, s[2:3]
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s4, v2, s6
@@ -1109,7 +1887,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s6
 ; GFX1164GISEL-NEXT:    s_and_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1164GISEL-NEXT:  ; %bb.2:
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -1121,7 +1899,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    s_mov_b64 s[0:1], -1
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX1132DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s4, v2, s3
@@ -1129,7 +1907,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132DAGISEL-NEXT:    s_and_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1132DAGISEL-NEXT:  ; %bb.2:
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1132DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -1140,7 +1918,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b64 s[0:1], -1
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX1132GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s4, v2, s3
@@ -1148,7 +1926,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132GISEL-NEXT:    s_and_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1132GISEL-NEXT:  ; %bb.2:
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1132GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -1189,19 +1967,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX8GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX8GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX8GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX8GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX8GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GFX8GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX8GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -1238,19 +2016,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX9GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX9GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX9GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX9GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX9GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
-; GFX9GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX9GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -1287,19 +2065,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1064GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
-; GFX1064GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX1064GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -1336,19 +2114,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; GFX1032GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1032GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s2, s8
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
-; GFX1032GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX1032GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -1389,19 +2167,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1164GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1164GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[2:3], s[8:9]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GFX1164GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX1164GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -1440,19 +2218,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1132GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1132GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1132GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s2, s8
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GFX1132GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX1132GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll
index 6f299ab8bb9cf..3aac4a75c16c0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll
@@ -367,6 +367,792 @@ entry:
   ret void
 }
 
+define amdgpu_kernel void @uniform_value_dpp(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: uniform_value_dpp:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value_dpp:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value_dpp:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value_dpp:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX10DAGISEL-LABEL: uniform_value_dpp:
+; GFX10DAGISEL:       ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT:    s_clause 0x1
+; GFX10DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX10DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10DAGISEL-NEXT:    s_endpgm
+;
+; GFX10GISEL-LABEL: uniform_value_dpp:
+; GFX10GISEL:       ; %bb.0: ; %entry
+; GFX10GISEL-NEXT:    s_clause 0x1
+; GFX10GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX10GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value_dpp:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_clause 0x1
+; GFX1164DAGISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value_dpp:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_clause 0x1
+; GFX1164GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value_dpp:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_clause 0x1
+; GFX1132DAGISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value_dpp:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_clause 0x1
+; GFX1132GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 %in, i32 2)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @divergent_value_dpp(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: divergent_value_dpp:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX8DAGISEL-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX8DAGISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[2:3]
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8DAGISEL-NEXT:    flat_store_dword v[2:3], v0
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_value_dpp:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX8GISEL-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX8GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[2:3]
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dword v[2:3], v0
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_value_dpp:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9DAGISEL-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX9DAGISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[2:3]
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_value_dpp:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9GISEL-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX9GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[2:3]
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_value_dpp:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0x80000000, v0, s[0:1]
+; GFX1064DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1064DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1064DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1064DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1064DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_max_i32_e32 v2, v1, v2
+; GFX1064DAGISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1064DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_max_i32_e32 v1, v1, v2
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1064DAGISEL-NEXT:    global_store_dword v0, v4, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_value_dpp:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064GISEL-NEXT:    v_cndmask_b32_e64 v1, 0x80000000, v0, s[0:1]
+; GFX1064GISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1064GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1064GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1064GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1064GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_max_i32_e32 v2, v1, v2
+; GFX1064GISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_max_i32_e32 v1, v1, v2
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1064GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    global_store_dword v4, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_value_dpp:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0x80000000, v0, s0
+; GFX1032DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    v_max_i32_e32 v1, v1, v2
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1032DAGISEL-NEXT:    global_store_dword v0, v3, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_value_dpp:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032GISEL-NEXT:    v_cndmask_b32_e64 v1, 0x80000000, v0, s0
+; GFX1032GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    v_max_i32_e32 v1, v1, v2
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032GISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_value_dpp:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0x80000000, v0, s[0:1]
+; GFX1164DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_max_i32_e32 v2, v1, v2
+; GFX1164DAGISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_max_i32_e32 v1, v1, v2
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v4, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_value_dpp:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_cndmask_b32_e64 v1, 0x80000000, v0, s[0:1]
+; GFX1164GISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_max_i32_e32 v2, v1, v2
+; GFX1164GISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_max_i32_e32 v1, v1, v2
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b32 v4, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_value_dpp:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0x80000000, v0, s0
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_max_i32_e32 v1, v1, v2
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s3
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v3, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_value_dpp:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_cndmask_b32_e64 v1, 0x80000000, v0, s0
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_max_i32_e32 v1, v1, v2
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v3, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %result = call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 %id.x, i32 2)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @default_stratergy(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: default_stratergy:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX8DAGISEL-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX8DAGISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[2:3]
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8DAGISEL-NEXT:    flat_store_dword v[2:3], v0
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: default_stratergy:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX8GISEL-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX8GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[2:3]
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dword v[2:3], v0
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: default_stratergy:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9DAGISEL-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX9DAGISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[2:3]
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: default_stratergy:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9GISEL-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX9GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[2:3]
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: default_stratergy:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0x80000000, v0, s[0:1]
+; GFX1064DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1064DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1064DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1064DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1064DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_max_i32_e32 v2, v1, v2
+; GFX1064DAGISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1064DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_max_i32_e32 v1, v1, v2
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1064DAGISEL-NEXT:    global_store_dword v0, v4, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: default_stratergy:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064GISEL-NEXT:    v_cndmask_b32_e64 v1, 0x80000000, v0, s[0:1]
+; GFX1064GISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1064GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1064GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1064GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1064GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_max_i32_e32 v2, v1, v2
+; GFX1064GISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_max_i32_e32 v1, v1, v2
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1064GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    global_store_dword v4, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: default_stratergy:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0x80000000, v0, s0
+; GFX1032DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    v_max_i32_e32 v1, v1, v2
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1032DAGISEL-NEXT:    global_store_dword v0, v3, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: default_stratergy:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032GISEL-NEXT:    v_cndmask_b32_e64 v1, 0x80000000, v0, s0
+; GFX1032GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    v_max_i32_e32 v1, v1, v2
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032GISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: default_stratergy:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0x80000000, v0, s[0:1]
+; GFX1164DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_max_i32_e32 v2, v1, v2
+; GFX1164DAGISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_max_i32_e32 v1, v1, v2
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v4, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: default_stratergy:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_cndmask_b32_e64 v1, 0x80000000, v0, s[0:1]
+; GFX1164GISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_max_i32_e32 v2, v1, v2
+; GFX1164GISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_max_i32_e32 v1, v1, v2
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b32 v4, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: default_stratergy:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0x80000000, v0, s0
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_max_i32_e32 v1, v1, v2
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s3
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v3, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: default_stratergy:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_cndmask_b32_e64 v1, 0x80000000, v0, s0
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_max_i32_e32 v1, v1, v2
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v3, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %result = call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 %id.x, i32 0)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
 define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8DAGISEL-LABEL: divergent_cfg:
 ; GFX8DAGISEL:       ; %bb.0: ; %entry
@@ -382,20 +1168,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX8DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8DAGISEL-NEXT:    s_brev_b32 s6, 1
-; GFX8DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX8DAGISEL-NEXT:    s_max_i32 s6, s6, s8
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX8DAGISEL-NEXT:  ; %bb.5:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX8DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX8DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX8DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -410,26 +1196,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX8GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
 ; GFX8GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b32 s6, s2
-; GFX8GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX8GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX8GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8GISEL-NEXT:    s_brev_b32 s6, 1
-; GFX8GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX8GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX8GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX8GISEL-NEXT:    s_max_i32 s6, s6, s8
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX8GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX8GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s6
@@ -453,20 +1239,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX9DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9DAGISEL-NEXT:    s_brev_b32 s6, 1
-; GFX9DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX9DAGISEL-NEXT:    s_max_i32 s6, s6, s8
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX9DAGISEL-NEXT:  ; %bb.5:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX9DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX9DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -480,26 +1266,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX9GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
 ; GFX9GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b32 s6, s2
-; GFX9GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX9GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX9GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9GISEL-NEXT:    s_brev_b32 s6, 1
-; GFX9GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX9GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX9GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX9GISEL-NEXT:    s_max_i32 s6, s6, s8
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX9GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX9GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -522,20 +1308,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1064DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX1064DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064DAGISEL-NEXT:    s_brev_b32 s6, 1
-; GFX1064DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1064DAGISEL-NEXT:    s_max_i32 s6, s6, s8
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX1064DAGISEL-NEXT:  ; %bb.5:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1064DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX1064DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX1064DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -549,26 +1335,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
 ; GFX1064GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b32 s6, s2
-; GFX1064GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1064GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064GISEL-NEXT:    s_brev_b32 s6, 1
-; GFX1064GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1064GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1064GISEL-NEXT:    s_max_i32 s6, s6, s8
 ; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX1064GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1064GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -591,20 +1377,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX1032DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1032DAGISEL-NEXT:    s_brev_b32 s1, 1
-; GFX1032DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032DAGISEL-NEXT:    s_max_i32 s1, s1, s6
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX1032DAGISEL-NEXT:  ; %bb.5:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1032DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX1032DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX1032DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -618,26 +1404,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr0
 ; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032GISEL-NEXT:    s_load_dword s0, s[4:5], 0x2c
 ; GFX1032GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b32 s0, s0
-; GFX1032GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1032GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s1, s1
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1032GISEL-NEXT:    s_brev_b32 s0, 1
-; GFX1032GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032GISEL-NEXT:    s_max_i32 s0, s0, s6
 ; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX1032GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1032GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1032GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -662,21 +1448,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX1164DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164DAGISEL-NEXT:    s_brev_b32 s6, 1
-; GFX1164DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1164DAGISEL-NEXT:    s_max_i32 s6, s6, s8
 ; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX1164DAGISEL-NEXT:  ; %bb.5:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1164DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX1164DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX1164DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -692,27 +1478,27 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
 ; GFX1164GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b32 s6, s2
-; GFX1164GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1164GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[0:1], s[0:1]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164GISEL-NEXT:    s_brev_b32 s6, 1
-; GFX1164GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1164GISEL-NEXT:    s_max_i32 s6, s6, s8
 ; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX1164GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1164GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -737,21 +1523,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1132DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX1132DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132DAGISEL-NEXT:    s_brev_b32 s1, 1
-; GFX1132DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132DAGISEL-NEXT:    s_max_i32 s1, s1, s6
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX1132DAGISEL-NEXT:  ; %bb.5:
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1132DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX1132DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX1132DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -767,27 +1553,27 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1132GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x2c
 ; GFX1132GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b32 s0, s0
-; GFX1132GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1132GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s1, s1
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132GISEL-NEXT:    s_brev_b32 s0, 1
-; GFX1132GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132GISEL-NEXT:    s_max_i32 s0, s0, s6
 ; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX1132GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1132GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1132GISEL-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
@@ -926,7 +1712,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8DAGISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX8DAGISEL-NEXT:    s_brev_b32 s5, 1
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v5, s5
@@ -937,7 +1723,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s12
 ; GFX8DAGISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX8DAGISEL-NEXT:  ; %bb.2:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -951,7 +1737,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8GISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX8GISEL-NEXT:    s_brev_b32 s5, 1
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v5, s5
@@ -962,7 +1748,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8GISEL-NEXT:    s_bitset0_b64 s[6:7], s12
 ; GFX8GISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX8GISEL-NEXT:  ; %bb.2:
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -976,7 +1762,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9DAGISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX9DAGISEL-NEXT:    s_brev_b32 s5, 1
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v5, s5
@@ -987,7 +1773,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s12
 ; GFX9DAGISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX9DAGISEL-NEXT:  ; %bb.2:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1001,7 +1787,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9GISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX9GISEL-NEXT:    s_brev_b32 s5, 1
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v5, s5
@@ -1012,7 +1798,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9GISEL-NEXT:    s_bitset0_b64 s[6:7], s12
 ; GFX9GISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX9GISEL-NEXT:  ; %bb.2:
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1026,7 +1812,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064DAGISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX1064DAGISEL-NEXT:    s_brev_b32 s5, 1
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v5, s5
@@ -1037,7 +1823,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s12
 ; GFX1064DAGISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1064DAGISEL-NEXT:  ; %bb.2:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1050,7 +1836,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064GISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX1064GISEL-NEXT:    s_brev_b32 s5, 1
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v5, s5
@@ -1061,7 +1847,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064GISEL-NEXT:    s_bitset0_b64 s[6:7], s12
 ; GFX1064GISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
 ; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1064GISEL-NEXT:  ; %bb.2:
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1074,7 +1860,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX1032DAGISEL-NEXT:    s_brev_b32 s5, 1
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1032DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v5, s5
@@ -1085,7 +1871,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s6, s7
 ; GFX1032DAGISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1032DAGISEL-NEXT:  ; %bb.2:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1098,7 +1884,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032GISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX1032GISEL-NEXT:    s_brev_b32 s5, 1
 ; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1032GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v5, s5
@@ -1109,7 +1895,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032GISEL-NEXT:    s_bitset0_b32 s6, s7
 ; GFX1032GISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
 ; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1032GISEL-NEXT:  ; %bb.2:
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1122,7 +1908,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164DAGISEL-NEXT:    s_mov_b32 s0, 0
 ; GFX1164DAGISEL-NEXT:    s_brev_b32 s1, 1
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s8, s[2:3]
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v4, s0
@@ -1134,7 +1920,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s8
 ; GFX1164DAGISEL-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
 ; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1164DAGISEL-NEXT:  ; %bb.2:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -1147,7 +1933,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164GISEL-NEXT:    s_mov_b32 s0, 0
 ; GFX1164GISEL-NEXT:    s_brev_b32 s1, 1
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s8, s[2:3]
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v4, s0
@@ -1159,7 +1945,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s8
 ; GFX1164GISEL-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
 ; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1164GISEL-NEXT:  ; %bb.2:
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -1172,7 +1958,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s0, 0
 ; GFX1132DAGISEL-NEXT:    s_brev_b32 s1, 1
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX1132DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
@@ -1183,7 +1969,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132DAGISEL-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1132DAGISEL-NEXT:  ; %bb.2:
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1132DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -1195,7 +1981,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132GISEL-NEXT:    s_mov_b32 s0, 0
 ; GFX1132GISEL-NEXT:    s_brev_b32 s1, 1
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX1132GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
@@ -1206,7 +1992,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132GISEL-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
 ; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1132GISEL-NEXT:  ; %bb.2:
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1132GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -1247,19 +2033,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX8GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX8GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX8GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX8GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX8GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GFX8GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX8GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -1296,19 +2082,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX9GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX9GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX9GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX9GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX9GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
-; GFX9GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX9GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -1345,19 +2131,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1064GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
-; GFX1064GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX1064GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -1394,19 +2180,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; GFX1032GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1032GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s2, s8
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
-; GFX1032GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX1032GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -1447,19 +2233,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1164GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1164GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[2:3], s[8:9]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GFX1164GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX1164GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -1498,19 +2284,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1132GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1132GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1132GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s2, s8
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GFX1132GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX1132GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll
index 3c4cbc74aedc1..05043b8bdf6db 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll
@@ -367,6 +367,792 @@ entry:
   ret void
 }
 
+define amdgpu_kernel void @uniform_value_dpp(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: uniform_value_dpp:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value_dpp:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value_dpp:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value_dpp:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX10DAGISEL-LABEL: uniform_value_dpp:
+; GFX10DAGISEL:       ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT:    s_clause 0x1
+; GFX10DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX10DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10DAGISEL-NEXT:    s_endpgm
+;
+; GFX10GISEL-LABEL: uniform_value_dpp:
+; GFX10GISEL:       ; %bb.0: ; %entry
+; GFX10GISEL-NEXT:    s_clause 0x1
+; GFX10GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX10GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value_dpp:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_clause 0x1
+; GFX1164DAGISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value_dpp:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_clause 0x1
+; GFX1164GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value_dpp:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_clause 0x1
+; GFX1132DAGISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value_dpp:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_clause 0x1
+; GFX1132GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 %in, i32 2)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @divergent_value_dpp(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: divergent_value_dpp:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX8DAGISEL-NEXT:    v_bfrev_b32_e32 v1, -2
+; GFX8DAGISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[2:3]
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8DAGISEL-NEXT:    flat_store_dword v[2:3], v0
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_value_dpp:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX8GISEL-NEXT:    v_bfrev_b32_e32 v1, -2
+; GFX8GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[2:3]
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dword v[2:3], v0
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_value_dpp:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9DAGISEL-NEXT:    v_bfrev_b32_e32 v1, -2
+; GFX9DAGISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[2:3]
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_value_dpp:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9GISEL-NEXT:    v_bfrev_b32_e32 v1, -2
+; GFX9GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[2:3]
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_value_dpp:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0x7fffffff, v0, s[0:1]
+; GFX1064DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1064DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1064DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1064DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1064DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_min_i32_e32 v2, v1, v2
+; GFX1064DAGISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1064DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_min_i32_e32 v1, v1, v2
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1064DAGISEL-NEXT:    global_store_dword v0, v4, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_value_dpp:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064GISEL-NEXT:    v_cndmask_b32_e64 v1, 0x7fffffff, v0, s[0:1]
+; GFX1064GISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1064GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1064GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1064GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1064GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_min_i32_e32 v2, v1, v2
+; GFX1064GISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_min_i32_e32 v1, v1, v2
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1064GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    global_store_dword v4, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_value_dpp:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0x7fffffff, v0, s0
+; GFX1032DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    v_min_i32_e32 v1, v1, v2
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1032DAGISEL-NEXT:    global_store_dword v0, v3, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_value_dpp:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032GISEL-NEXT:    v_cndmask_b32_e64 v1, 0x7fffffff, v0, s0
+; GFX1032GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    v_min_i32_e32 v1, v1, v2
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032GISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_value_dpp:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0x7fffffff, v0, s[0:1]
+; GFX1164DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_min_i32_e32 v2, v1, v2
+; GFX1164DAGISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_min_i32_e32 v1, v1, v2
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v4, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_value_dpp:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_cndmask_b32_e64 v1, 0x7fffffff, v0, s[0:1]
+; GFX1164GISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_min_i32_e32 v2, v1, v2
+; GFX1164GISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_min_i32_e32 v1, v1, v2
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b32 v4, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_value_dpp:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0x7fffffff, v0, s0
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_min_i32_e32 v1, v1, v2
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s3
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v3, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_value_dpp:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_cndmask_b32_e64 v1, 0x7fffffff, v0, s0
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_min_i32_e32 v1, v1, v2
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v3, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %result = call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 %id.x, i32 2)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @default_stratergy(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: default_stratergy:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX8DAGISEL-NEXT:    v_bfrev_b32_e32 v1, -2
+; GFX8DAGISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[2:3]
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8DAGISEL-NEXT:    flat_store_dword v[2:3], v0
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: default_stratergy:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX8GISEL-NEXT:    v_bfrev_b32_e32 v1, -2
+; GFX8GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[2:3]
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dword v[2:3], v0
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: default_stratergy:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9DAGISEL-NEXT:    v_bfrev_b32_e32 v1, -2
+; GFX9DAGISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[2:3]
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: default_stratergy:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9GISEL-NEXT:    v_bfrev_b32_e32 v1, -2
+; GFX9GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[2:3]
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: default_stratergy:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0x7fffffff, v0, s[0:1]
+; GFX1064DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1064DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1064DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1064DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1064DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_min_i32_e32 v2, v1, v2
+; GFX1064DAGISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1064DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_min_i32_e32 v1, v1, v2
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1064DAGISEL-NEXT:    global_store_dword v0, v4, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: default_stratergy:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064GISEL-NEXT:    v_cndmask_b32_e64 v1, 0x7fffffff, v0, s[0:1]
+; GFX1064GISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1064GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1064GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1064GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1064GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_min_i32_e32 v2, v1, v2
+; GFX1064GISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_min_i32_e32 v1, v1, v2
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1064GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    global_store_dword v4, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: default_stratergy:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0x7fffffff, v0, s0
+; GFX1032DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    v_min_i32_e32 v1, v1, v2
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1032DAGISEL-NEXT:    global_store_dword v0, v3, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: default_stratergy:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032GISEL-NEXT:    v_cndmask_b32_e64 v1, 0x7fffffff, v0, s0
+; GFX1032GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    v_min_i32_e32 v1, v1, v2
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032GISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: default_stratergy:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0x7fffffff, v0, s[0:1]
+; GFX1164DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_min_i32_e32 v2, v1, v2
+; GFX1164DAGISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_min_i32_e32 v1, v1, v2
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v4, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: default_stratergy:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_cndmask_b32_e64 v1, 0x7fffffff, v0, s[0:1]
+; GFX1164GISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_min_i32_e32 v2, v1, v2
+; GFX1164GISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_min_i32_e32 v1, v1, v2
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b32 v4, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: default_stratergy:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0x7fffffff, v0, s0
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_min_i32_e32 v1, v1, v2
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s3
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v3, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: default_stratergy:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_cndmask_b32_e64 v1, 0x7fffffff, v0, s0
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_min_i32_e32 v1, v1, v2
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v3, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %result = call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 %id.x, i32 0)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
 define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8DAGISEL-LABEL: divergent_cfg:
 ; GFX8DAGISEL:       ; %bb.0: ; %entry
@@ -382,20 +1168,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX8DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8DAGISEL-NEXT:    s_brev_b32 s6, -2
-; GFX8DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX8DAGISEL-NEXT:    s_min_i32 s6, s6, s8
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX8DAGISEL-NEXT:  ; %bb.5:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX8DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX8DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX8DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -410,26 +1196,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX8GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
 ; GFX8GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b32 s6, s2
-; GFX8GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX8GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX8GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8GISEL-NEXT:    s_brev_b32 s6, -2
-; GFX8GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX8GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX8GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX8GISEL-NEXT:    s_min_i32 s6, s6, s8
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX8GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX8GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s6
@@ -453,20 +1239,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX9DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9DAGISEL-NEXT:    s_brev_b32 s6, -2
-; GFX9DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX9DAGISEL-NEXT:    s_min_i32 s6, s6, s8
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX9DAGISEL-NEXT:  ; %bb.5:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX9DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX9DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -480,26 +1266,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX9GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
 ; GFX9GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b32 s6, s2
-; GFX9GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX9GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX9GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9GISEL-NEXT:    s_brev_b32 s6, -2
-; GFX9GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX9GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX9GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX9GISEL-NEXT:    s_min_i32 s6, s6, s8
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX9GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX9GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -522,20 +1308,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1064DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX1064DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064DAGISEL-NEXT:    s_brev_b32 s6, -2
-; GFX1064DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1064DAGISEL-NEXT:    s_min_i32 s6, s6, s8
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX1064DAGISEL-NEXT:  ; %bb.5:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1064DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX1064DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX1064DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -549,26 +1335,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
 ; GFX1064GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b32 s6, s2
-; GFX1064GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1064GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064GISEL-NEXT:    s_brev_b32 s6, -2
-; GFX1064GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1064GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1064GISEL-NEXT:    s_min_i32 s6, s6, s8
 ; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX1064GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1064GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -591,20 +1377,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX1032DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1032DAGISEL-NEXT:    s_brev_b32 s1, -2
-; GFX1032DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032DAGISEL-NEXT:    s_min_i32 s1, s1, s6
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX1032DAGISEL-NEXT:  ; %bb.5:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1032DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX1032DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX1032DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -618,26 +1404,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr0
 ; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032GISEL-NEXT:    s_load_dword s0, s[4:5], 0x2c
 ; GFX1032GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b32 s0, s0
-; GFX1032GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1032GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s1, s1
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1032GISEL-NEXT:    s_brev_b32 s0, -2
-; GFX1032GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032GISEL-NEXT:    s_min_i32 s0, s0, s6
 ; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX1032GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1032GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1032GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -662,21 +1448,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX1164DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164DAGISEL-NEXT:    s_brev_b32 s6, -2
-; GFX1164DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1164DAGISEL-NEXT:    s_min_i32 s6, s6, s8
 ; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX1164DAGISEL-NEXT:  ; %bb.5:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1164DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX1164DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX1164DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -692,27 +1478,27 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
 ; GFX1164GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b32 s6, s2
-; GFX1164GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1164GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[0:1], s[0:1]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164GISEL-NEXT:    s_brev_b32 s6, -2
-; GFX1164GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1164GISEL-NEXT:    s_min_i32 s6, s6, s8
 ; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX1164GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1164GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -737,21 +1523,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1132DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX1132DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132DAGISEL-NEXT:    s_brev_b32 s1, -2
-; GFX1132DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132DAGISEL-NEXT:    s_min_i32 s1, s1, s6
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX1132DAGISEL-NEXT:  ; %bb.5:
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1132DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX1132DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX1132DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -767,27 +1553,27 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1132GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x2c
 ; GFX1132GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b32 s0, s0
-; GFX1132GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1132GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s1, s1
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132GISEL-NEXT:    s_brev_b32 s0, -2
-; GFX1132GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132GISEL-NEXT:    s_min_i32 s0, s0, s6
 ; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX1132GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1132GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1132GISEL-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
@@ -926,7 +1712,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8DAGISEL-NEXT:    s_mov_b32 s4, -1
 ; GFX8DAGISEL-NEXT:    s_brev_b32 s5, -2
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v5, s5
@@ -937,7 +1723,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s12
 ; GFX8DAGISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX8DAGISEL-NEXT:  ; %bb.2:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -951,7 +1737,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8GISEL-NEXT:    s_mov_b32 s4, -1
 ; GFX8GISEL-NEXT:    s_brev_b32 s5, -2
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v5, s5
@@ -962,7 +1748,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8GISEL-NEXT:    s_bitset0_b64 s[6:7], s12
 ; GFX8GISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX8GISEL-NEXT:  ; %bb.2:
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -976,7 +1762,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9DAGISEL-NEXT:    s_mov_b32 s4, -1
 ; GFX9DAGISEL-NEXT:    s_brev_b32 s5, -2
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v5, s5
@@ -987,7 +1773,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s12
 ; GFX9DAGISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX9DAGISEL-NEXT:  ; %bb.2:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1001,7 +1787,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9GISEL-NEXT:    s_mov_b32 s4, -1
 ; GFX9GISEL-NEXT:    s_brev_b32 s5, -2
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v5, s5
@@ -1012,7 +1798,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9GISEL-NEXT:    s_bitset0_b64 s[6:7], s12
 ; GFX9GISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX9GISEL-NEXT:  ; %bb.2:
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1026,7 +1812,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064DAGISEL-NEXT:    s_mov_b32 s4, -1
 ; GFX1064DAGISEL-NEXT:    s_brev_b32 s5, -2
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v5, s5
@@ -1037,7 +1823,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s12
 ; GFX1064DAGISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1064DAGISEL-NEXT:  ; %bb.2:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1050,7 +1836,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064GISEL-NEXT:    s_mov_b32 s4, -1
 ; GFX1064GISEL-NEXT:    s_brev_b32 s5, -2
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v5, s5
@@ -1061,7 +1847,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064GISEL-NEXT:    s_bitset0_b64 s[6:7], s12
 ; GFX1064GISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
 ; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1064GISEL-NEXT:  ; %bb.2:
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1074,7 +1860,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s4, -1
 ; GFX1032DAGISEL-NEXT:    s_brev_b32 s5, -2
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1032DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v5, s5
@@ -1085,7 +1871,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s6, s7
 ; GFX1032DAGISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1032DAGISEL-NEXT:  ; %bb.2:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1098,7 +1884,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032GISEL-NEXT:    s_mov_b32 s4, -1
 ; GFX1032GISEL-NEXT:    s_brev_b32 s5, -2
 ; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1032GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v5, s5
@@ -1109,7 +1895,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032GISEL-NEXT:    s_bitset0_b32 s6, s7
 ; GFX1032GISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
 ; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1032GISEL-NEXT:  ; %bb.2:
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1122,7 +1908,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164DAGISEL-NEXT:    s_mov_b32 s0, -1
 ; GFX1164DAGISEL-NEXT:    s_brev_b32 s1, -2
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s8, s[2:3]
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v4, s0
@@ -1134,7 +1920,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s8
 ; GFX1164DAGISEL-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
 ; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1164DAGISEL-NEXT:  ; %bb.2:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -1147,7 +1933,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164GISEL-NEXT:    s_mov_b32 s0, -1
 ; GFX1164GISEL-NEXT:    s_brev_b32 s1, -2
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s8, s[2:3]
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v4, s0
@@ -1159,7 +1945,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s8
 ; GFX1164GISEL-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
 ; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1164GISEL-NEXT:  ; %bb.2:
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -1172,7 +1958,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s0, -1
 ; GFX1132DAGISEL-NEXT:    s_brev_b32 s1, -2
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX1132DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
@@ -1183,7 +1969,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132DAGISEL-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1132DAGISEL-NEXT:  ; %bb.2:
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1132DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -1195,7 +1981,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132GISEL-NEXT:    s_mov_b32 s0, -1
 ; GFX1132GISEL-NEXT:    s_brev_b32 s1, -2
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX1132GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
@@ -1206,7 +1992,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132GISEL-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
 ; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1132GISEL-NEXT:  ; %bb.2:
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1132GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -1247,19 +2033,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX8GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX8GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX8GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX8GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX8GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GFX8GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX8GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -1296,19 +2082,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX9GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX9GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX9GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX9GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX9GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
-; GFX9GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX9GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -1345,19 +2131,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1064GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
-; GFX1064GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX1064GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -1394,19 +2180,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; GFX1032GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1032GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s2, s8
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
-; GFX1032GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX1032GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -1447,19 +2233,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1164GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1164GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[2:3], s[8:9]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GFX1164GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX1164GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -1498,19 +2284,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1132GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1132GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1132GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s2, s8
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GFX1132GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX1132GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll
index d6ccf7ce2831d..67d6eed743345 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll
@@ -367,6 +367,784 @@ entry:
   ret void
 }
 
+define amdgpu_kernel void @uniform_value_dpp(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: uniform_value_dpp:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value_dpp:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value_dpp:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value_dpp:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX10DAGISEL-LABEL: uniform_value_dpp:
+; GFX10DAGISEL:       ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT:    s_clause 0x1
+; GFX10DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX10DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10DAGISEL-NEXT:    s_endpgm
+;
+; GFX10GISEL-LABEL: uniform_value_dpp:
+; GFX10GISEL:       ; %bb.0: ; %entry
+; GFX10GISEL-NEXT:    s_clause 0x1
+; GFX10GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX10GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value_dpp:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_clause 0x1
+; GFX1164DAGISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value_dpp:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_clause 0x1
+; GFX1164GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value_dpp:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_clause 0x1
+; GFX1132DAGISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value_dpp:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_clause 0x1
+; GFX1132GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 %in, i32 2)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @divergent_value_dpp(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: divergent_value_dpp:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX8DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[2:3]
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8DAGISEL-NEXT:    flat_store_dword v[2:3], v0
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_value_dpp:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX8GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[2:3]
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dword v[2:3], v0
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_value_dpp:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[2:3]
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_value_dpp:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[2:3]
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_value_dpp:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1064DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1064DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1064DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1064DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1064DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_or_b32_e32 v2, v1, v2
+; GFX1064DAGISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1064DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1064DAGISEL-NEXT:    global_store_dword v0, v4, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_value_dpp:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1064GISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1064GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1064GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1064GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1064GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_or_b32_e32 v2, v1, v2
+; GFX1064GISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1064GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    global_store_dword v4, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_value_dpp:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1032DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1032DAGISEL-NEXT:    global_store_dword v0, v3, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_value_dpp:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1032GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032GISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_value_dpp:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1164DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_or_b32_e32 v2, v1, v2
+; GFX1164DAGISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v4, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_value_dpp:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1164GISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_or_b32_e32 v2, v1, v2
+; GFX1164GISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b32 v4, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_value_dpp:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s3
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v3, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_value_dpp:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v3, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %result = call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 %id.x, i32 2)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @default_stratergy(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: default_stratergy:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX8DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[2:3]
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8DAGISEL-NEXT:    flat_store_dword v[2:3], v0
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: default_stratergy:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX8GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[2:3]
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dword v[2:3], v0
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: default_stratergy:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[2:3]
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: default_stratergy:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[2:3]
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: default_stratergy:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1064DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1064DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1064DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1064DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1064DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_or_b32_e32 v2, v1, v2
+; GFX1064DAGISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1064DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1064DAGISEL-NEXT:    global_store_dword v0, v4, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: default_stratergy:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1064GISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1064GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1064GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1064GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1064GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_or_b32_e32 v2, v1, v2
+; GFX1064GISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1064GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    global_store_dword v4, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: default_stratergy:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1032DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1032DAGISEL-NEXT:    global_store_dword v0, v3, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: default_stratergy:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1032GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032GISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: default_stratergy:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1164DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_or_b32_e32 v2, v1, v2
+; GFX1164DAGISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v4, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: default_stratergy:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1164GISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_or_b32_e32 v2, v1, v2
+; GFX1164GISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b32 v4, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: default_stratergy:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s3
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v3, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: default_stratergy:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v3, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %result = call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 %id.x, i32 0)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
 define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8DAGISEL-LABEL: divergent_cfg:
 ; GFX8DAGISEL:       ; %bb.0: ; %entry
@@ -382,20 +1160,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX8DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX8DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX8DAGISEL-NEXT:    s_or_b32 s6, s6, s8
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX8DAGISEL-NEXT:  ; %bb.5:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX8DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX8DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX8DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -410,26 +1188,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX8GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
 ; GFX8GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b32 s6, s2
-; GFX8GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX8GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX8GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX8GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX8GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX8GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX8GISEL-NEXT:    s_or_b32 s6, s6, s8
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX8GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX8GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s6
@@ -453,20 +1231,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX9DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX9DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX9DAGISEL-NEXT:    s_or_b32 s6, s6, s8
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX9DAGISEL-NEXT:  ; %bb.5:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX9DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX9DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -480,26 +1258,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX9GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
 ; GFX9GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b32 s6, s2
-; GFX9GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX9GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX9GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX9GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX9GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX9GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX9GISEL-NEXT:    s_or_b32 s6, s6, s8
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX9GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX9GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -522,20 +1300,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1064DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX1064DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1064DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1064DAGISEL-NEXT:    s_or_b32 s6, s6, s8
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX1064DAGISEL-NEXT:  ; %bb.5:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1064DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX1064DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX1064DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -549,26 +1327,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
 ; GFX1064GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b32 s6, s2
-; GFX1064GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1064GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1064GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1064GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1064GISEL-NEXT:    s_or_b32 s6, s6, s8
 ; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX1064GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1064GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -591,20 +1369,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX1032DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s1, 0
-; GFX1032DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032DAGISEL-NEXT:    s_or_b32 s1, s1, s6
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX1032DAGISEL-NEXT:  ; %bb.5:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1032DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX1032DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX1032DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -618,26 +1396,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr0
 ; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032GISEL-NEXT:    s_load_dword s0, s[4:5], 0x2c
 ; GFX1032GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b32 s0, s0
-; GFX1032GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1032GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s1, s1
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1032GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX1032GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032GISEL-NEXT:    s_or_b32 s0, s0, s6
 ; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX1032GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1032GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1032GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -662,21 +1440,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX1164DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1164DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1164DAGISEL-NEXT:    s_or_b32 s6, s6, s8
 ; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX1164DAGISEL-NEXT:  ; %bb.5:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1164DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX1164DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX1164DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -692,27 +1470,27 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
 ; GFX1164GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b32 s6, s2
-; GFX1164GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1164GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[0:1], s[0:1]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1164GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1164GISEL-NEXT:    s_or_b32 s6, s6, s8
 ; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX1164GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1164GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -737,21 +1515,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1132DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX1132DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s1, 0
-; GFX1132DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132DAGISEL-NEXT:    s_or_b32 s1, s1, s6
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX1132DAGISEL-NEXT:  ; %bb.5:
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1132DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX1132DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX1132DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -767,27 +1545,27 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1132GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x2c
 ; GFX1132GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b32 s0, s0
-; GFX1132GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1132GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s1, s1
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX1132GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132GISEL-NEXT:    s_or_b32 s0, s0, s6
 ; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX1132GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1132GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1132GISEL-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
@@ -925,14 +1703,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX8DAGISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX8DAGISEL-NEXT:  ; %bb.2:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -945,14 +1723,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX8GISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX8GISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX8GISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX8GISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX8GISEL-NEXT:  ; %bb.2:
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -965,14 +1743,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX9DAGISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX9DAGISEL-NEXT:  ; %bb.2:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -985,14 +1763,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX9GISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX9GISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX9GISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX9GISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX9GISEL-NEXT:  ; %bb.2:
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1005,14 +1783,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX1064DAGISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1064DAGISEL-NEXT:  ; %bb.2:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1024,14 +1802,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX1064GISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX1064GISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1064GISEL-NEXT:  ; %bb.2:
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1043,14 +1821,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1032DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s8, v2, s7
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s9, v3, s7
 ; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s6, s7
 ; GFX1032DAGISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1032DAGISEL-NEXT:  ; %bb.2:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1062,14 +1840,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1032GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s8, v2, s7
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s9, v3, s7
 ; GFX1032GISEL-NEXT:    s_bitset0_b32 s6, s7
 ; GFX1032GISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1032GISEL-NEXT:  ; %bb.2:
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1081,7 +1859,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s6, s[2:3]
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s4, v2, s6
@@ -1089,7 +1867,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s6
 ; GFX1164DAGISEL-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1164DAGISEL-NEXT:  ; %bb.2:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -1101,7 +1879,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s6, s[2:3]
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s4, v2, s6
@@ -1109,7 +1887,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s6
 ; GFX1164GISEL-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1164GISEL-NEXT:  ; %bb.2:
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -1121,7 +1899,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX1132DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s4, v2, s3
@@ -1129,7 +1907,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132DAGISEL-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1132DAGISEL-NEXT:  ; %bb.2:
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1132DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -1140,7 +1918,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX1132GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s4, v2, s3
@@ -1148,7 +1926,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132GISEL-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1132GISEL-NEXT:  ; %bb.2:
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1132GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -1190,19 +1968,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX8GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX8GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX8GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX8GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX8GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GFX8GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX8GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -1239,19 +2017,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX9GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX9GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX9GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX9GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX9GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
-; GFX9GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX9GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -1288,19 +2066,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1064GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
-; GFX1064GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX1064GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -1337,19 +2115,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; GFX1032GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1032GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s2, s8
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
-; GFX1032GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX1032GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -1390,19 +2168,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1164GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1164GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[2:3], s[8:9]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GFX1164GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX1164GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -1441,19 +2219,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1132GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1132GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1132GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s2, s8
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GFX1132GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX1132GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll
index f094213731684..5b2b5f19bfd58 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll
@@ -478,6 +478,956 @@ entry:
   ret void
 }
 
+define amdgpu_kernel void @uniform_value_dpp(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: uniform_value_dpp:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_sub_i32 s3, 0, s6
+; GFX8DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value_dpp:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_sub_i32 s3, 0, s6
+; GFX8GISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value_dpp:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_sub_i32 s3, 0, s6
+; GFX9DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value_dpp:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_sub_i32 s3, 0, s6
+; GFX9GISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: uniform_value_dpp:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_clause 0x1
+; GFX1064DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_sub_i32 s3, 0, s6
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: uniform_value_dpp:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_clause 0x1
+; GFX1064GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_sub_i32 s3, 0, s6
+; GFX1064GISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: uniform_value_dpp:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_clause 0x1
+; GFX1032DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_sub_i32 s2, 0, s2
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s2, s2, s3
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: uniform_value_dpp:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_clause 0x1
+; GFX1032GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_sub_i32 s2, 0, s2
+; GFX1032GISEL-NEXT:    s_mul_i32 s2, s2, s3
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value_dpp:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_clause 0x1
+; GFX1164DAGISEL-NEXT:    s_load_b32 s6, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_sub_i32 s3, 0, s6
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value_dpp:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_clause 0x1
+; GFX1164GISEL-NEXT:    s_load_b32 s6, s[4:5], 0x2c
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_sub_i32 s3, 0, s6
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value_dpp:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_clause 0x1
+; GFX1132DAGISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_sub_i32 s2, 0, s2
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s2, s2, s3
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value_dpp:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_clause 0x1
+; GFX1132GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_sub_i32 s2, 0, s2
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    s_mul_i32 s2, s2, s3
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+;
+; GFX12DAGISEL-LABEL: uniform_value_dpp:
+; GFX12DAGISEL:       ; %bb.0: ; %entry
+; GFX12DAGISEL-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12DAGISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX12DAGISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12DAGISEL-NEXT:    s_sub_co_i32 s2, 0, s2
+; GFX12DAGISEL-NEXT:    s_mul_i32 s2, s2, s3
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12DAGISEL-NEXT:    s_endpgm
+entry:
+  %result = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 %in, i32 2)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @divergent_value_dpp(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: divergent_value_dpp:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX8DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[2:3]
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX8DAGISEL-NEXT:    s_sub_i32 s2, 0, s4
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8DAGISEL-NEXT:    flat_store_dword v[2:3], v0
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_value_dpp:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX8GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[2:3]
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX8GISEL-NEXT:    s_sub_i32 s2, 0, s4
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dword v[2:3], v0
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_value_dpp:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[2:3]
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9DAGISEL-NEXT:    s_sub_i32 s2, 0, s4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_value_dpp:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[2:3]
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9GISEL-NEXT:    s_sub_i32 s2, 0, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_value_dpp:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1064DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_sub_nc_i32 v2, v1, v2
+; GFX1064DAGISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_sub_nc_i32 v1, v1, v2
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1064DAGISEL-NEXT:    s_sub_i32 s2, 0, s4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v4, s2
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    global_store_dword v0, v4, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_value_dpp:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1064GISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1064GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1064GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1064GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1064GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_sub_nc_i32 v2, v1, v2
+; GFX1064GISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_sub_nc_i32 v1, v1, v2
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1064GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1064GISEL-NEXT:    s_sub_i32 s2, 0, s4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    global_store_dword v4, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_value_dpp:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1032DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    v_sub_nc_i32 v1, v1, v2
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032DAGISEL-NEXT:    s_sub_i32 s2, 0, s3
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s2
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    global_store_dword v0, v3, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_value_dpp:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1032GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    v_sub_nc_i32 v1, v1, v2
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032GISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032GISEL-NEXT:    s_sub_i32 s2, 0, s3
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_value_dpp:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1164DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_sub_nc_i32 v2, v1, v2
+; GFX1164DAGISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_sub_nc_i32 v1, v1, v2
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_sub_i32 s2, 0, s4
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v4, s2
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v4, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_value_dpp:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1164GISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_sub_nc_i32 v2, v1, v2
+; GFX1164GISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_sub_nc_i32 v1, v1, v2
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1164GISEL-NEXT:    s_sub_i32 s2, 0, s4
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b32 v4, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_value_dpp:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_sub_nc_i32 v1, v1, v2
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132DAGISEL-NEXT:    s_sub_i32 s2, 0, s3
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s2
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v3, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_value_dpp:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_sub_nc_i32 v1, v1, v2
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132GISEL-NEXT:    s_sub_i32 s2, 0, s3
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s2
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+;
+; GFX12DAGISEL-LABEL: divergent_value_dpp:
+; GFX12DAGISEL:       ; %bb.0: ; %entry
+; GFX12DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX12DAGISEL-NEXT:    s_wait_dscnt 0x0
+; GFX12DAGISEL-NEXT:    v_sub_nc_i32 v1, v1, v2
+; GFX12DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX12DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12DAGISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX12DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX12DAGISEL-NEXT:    s_sub_co_i32 s2, 0, s3
+; GFX12DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s2
+; GFX12DAGISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12DAGISEL-NEXT:    global_store_b32 v0, v3, s[0:1]
+; GFX12DAGISEL-NEXT:    s_endpgm
+entry:
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %result = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 %id.x, i32 2)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @default_stratergy(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: default_stratergy:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX8DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[2:3]
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX8DAGISEL-NEXT:    s_sub_i32 s2, 0, s4
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8DAGISEL-NEXT:    flat_store_dword v[2:3], v0
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: default_stratergy:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX8GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[2:3]
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX8GISEL-NEXT:    s_sub_i32 s2, 0, s4
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dword v[2:3], v0
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: default_stratergy:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[2:3]
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9DAGISEL-NEXT:    s_sub_i32 s2, 0, s4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: default_stratergy:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[2:3]
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9GISEL-NEXT:    s_sub_i32 s2, 0, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: default_stratergy:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1064DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_sub_nc_i32 v2, v1, v2
+; GFX1064DAGISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_sub_nc_i32 v1, v1, v2
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1064DAGISEL-NEXT:    s_sub_i32 s2, 0, s4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v4, s2
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    global_store_dword v0, v4, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: default_stratergy:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1064GISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1064GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1064GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1064GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1064GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_sub_nc_i32 v2, v1, v2
+; GFX1064GISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_sub_nc_i32 v1, v1, v2
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1064GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1064GISEL-NEXT:    s_sub_i32 s2, 0, s4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    global_store_dword v4, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: default_stratergy:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1032DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    v_sub_nc_i32 v1, v1, v2
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032DAGISEL-NEXT:    s_sub_i32 s2, 0, s3
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s2
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    global_store_dword v0, v3, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: default_stratergy:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1032GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    v_sub_nc_i32 v1, v1, v2
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032GISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032GISEL-NEXT:    s_sub_i32 s2, 0, s3
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: default_stratergy:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1164DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_sub_nc_i32 v2, v1, v2
+; GFX1164DAGISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_sub_nc_i32 v1, v1, v2
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_sub_i32 s2, 0, s4
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v4, s2
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v4, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: default_stratergy:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1164GISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_sub_nc_i32 v2, v1, v2
+; GFX1164GISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_sub_nc_i32 v1, v1, v2
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1164GISEL-NEXT:    s_sub_i32 s2, 0, s4
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b32 v4, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: default_stratergy:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_sub_nc_i32 v1, v1, v2
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132DAGISEL-NEXT:    s_sub_i32 s2, 0, s3
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s2
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v3, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: default_stratergy:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_sub_nc_i32 v1, v1, v2
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132GISEL-NEXT:    s_sub_i32 s2, 0, s3
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s2
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+;
+; GFX12DAGISEL-LABEL: default_stratergy:
+; GFX12DAGISEL:       ; %bb.0: ; %entry
+; GFX12DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX12DAGISEL-NEXT:    s_wait_dscnt 0x0
+; GFX12DAGISEL-NEXT:    v_sub_nc_i32 v1, v1, v2
+; GFX12DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX12DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12DAGISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX12DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX12DAGISEL-NEXT:    s_sub_co_i32 s2, 0, s3
+; GFX12DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s2
+; GFX12DAGISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12DAGISEL-NEXT:    global_store_b32 v0, v3, s[0:1]
+; GFX12DAGISEL-NEXT:    s_endpgm
+entry:
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %result = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 %id.x, i32 0)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
 define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8DAGISEL-LABEL: divergent_cfg:
 ; GFX8DAGISEL:       ; %bb.0: ; %entry
@@ -485,7 +1435,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8DAGISEL-NEXT:    ; implicit-def: $sgpr2
 ; GFX8DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX8DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -494,24 +1444,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    s_sub_i32 s3, 0, s6
 ; GFX8DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
-; GFX8DAGISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX8DAGISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX8DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX8DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX8DAGISEL-NEXT:    s_sub_i32 s6, s6, s8
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX8DAGISEL-NEXT:  ; %bb.5:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX8DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX8DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX8DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -526,7 +1476,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX8GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -535,20 +1485,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_sub_i32 s3, 0, s6
 ; GFX8GISEL-NEXT:    s_mul_i32 s6, s3, s2
-; GFX8GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX8GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX8GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX8GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX8GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX8GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX8GISEL-NEXT:    s_sub_i32 s6, s6, s8
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX8GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX8GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s6
@@ -564,7 +1514,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9DAGISEL-NEXT:    ; implicit-def: $sgpr2
 ; GFX9DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX9DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -573,24 +1523,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    s_sub_i32 s3, 0, s6
 ; GFX9DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
-; GFX9DAGISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX9DAGISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX9DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX9DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX9DAGISEL-NEXT:    s_sub_i32 s6, s6, s8
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX9DAGISEL-NEXT:  ; %bb.5:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX9DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX9DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -604,7 +1554,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX9GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -613,20 +1563,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_sub_i32 s3, 0, s6
 ; GFX9GISEL-NEXT:    s_mul_i32 s6, s3, s2
-; GFX9GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX9GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX9GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX9GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX9GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX9GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX9GISEL-NEXT:    s_sub_i32 s6, s6, s8
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX9GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX9GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -641,7 +1591,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064DAGISEL-NEXT:    ; implicit-def: $sgpr2
 ; GFX1064DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1064DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -650,24 +1600,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    s_sub_i32 s3, 0, s6
 ; GFX1064DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
-; GFX1064DAGISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1064DAGISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1064DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX1064DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1064DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1064DAGISEL-NEXT:    s_sub_i32 s6, s6, s8
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX1064DAGISEL-NEXT:  ; %bb.5:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1064DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX1064DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX1064DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -681,7 +1631,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -690,20 +1640,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_sub_i32 s3, 0, s6
 ; GFX1064GISEL-NEXT:    s_mul_i32 s6, s3, s2
-; GFX1064GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1064GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1064GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1064GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1064GISEL-NEXT:    s_sub_i32 s6, s6, s8
 ; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX1064GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1064GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -718,7 +1668,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032DAGISEL-NEXT:    ; implicit-def: $sgpr1
 ; GFX1032DAGISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032DAGISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1032DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032DAGISEL-NEXT:    s_load_dword s1, s[4:5], 0x2c
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
@@ -727,24 +1677,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    s_sub_i32 s1, 0, s1
 ; GFX1032DAGISEL-NEXT:    s_mul_i32 s1, s1, s2
-; GFX1032DAGISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1032DAGISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s0, s0
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX1032DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s1, 0
-; GFX1032DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032DAGISEL-NEXT:    s_sub_i32 s1, s1, s6
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX1032DAGISEL-NEXT:  ; %bb.5:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1032DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX1032DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX1032DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -758,7 +1708,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr0
 ; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032GISEL-NEXT:    s_load_dword s0, s[4:5], 0x2c
 ; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
@@ -767,20 +1717,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_sub_i32 s0, 0, s0
 ; GFX1032GISEL-NEXT:    s_mul_i32 s0, s0, s2
-; GFX1032GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1032GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s1, s1
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1032GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX1032GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032GISEL-NEXT:    s_sub_i32 s0, s0, s6
 ; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX1032GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1032GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1032GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -797,7 +1747,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
 ; GFX1164DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1164DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164DAGISEL-NEXT:    s_load_b32 s6, s[4:5], 0x2c
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -807,25 +1757,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    s_sub_i32 s3, 0, s6
 ; GFX1164DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
-; GFX1164DAGISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1164DAGISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX1164DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1164DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1164DAGISEL-NEXT:    s_sub_i32 s6, s6, s8
 ; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX1164DAGISEL-NEXT:  ; %bb.5:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1164DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX1164DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX1164DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -841,7 +1791,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164GISEL-NEXT:    s_load_b32 s6, s[4:5], 0x2c
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -851,21 +1801,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_sub_i32 s3, 0, s6
 ; GFX1164GISEL-NEXT:    s_mul_i32 s6, s3, s2
-; GFX1164GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1164GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[0:1], s[0:1]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1164GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1164GISEL-NEXT:    s_sub_i32 s6, s6, s8
 ; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX1164GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1164GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -882,7 +1832,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
 ; GFX1132DAGISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1132DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132DAGISEL-NEXT:    s_load_b32 s1, s[4:5], 0x2c
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
@@ -892,25 +1842,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    s_sub_i32 s1, 0, s1
 ; GFX1132DAGISEL-NEXT:    s_mul_i32 s1, s1, s2
-; GFX1132DAGISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1132DAGISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s0, s0
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1132DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX1132DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s1, 0
-; GFX1132DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132DAGISEL-NEXT:    s_sub_i32 s1, s1, s6
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX1132DAGISEL-NEXT:  ; %bb.5:
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1132DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX1132DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX1132DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -926,7 +1876,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1132GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x2c
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
@@ -936,21 +1886,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_sub_i32 s0, 0, s0
 ; GFX1132GISEL-NEXT:    s_mul_i32 s0, s0, s2
-; GFX1132GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1132GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s1, s1
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX1132GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132GISEL-NEXT:    s_sub_i32 s0, s0, s6
 ; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX1132GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1132GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1132GISEL-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
@@ -966,7 +1916,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
 ; GFX12DAGISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX12DAGISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX12DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX12DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX12DAGISEL-NEXT:    s_load_b32 s1, s[4:5], 0x2c
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
@@ -976,15 +1926,15 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX12DAGISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12DAGISEL-NEXT:    s_sub_co_i32 s1, 0, s1
 ; GFX12DAGISEL-NEXT:    s_mul_i32 s1, s1, s2
-; GFX12DAGISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX12DAGISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX12DAGISEL-NEXT:    s_or_saveexec_b32 s0, s0
 ; GFX12DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX12DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX12DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s1, 0
-; GFX12DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX12DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
@@ -992,10 +1942,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX12DAGISEL-NEXT:    s_sub_co_i32 s1, s1, s6
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX12DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX12DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX12DAGISEL-NEXT:  ; %bb.5:
 ; GFX12DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX12DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX12DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX12DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -1300,7 +2250,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s8, s[6:7]
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s9, v2, s8
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s10, v3, s8
@@ -1308,7 +2258,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s8
 ; GFX8DAGISEL-NEXT:    s_subb_u32 s5, s5, s10
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX8DAGISEL-NEXT:  ; %bb.2:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1321,7 +2271,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s8, s[6:7]
 ; GFX8GISEL-NEXT:    v_readlane_b32 s9, v2, s8
 ; GFX8GISEL-NEXT:    v_readlane_b32 s10, v3, s8
@@ -1329,7 +2279,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8GISEL-NEXT:    s_bitset0_b64 s[6:7], s8
 ; GFX8GISEL-NEXT:    s_subb_u32 s5, s5, s10
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX8GISEL-NEXT:  ; %bb.2:
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1342,7 +2292,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s8, s[6:7]
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s9, v2, s8
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s10, v3, s8
@@ -1350,7 +2300,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s8
 ; GFX9DAGISEL-NEXT:    s_subb_u32 s5, s5, s10
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX9DAGISEL-NEXT:  ; %bb.2:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1363,7 +2313,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s8, s[6:7]
 ; GFX9GISEL-NEXT:    v_readlane_b32 s9, v2, s8
 ; GFX9GISEL-NEXT:    v_readlane_b32 s10, v3, s8
@@ -1371,7 +2321,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9GISEL-NEXT:    s_bitset0_b64 s[6:7], s8
 ; GFX9GISEL-NEXT:    s_subb_u32 s5, s5, s10
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX9GISEL-NEXT:  ; %bb.2:
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1384,7 +2334,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s8, s[6:7]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s9, v2, s8
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s10, v3, s8
@@ -1392,7 +2342,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064DAGISEL-NEXT:    s_sub_u32 s4, s4, s9
 ; GFX1064DAGISEL-NEXT:    s_subb_u32 s5, s5, s10
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1064DAGISEL-NEXT:  ; %bb.2:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1404,7 +2354,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s8, s[6:7]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s9, v2, s8
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s10, v3, s8
@@ -1412,7 +2362,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064GISEL-NEXT:    s_sub_u32 s4, s4, s9
 ; GFX1064GISEL-NEXT:    s_subb_u32 s5, s5, s10
 ; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1064GISEL-NEXT:  ; %bb.2:
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1424,7 +2374,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1032DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s8, v2, s7
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s9, v3, s7
@@ -1432,7 +2382,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032DAGISEL-NEXT:    s_sub_u32 s4, s4, s8
 ; GFX1032DAGISEL-NEXT:    s_subb_u32 s5, s5, s9
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1032DAGISEL-NEXT:  ; %bb.2:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1444,7 +2394,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1032GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s8, v2, s7
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s9, v3, s7
@@ -1452,7 +2402,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032GISEL-NEXT:    s_sub_u32 s4, s4, s8
 ; GFX1032GISEL-NEXT:    s_subb_u32 s5, s5, s9
 ; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1032GISEL-NEXT:  ; %bb.2:
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1464,7 +2414,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s4, s[2:3]
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s5, v2, s4
@@ -1473,7 +2423,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164DAGISEL-NEXT:    s_sub_u32 s0, s0, s5
 ; GFX1164DAGISEL-NEXT:    s_subb_u32 s1, s1, s6
 ; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1164DAGISEL-NEXT:  ; %bb.2:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -1485,7 +2435,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s4, s[2:3]
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s5, v2, s4
@@ -1494,7 +2444,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164GISEL-NEXT:    s_sub_u32 s0, s0, s5
 ; GFX1164GISEL-NEXT:    s_subb_u32 s1, s1, s6
 ; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1164GISEL-NEXT:  ; %bb.2:
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -1506,7 +2456,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX1132DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s4, v2, s3
@@ -1515,7 +2465,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132DAGISEL-NEXT:    s_sub_u32 s0, s0, s4
 ; GFX1132DAGISEL-NEXT:    s_subb_u32 s1, s1, s5
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1132DAGISEL-NEXT:  ; %bb.2:
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1132DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -1526,7 +2476,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX1132GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s4, v2, s3
@@ -1535,7 +2485,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132GISEL-NEXT:    s_sub_u32 s0, s0, s4
 ; GFX1132GISEL-NEXT:    s_subb_u32 s1, s1, s5
 ; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1132GISEL-NEXT:  ; %bb.2:
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1132GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -1550,7 +2500,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX12DAGISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12DAGISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX12DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
@@ -1560,7 +2510,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX12DAGISEL-NEXT:    s_sub_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX12DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX12DAGISEL-NEXT:  ; %bb.2:
 ; GFX12DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
@@ -1581,7 +2531,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8DAGISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX8DAGISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX8DAGISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX8DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
@@ -1594,13 +2544,13 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8DAGISEL-NEXT:    s_mul_i32 s3, s3, s7
 ; GFX8DAGISEL-NEXT:    s_add_u32 s2, s2, s3
 ; GFX8DAGISEL-NEXT:    s_add_u32 s7, s2, s10
-; GFX8DAGISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX8DAGISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[8:9]
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX8DAGISEL-NEXT:    s_xor_b64 exec, exec, s[2:3]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX8DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
@@ -1614,7 +2564,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8DAGISEL-NEXT:    s_add_u32 s7, s4, s8
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s7
-; GFX8DAGISEL-NEXT:  .LBB5_4: ; %endif
+; GFX8DAGISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX8DAGISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -1628,7 +2578,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX8GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX8GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
@@ -1641,10 +2591,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8GISEL-NEXT:    s_mul_i32 s3, s3, s7
 ; GFX8GISEL-NEXT:    s_add_u32 s2, s2, s3
 ; GFX8GISEL-NEXT:    s_add_u32 s7, s2, s10
-; GFX8GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX8GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX8GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
@@ -1658,7 +2608,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8GISEL-NEXT:    s_mul_i32 s5, s5, s7
 ; GFX8GISEL-NEXT:    s_add_u32 s4, s4, s5
 ; GFX8GISEL-NEXT:    s_add_u32 s7, s4, s8
-; GFX8GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX8GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -1675,7 +2625,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9DAGISEL-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX9DAGISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX9DAGISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX9DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
@@ -1688,13 +2638,13 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9DAGISEL-NEXT:    s_mul_i32 s3, s3, s5
 ; GFX9DAGISEL-NEXT:    s_add_u32 s2, s2, s3
 ; GFX9DAGISEL-NEXT:    s_add_u32 s5, s2, s10
-; GFX9DAGISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX9DAGISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[8:9]
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9DAGISEL-NEXT:    s_xor_b64 exec, exec, s[2:3]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX9DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
@@ -1708,7 +2658,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9DAGISEL-NEXT:    s_add_u32 s5, s5, s8
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9DAGISEL-NEXT:  .LBB5_4: ; %endif
+; GFX9DAGISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX9DAGISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9DAGISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -1721,7 +2671,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX9GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX9GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
@@ -1734,10 +2684,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9GISEL-NEXT:    s_mul_i32 s3, s3, s7
 ; GFX9GISEL-NEXT:    s_add_u32 s2, s2, s3
 ; GFX9GISEL-NEXT:    s_add_u32 s7, s2, s10
-; GFX9GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX9GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX9GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9GISEL-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
 ; GFX9GISEL-NEXT:    s_mov_b64 s[4:5], exec
@@ -1751,7 +2701,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9GISEL-NEXT:    s_mul_i32 s5, s8, s5
 ; GFX9GISEL-NEXT:    s_add_u32 s4, s7, s4
 ; GFX9GISEL-NEXT:    s_add_u32 s7, s4, s5
-; GFX9GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX9GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -1768,7 +2718,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064DAGISEL-NEXT:    ; implicit-def: $sgpr8_sgpr9
 ; GFX1064DAGISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064DAGISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1064DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s8, s[8:9]
@@ -1781,7 +2731,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064DAGISEL-NEXT:    s_add_u32 s3, s10, s3
 ; GFX1064DAGISEL-NEXT:    s_mul_i32 s8, s2, s8
 ; GFX1064DAGISEL-NEXT:    s_add_u32 s9, s3, s9
-; GFX1064DAGISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1064DAGISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[4:5]
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s8
@@ -1813,7 +2763,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
@@ -1826,10 +2776,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064GISEL-NEXT:    s_add_u32 s3, s10, s3
 ; GFX1064GISEL-NEXT:    s_mul_i32 s6, s2, s6
 ; GFX1064GISEL-NEXT:    s_add_u32 s7, s3, s7
-; GFX1064GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1064GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[4:5], exec
@@ -1843,7 +2793,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064GISEL-NEXT:    s_add_u32 s7, s8, s7
 ; GFX1064GISEL-NEXT:    s_mul_i32 s6, s6, s4
 ; GFX1064GISEL-NEXT:    s_add_u32 s7, s7, s5
-; GFX1064GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX1064GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -1860,7 +2810,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032DAGISEL-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX1032DAGISEL-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; GFX1032DAGISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1032DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s4, exec_lo
 ; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s4, s4
@@ -1873,7 +2823,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032DAGISEL-NEXT:    s_add_u32 s3, s9, s3
 ; GFX1032DAGISEL-NEXT:    s_mul_i32 s4, s2, s4
 ; GFX1032DAGISEL-NEXT:    s_add_u32 s5, s3, s5
-; GFX1032DAGISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1032DAGISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s2, s8
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
@@ -1905,7 +2855,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; GFX1032GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s6, s6
@@ -1918,10 +2868,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032GISEL-NEXT:    s_add_u32 s3, s9, s3
 ; GFX1032GISEL-NEXT:    s_mul_i32 s6, s2, s6
 ; GFX1032GISEL-NEXT:    s_add_u32 s7, s3, s7
-; GFX1032GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1032GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s2, s8
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1032GISEL-NEXT:    s_mov_b32 s3, exec_lo
@@ -1935,7 +2885,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032GISEL-NEXT:    s_add_u32 s5, s5, s7
 ; GFX1032GISEL-NEXT:    s_mul_i32 s6, s6, s3
 ; GFX1032GISEL-NEXT:    s_add_u32 s7, s5, s4
-; GFX1032GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX1032GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -1954,7 +2904,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
 ; GFX1164DAGISEL-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
-; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1164DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -1969,7 +2919,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164DAGISEL-NEXT:    s_add_u32 s3, s10, s3
 ; GFX1164DAGISEL-NEXT:    s_mul_i32 s8, s2, s8
 ; GFX1164DAGISEL-NEXT:    s_add_u32 s9, s3, s9
-; GFX1164DAGISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1164DAGISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[6:7]
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s8
@@ -2005,7 +2955,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -2020,10 +2970,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164GISEL-NEXT:    s_add_u32 s3, s10, s3
 ; GFX1164GISEL-NEXT:    s_mul_i32 s6, s2, s6
 ; GFX1164GISEL-NEXT:    s_add_u32 s7, s3, s7
-; GFX1164GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1164GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[2:3], s[8:9]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], exec
@@ -2039,7 +2989,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164GISEL-NEXT:    s_add_u32 s5, s8, s5
 ; GFX1164GISEL-NEXT:    s_mul_i32 s6, s4, s6
 ; GFX1164GISEL-NEXT:    s_add_u32 s7, s5, s7
-; GFX1164GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX1164GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -2058,7 +3008,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
 ; GFX1132DAGISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1132DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -2073,7 +3023,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132DAGISEL-NEXT:    s_add_u32 s3, s9, s3
 ; GFX1132DAGISEL-NEXT:    s_mul_i32 s6, s2, s6
 ; GFX1132DAGISEL-NEXT:    s_add_u32 s7, s3, s7
-; GFX1132DAGISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1132DAGISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s2, s8
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
@@ -2108,7 +3058,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1132GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132GISEL-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -2123,10 +3073,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132GISEL-NEXT:    s_add_u32 s3, s9, s3
 ; GFX1132GISEL-NEXT:    s_mul_i32 s6, s2, s6
 ; GFX1132GISEL-NEXT:    s_add_u32 s7, s3, s7
-; GFX1132GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1132GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s2, s8
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX1132GISEL-NEXT:    s_mov_b32 s3, exec_lo
@@ -2142,7 +3092,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132GISEL-NEXT:    s_add_u32 s5, s7, s5
 ; GFX1132GISEL-NEXT:    s_mul_i32 s6, s4, s3
 ; GFX1132GISEL-NEXT:    s_add_u32 s7, s5, s8
-; GFX1132GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX1132GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
@@ -2160,7 +3110,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX12DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
 ; GFX12DAGISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX12DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX12DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX12DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX12DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX12DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -2175,7 +3125,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX12DAGISEL-NEXT:    s_add_co_u32 s3, s9, s3
 ; GFX12DAGISEL-NEXT:    s_mul_i32 s6, s2, s6
 ; GFX12DAGISEL-NEXT:    s_add_co_u32 s7, s3, s7
-; GFX12DAGISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX12DAGISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX12DAGISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12DAGISEL-NEXT:    s_or_saveexec_b32 s2, s8
 ; GFX12DAGISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
index 54c8e2e248f57..0af93a54fb52d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
@@ -368,6 +368,784 @@ entry:
   ret void
 }
 
+define amdgpu_kernel void @uniform_value_dpp(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: uniform_value_dpp:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value_dpp:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value_dpp:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value_dpp:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX10DAGISEL-LABEL: uniform_value_dpp:
+; GFX10DAGISEL:       ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT:    s_clause 0x1
+; GFX10DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX10DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10DAGISEL-NEXT:    s_endpgm
+;
+; GFX10GISEL-LABEL: uniform_value_dpp:
+; GFX10GISEL:       ; %bb.0: ; %entry
+; GFX10GISEL-NEXT:    s_clause 0x1
+; GFX10GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX10GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value_dpp:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_clause 0x1
+; GFX1164DAGISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value_dpp:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_clause 0x1
+; GFX1164GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value_dpp:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_clause 0x1
+; GFX1132DAGISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value_dpp:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_clause 0x1
+; GFX1132GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %in, i32 2)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @divergent_value_dpp(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: divergent_value_dpp:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX8DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[2:3]
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8DAGISEL-NEXT:    flat_store_dword v[2:3], v0
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_value_dpp:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX8GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[2:3]
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dword v[2:3], v0
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_value_dpp:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[2:3]
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_value_dpp:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[2:3]
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_value_dpp:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1064DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1064DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1064DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1064DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1064DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_max_u32_e32 v2, v1, v2
+; GFX1064DAGISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1064DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_max_u32_e32 v1, v1, v2
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1064DAGISEL-NEXT:    global_store_dword v0, v4, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_value_dpp:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1064GISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1064GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1064GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1064GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1064GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_max_u32_e32 v2, v1, v2
+; GFX1064GISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_max_u32_e32 v1, v1, v2
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1064GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    global_store_dword v4, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_value_dpp:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1032DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    v_max_u32_e32 v1, v1, v2
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1032DAGISEL-NEXT:    global_store_dword v0, v3, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_value_dpp:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1032GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    v_max_u32_e32 v1, v1, v2
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032GISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_value_dpp:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1164DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_max_u32_e32 v2, v1, v2
+; GFX1164DAGISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_max_u32_e32 v1, v1, v2
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v4, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_value_dpp:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1164GISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_max_u32_e32 v2, v1, v2
+; GFX1164GISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_max_u32_e32 v1, v1, v2
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b32 v4, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_value_dpp:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_max_u32_e32 v1, v1, v2
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s3
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v3, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_value_dpp:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_max_u32_e32 v1, v1, v2
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v3, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %id.x, i32 2)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @default_stratergy(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: default_stratergy:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX8DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[2:3]
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8DAGISEL-NEXT:    flat_store_dword v[2:3], v0
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: default_stratergy:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX8GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[2:3]
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dword v[2:3], v0
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: default_stratergy:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[2:3]
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: default_stratergy:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[2:3]
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: default_stratergy:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1064DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1064DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1064DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1064DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1064DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_max_u32_e32 v2, v1, v2
+; GFX1064DAGISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1064DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_max_u32_e32 v1, v1, v2
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1064DAGISEL-NEXT:    global_store_dword v0, v4, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: default_stratergy:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1064GISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1064GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1064GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1064GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1064GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_max_u32_e32 v2, v1, v2
+; GFX1064GISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_max_u32_e32 v1, v1, v2
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1064GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    global_store_dword v4, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: default_stratergy:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1032DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    v_max_u32_e32 v1, v1, v2
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1032DAGISEL-NEXT:    global_store_dword v0, v3, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: default_stratergy:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1032GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    v_max_u32_e32 v1, v1, v2
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032GISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: default_stratergy:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1164DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_max_u32_e32 v2, v1, v2
+; GFX1164DAGISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_max_u32_e32 v1, v1, v2
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v4, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: default_stratergy:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1164GISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_max_u32_e32 v2, v1, v2
+; GFX1164GISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_max_u32_e32 v1, v1, v2
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b32 v4, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: default_stratergy:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_max_u32_e32 v1, v1, v2
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s3
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v3, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: default_stratergy:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_max_u32_e32 v1, v1, v2
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v3, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %id.x, i32 0)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
 define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8DAGISEL-LABEL: divergent_cfg:
 ; GFX8DAGISEL:       ; %bb.0: ; %entry
@@ -383,20 +1161,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX8DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX8DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX8DAGISEL-NEXT:    s_max_u32 s6, s6, s8
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX8DAGISEL-NEXT:  ; %bb.5:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX8DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX8DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX8DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -411,26 +1189,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX8GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
 ; GFX8GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b32 s6, s2
-; GFX8GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX8GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX8GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX8GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX8GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX8GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX8GISEL-NEXT:    s_max_u32 s6, s6, s8
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX8GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX8GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s6
@@ -454,20 +1232,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX9DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX9DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX9DAGISEL-NEXT:    s_max_u32 s6, s6, s8
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX9DAGISEL-NEXT:  ; %bb.5:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX9DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX9DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -481,26 +1259,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX9GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
 ; GFX9GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b32 s6, s2
-; GFX9GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX9GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX9GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX9GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX9GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX9GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX9GISEL-NEXT:    s_max_u32 s6, s6, s8
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX9GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX9GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -523,20 +1301,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1064DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX1064DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1064DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1064DAGISEL-NEXT:    s_max_u32 s6, s6, s8
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX1064DAGISEL-NEXT:  ; %bb.5:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1064DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX1064DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX1064DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -550,26 +1328,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
 ; GFX1064GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b32 s6, s2
-; GFX1064GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1064GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1064GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1064GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1064GISEL-NEXT:    s_max_u32 s6, s6, s8
 ; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX1064GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1064GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -592,20 +1370,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX1032DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s1, 0
-; GFX1032DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032DAGISEL-NEXT:    s_max_u32 s1, s1, s6
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX1032DAGISEL-NEXT:  ; %bb.5:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1032DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX1032DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX1032DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -619,26 +1397,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr0
 ; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032GISEL-NEXT:    s_load_dword s0, s[4:5], 0x2c
 ; GFX1032GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b32 s0, s0
-; GFX1032GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1032GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s1, s1
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1032GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX1032GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032GISEL-NEXT:    s_max_u32 s0, s0, s6
 ; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX1032GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1032GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1032GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -663,21 +1441,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX1164DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1164DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1164DAGISEL-NEXT:    s_max_u32 s6, s6, s8
 ; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX1164DAGISEL-NEXT:  ; %bb.5:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1164DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX1164DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX1164DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -693,27 +1471,27 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
 ; GFX1164GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b32 s6, s2
-; GFX1164GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1164GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[0:1], s[0:1]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1164GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1164GISEL-NEXT:    s_max_u32 s6, s6, s8
 ; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX1164GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1164GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -738,21 +1516,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1132DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX1132DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s1, 0
-; GFX1132DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132DAGISEL-NEXT:    s_max_u32 s1, s1, s6
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX1132DAGISEL-NEXT:  ; %bb.5:
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1132DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX1132DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX1132DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -768,27 +1546,27 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1132GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x2c
 ; GFX1132GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b32 s0, s0
-; GFX1132GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1132GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s1, s1
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX1132GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132GISEL-NEXT:    s_max_u32 s0, s0, s6
 ; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX1132GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1132GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1132GISEL-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
@@ -1066,7 +1844,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8DAGISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v5, s5
@@ -1077,7 +1855,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s12
 ; GFX8DAGISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_1
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX8DAGISEL-NEXT:  ; %bb.2:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1090,7 +1868,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8GISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v5, s5
@@ -1101,7 +1879,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8GISEL-NEXT:    s_bitset0_b64 s[6:7], s12
 ; GFX8GISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB6_1
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX8GISEL-NEXT:  ; %bb.2:
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1114,7 +1892,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9DAGISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v5, s5
@@ -1125,7 +1903,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s12
 ; GFX9DAGISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_1
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX9DAGISEL-NEXT:  ; %bb.2:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1138,7 +1916,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9GISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v5, s5
@@ -1149,7 +1927,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9GISEL-NEXT:    s_bitset0_b64 s[6:7], s12
 ; GFX9GISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB6_1
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX9GISEL-NEXT:  ; %bb.2:
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1162,7 +1940,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064DAGISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v5, s5
@@ -1173,7 +1951,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s12
 ; GFX1064DAGISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_1
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX1064DAGISEL-NEXT:  ; %bb.2:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1185,7 +1963,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064GISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v5, s5
@@ -1196,7 +1974,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064GISEL-NEXT:    s_bitset0_b64 s[6:7], s12
 ; GFX1064GISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
 ; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB6_1
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX1064GISEL-NEXT:  ; %bb.2:
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1208,7 +1986,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1032DAGISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v5, s5
@@ -1219,7 +1997,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s6, s7
 ; GFX1032DAGISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_1
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX1032DAGISEL-NEXT:  ; %bb.2:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1231,7 +2009,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1032GISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v5, s5
@@ -1242,7 +2020,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032GISEL-NEXT:    s_bitset0_b32 s6, s7
 ; GFX1032GISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
 ; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB6_1
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX1032GISEL-NEXT:  ; %bb.2:
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1254,7 +2032,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s8, s[2:3]
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v4, s0
@@ -1266,7 +2044,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s8
 ; GFX1164DAGISEL-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
 ; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_1
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX1164DAGISEL-NEXT:  ; %bb.2:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -1278,7 +2056,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s8, s[2:3]
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v4, s0
@@ -1290,7 +2068,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s8
 ; GFX1164GISEL-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
 ; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB6_1
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX1164GISEL-NEXT:  ; %bb.2:
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -1302,7 +2080,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX1132DAGISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
@@ -1313,7 +2091,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132DAGISEL-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_1
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX1132DAGISEL-NEXT:  ; %bb.2:
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1132DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -1324,7 +2102,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX1132GISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
@@ -1335,7 +2113,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132GISEL-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
 ; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB6_1
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX1132GISEL-NEXT:  ; %bb.2:
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1132GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -1376,19 +2154,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX8GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB7_2
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB10_2
 ; GFX8GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX8GISEL-NEXT:  .LBB7_2: ; %Flow
+; GFX8GISEL-NEXT:  .LBB10_2: ; %Flow
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB7_4
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB10_4
 ; GFX8GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GFX8GISEL-NEXT:  .LBB7_4: ; %endif
+; GFX8GISEL-NEXT:  .LBB10_4: ; %endif
 ; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -1425,19 +2203,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX9GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB7_2
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB10_2
 ; GFX9GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX9GISEL-NEXT:  .LBB7_2: ; %Flow
+; GFX9GISEL-NEXT:  .LBB10_2: ; %Flow
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB7_4
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB10_4
 ; GFX9GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
-; GFX9GISEL-NEXT:  .LBB7_4: ; %endif
+; GFX9GISEL-NEXT:  .LBB10_4: ; %endif
 ; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -1474,19 +2252,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB7_2
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB10_2
 ; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064GISEL-NEXT:  .LBB7_2: ; %Flow
+; GFX1064GISEL-NEXT:  .LBB10_2: ; %Flow
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB7_4
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB10_4
 ; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
-; GFX1064GISEL-NEXT:  .LBB7_4: ; %endif
+; GFX1064GISEL-NEXT:  .LBB10_4: ; %endif
 ; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -1523,19 +2301,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; GFX1032GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB7_2
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB10_2
 ; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032GISEL-NEXT:  .LBB7_2: ; %Flow
+; GFX1032GISEL-NEXT:  .LBB10_2: ; %Flow
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s2, s8
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB7_4
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB10_4
 ; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
-; GFX1032GISEL-NEXT:  .LBB7_4: ; %endif
+; GFX1032GISEL-NEXT:  .LBB10_4: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -1576,19 +2354,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB7_2
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB10_2
 ; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1164GISEL-NEXT:  .LBB7_2: ; %Flow
+; GFX1164GISEL-NEXT:  .LBB10_2: ; %Flow
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[2:3], s[8:9]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB7_4
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB10_4
 ; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GFX1164GISEL-NEXT:  .LBB7_4: ; %endif
+; GFX1164GISEL-NEXT:  .LBB10_4: ; %endif
 ; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -1627,19 +2405,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1132GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB7_2
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB10_2
 ; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1132GISEL-NEXT:  .LBB7_2: ; %Flow
+; GFX1132GISEL-NEXT:  .LBB10_2: ; %Flow
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s2, s8
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB7_4
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB10_4
 ; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GFX1132GISEL-NEXT:  .LBB7_4: ; %endif
+; GFX1132GISEL-NEXT:  .LBB10_4: ; %endif
 ; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
index 502ef84449751..c3bac76d78450 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
@@ -368,6 +368,784 @@ entry:
   ret void
 }
 
+define amdgpu_kernel void @uniform_value_dpp(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: uniform_value_dpp:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value_dpp:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value_dpp:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value_dpp:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX10DAGISEL-LABEL: uniform_value_dpp:
+; GFX10DAGISEL:       ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT:    s_clause 0x1
+; GFX10DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX10DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10DAGISEL-NEXT:    s_endpgm
+;
+; GFX10GISEL-LABEL: uniform_value_dpp:
+; GFX10GISEL:       ; %bb.0: ; %entry
+; GFX10GISEL-NEXT:    s_clause 0x1
+; GFX10GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX10GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value_dpp:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_clause 0x1
+; GFX1164DAGISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value_dpp:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_clause 0x1
+; GFX1164GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value_dpp:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_clause 0x1
+; GFX1132DAGISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value_dpp:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_clause 0x1
+; GFX1132GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i32 @llvm.amdgcn.wave.reduce.umin.i32(i32 %in, i32 2)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @divergent_value_dpp(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: divergent_value_dpp:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX8DAGISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[2:3]
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8DAGISEL-NEXT:    flat_store_dword v[2:3], v0
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_value_dpp:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX8GISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[2:3]
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dword v[2:3], v0
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_value_dpp:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9DAGISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[2:3]
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_value_dpp:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9GISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[2:3]
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_value_dpp:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064DAGISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[0:1]
+; GFX1064DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1064DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1064DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1064DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1064DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_min_u32_e32 v2, v1, v2
+; GFX1064DAGISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1064DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_min_u32_e32 v1, v1, v2
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1064DAGISEL-NEXT:    global_store_dword v0, v4, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_value_dpp:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064GISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[0:1]
+; GFX1064GISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1064GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1064GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1064GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1064GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_min_u32_e32 v2, v1, v2
+; GFX1064GISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_min_u32_e32 v1, v1, v2
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1064GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    global_store_dword v4, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_value_dpp:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032DAGISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s0
+; GFX1032DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    v_min_u32_e32 v1, v1, v2
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1032DAGISEL-NEXT:    global_store_dword v0, v3, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_value_dpp:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032GISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s0
+; GFX1032GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    v_min_u32_e32 v1, v1, v2
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032GISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_value_dpp:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[0:1]
+; GFX1164DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_min_u32_e32 v2, v1, v2
+; GFX1164DAGISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_min_u32_e32 v1, v1, v2
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v4, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_value_dpp:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_min_u32_e32 v2, v1, v2
+; GFX1164GISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_min_u32_e32 v1, v1, v2
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b32 v4, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_value_dpp:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s0
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_min_u32_e32 v1, v1, v2
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s3
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v3, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_value_dpp:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s0
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_min_u32_e32 v1, v1, v2
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v3, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %result = call i32 @llvm.amdgcn.wave.reduce.umin.i32(i32 %id.x, i32 2)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @default_stratergy(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: default_stratergy:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX8DAGISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[2:3]
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8DAGISEL-NEXT:    flat_store_dword v[2:3], v0
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: default_stratergy:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX8GISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[2:3]
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dword v[2:3], v0
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: default_stratergy:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9DAGISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[2:3]
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: default_stratergy:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9GISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[2:3]
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: default_stratergy:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064DAGISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[0:1]
+; GFX1064DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1064DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1064DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1064DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1064DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_min_u32_e32 v2, v1, v2
+; GFX1064DAGISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1064DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_min_u32_e32 v1, v1, v2
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1064DAGISEL-NEXT:    global_store_dword v0, v4, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: default_stratergy:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064GISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[0:1]
+; GFX1064GISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1064GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1064GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1064GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1064GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_min_u32_e32 v2, v1, v2
+; GFX1064GISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_min_u32_e32 v1, v1, v2
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1064GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    global_store_dword v4, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: default_stratergy:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032DAGISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s0
+; GFX1032DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    v_min_u32_e32 v1, v1, v2
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1032DAGISEL-NEXT:    global_store_dword v0, v3, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: default_stratergy:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032GISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s0
+; GFX1032GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    v_min_u32_e32 v1, v1, v2
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032GISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: default_stratergy:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[0:1]
+; GFX1164DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_min_u32_e32 v2, v1, v2
+; GFX1164DAGISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_min_u32_e32 v1, v1, v2
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v4, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: default_stratergy:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_min_u32_e32 v2, v1, v2
+; GFX1164GISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_min_u32_e32 v1, v1, v2
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b32 v4, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: default_stratergy:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s0
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_min_u32_e32 v1, v1, v2
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s3
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v3, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: default_stratergy:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s0
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_min_u32_e32 v1, v1, v2
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v3, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %result = call i32 @llvm.amdgcn.wave.reduce.umin.i32(i32 %id.x, i32 0)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
 define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8DAGISEL-LABEL: divergent_cfg:
 ; GFX8DAGISEL:       ; %bb.0: ; %entry
@@ -383,20 +1161,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX8DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8DAGISEL-NEXT:    s_mov_b32 s6, -1
-; GFX8DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX8DAGISEL-NEXT:    s_min_u32 s6, s6, s8
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX8DAGISEL-NEXT:  ; %bb.5:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX8DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX8DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX8DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -411,26 +1189,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX8GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
 ; GFX8GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b32 s6, s2
-; GFX8GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX8GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX8GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8GISEL-NEXT:    s_mov_b32 s6, -1
-; GFX8GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX8GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX8GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX8GISEL-NEXT:    s_min_u32 s6, s6, s8
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX8GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX8GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s6
@@ -454,20 +1232,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX9DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9DAGISEL-NEXT:    s_mov_b32 s6, -1
-; GFX9DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX9DAGISEL-NEXT:    s_min_u32 s6, s6, s8
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX9DAGISEL-NEXT:  ; %bb.5:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX9DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX9DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -481,26 +1259,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX9GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
 ; GFX9GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b32 s6, s2
-; GFX9GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX9GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX9GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9GISEL-NEXT:    s_mov_b32 s6, -1
-; GFX9GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX9GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX9GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX9GISEL-NEXT:    s_min_u32 s6, s6, s8
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX9GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX9GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -523,20 +1301,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1064DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX1064DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064DAGISEL-NEXT:    s_mov_b32 s6, -1
-; GFX1064DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1064DAGISEL-NEXT:    s_min_u32 s6, s6, s8
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX1064DAGISEL-NEXT:  ; %bb.5:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1064DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX1064DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX1064DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -550,26 +1328,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
 ; GFX1064GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b32 s6, s2
-; GFX1064GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1064GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064GISEL-NEXT:    s_mov_b32 s6, -1
-; GFX1064GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1064GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1064GISEL-NEXT:    s_min_u32 s6, s6, s8
 ; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX1064GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1064GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -592,20 +1370,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX1032DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s1, -1
-; GFX1032DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032DAGISEL-NEXT:    s_min_u32 s1, s1, s6
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX1032DAGISEL-NEXT:  ; %bb.5:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1032DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX1032DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX1032DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -619,26 +1397,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr0
 ; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032GISEL-NEXT:    s_load_dword s0, s[4:5], 0x2c
 ; GFX1032GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b32 s0, s0
-; GFX1032GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1032GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s1, s1
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1032GISEL-NEXT:    s_mov_b32 s0, -1
-; GFX1032GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032GISEL-NEXT:    s_min_u32 s0, s0, s6
 ; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX1032GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1032GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1032GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -663,21 +1441,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX1164DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164DAGISEL-NEXT:    s_mov_b32 s6, -1
-; GFX1164DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1164DAGISEL-NEXT:    s_min_u32 s6, s6, s8
 ; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX1164DAGISEL-NEXT:  ; %bb.5:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1164DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX1164DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX1164DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -693,27 +1471,27 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
 ; GFX1164GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b32 s6, s2
-; GFX1164GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1164GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[0:1], s[0:1]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164GISEL-NEXT:    s_mov_b32 s6, -1
-; GFX1164GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1164GISEL-NEXT:    s_min_u32 s6, s6, s8
 ; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX1164GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1164GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -738,21 +1516,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1132DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX1132DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s1, -1
-; GFX1132DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132DAGISEL-NEXT:    s_min_u32 s1, s1, s6
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX1132DAGISEL-NEXT:  ; %bb.5:
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1132DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX1132DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX1132DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -768,27 +1546,27 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1132GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x2c
 ; GFX1132GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b32 s0, s0
-; GFX1132GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1132GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s1, s1
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132GISEL-NEXT:    s_mov_b32 s0, -1
-; GFX1132GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132GISEL-NEXT:    s_min_u32 s0, s0, s6
 ; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX1132GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1132GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1132GISEL-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
@@ -926,7 +1704,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v5, s5
@@ -937,7 +1715,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s12
 ; GFX8DAGISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX8DAGISEL-NEXT:  ; %bb.2:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -950,7 +1728,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v5, s5
@@ -961,7 +1739,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8GISEL-NEXT:    s_bitset0_b64 s[6:7], s12
 ; GFX8GISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX8GISEL-NEXT:  ; %bb.2:
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -974,7 +1752,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v5, s5
@@ -985,7 +1763,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s12
 ; GFX9DAGISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX9DAGISEL-NEXT:  ; %bb.2:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -998,7 +1776,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v5, s5
@@ -1009,7 +1787,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9GISEL-NEXT:    s_bitset0_b64 s[6:7], s12
 ; GFX9GISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX9GISEL-NEXT:  ; %bb.2:
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1022,7 +1800,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v5, s5
@@ -1033,7 +1811,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s12
 ; GFX1064DAGISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1064DAGISEL-NEXT:  ; %bb.2:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1045,7 +1823,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s12, s[6:7]
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v5, s5
@@ -1056,7 +1834,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064GISEL-NEXT:    s_bitset0_b64 s[6:7], s12
 ; GFX1064GISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
 ; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1064GISEL-NEXT:  ; %bb.2:
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1068,7 +1846,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1032DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v5, s5
@@ -1079,7 +1857,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s6, s7
 ; GFX1032DAGISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1032DAGISEL-NEXT:  ; %bb.2:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1091,7 +1869,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1032GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v5, s5
@@ -1102,7 +1880,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032GISEL-NEXT:    s_bitset0_b32 s6, s7
 ; GFX1032GISEL-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
 ; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1032GISEL-NEXT:  ; %bb.2:
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1114,7 +1892,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[0:1], -1
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s8, s[2:3]
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v4, s0
@@ -1126,7 +1904,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s8
 ; GFX1164DAGISEL-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
 ; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1164DAGISEL-NEXT:  ; %bb.2:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -1138,7 +1916,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[0:1], -1
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s8, s[2:3]
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v4, s0
@@ -1150,7 +1928,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s8
 ; GFX1164GISEL-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
 ; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1164GISEL-NEXT:  ; %bb.2:
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -1162,7 +1940,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    s_mov_b64 s[0:1], -1
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX1132DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
@@ -1173,7 +1951,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132DAGISEL-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1132DAGISEL-NEXT:  ; %bb.2:
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1132DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -1184,7 +1962,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b64 s[0:1], -1
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX1132GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
@@ -1195,7 +1973,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132GISEL-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
 ; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1132GISEL-NEXT:  ; %bb.2:
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1132GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -1236,19 +2014,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX8GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX8GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX8GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX8GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX8GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GFX8GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX8GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -1285,19 +2063,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX9GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX9GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX9GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX9GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX9GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
-; GFX9GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX9GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -1334,19 +2112,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1064GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
-; GFX1064GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX1064GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -1383,19 +2161,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; GFX1032GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1032GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s2, s8
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
-; GFX1032GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX1032GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -1436,19 +2214,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1164GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1164GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[2:3], s[8:9]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GFX1164GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX1164GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -1487,19 +2265,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1132GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1132GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1132GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s2, s8
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GFX1132GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX1132GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll
index d5f1750c268ab..b38bfbaf96fa7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll
@@ -443,6 +443,860 @@ entry:
   ret void
 }
 
+define amdgpu_kernel void @uniform_value_dpp(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: uniform_value_dpp:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value_dpp:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mul_i32 s2, s6, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value_dpp:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value_dpp:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mul_i32 s2, s6, s2
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: uniform_value_dpp:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_clause 0x1
+; GFX1064DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: uniform_value_dpp:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_clause 0x1
+; GFX1064GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mul_i32 s2, s6, s2
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: uniform_value_dpp:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_clause 0x1
+; GFX1032DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1032DAGISEL-NEXT:    s_and_b32 s3, s3, 1
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s2, s2, s3
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: uniform_value_dpp:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_clause 0x1
+; GFX1032GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1032GISEL-NEXT:    s_and_b32 s3, s3, 1
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mul_i32 s2, s2, s3
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value_dpp:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_clause 0x1
+; GFX1164DAGISEL-NEXT:    s_load_b32 s6, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value_dpp:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_clause 0x1
+; GFX1164GISEL-NEXT:    s_load_b32 s6, s[4:5], 0x2c
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mul_i32 s2, s6, s2
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value_dpp:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_clause 0x1
+; GFX1132DAGISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1132DAGISEL-NEXT:    s_and_b32 s3, s3, 1
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s2, s2, s3
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value_dpp:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_clause 0x1
+; GFX1132GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    s_and_b32 s3, s3, 1
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mul_i32 s2, s2, s3
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 %in, i32 2)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @divergent_value_dpp(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: divergent_value_dpp:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX8DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[2:3]
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8DAGISEL-NEXT:    flat_store_dword v[2:3], v0
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_value_dpp:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX8GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[2:3]
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dword v[2:3], v0
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_value_dpp:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[2:3]
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_value_dpp:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[2:3]
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_value_dpp:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1064DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1064DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1064DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1064DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1064DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_xor_b32_e32 v2, v1, v2
+; GFX1064DAGISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1064DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_xor_b32_e32 v1, v1, v2
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1064DAGISEL-NEXT:    global_store_dword v0, v4, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_value_dpp:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1064GISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1064GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1064GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1064GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1064GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_xor_b32_e32 v2, v1, v2
+; GFX1064GISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_xor_b32_e32 v1, v1, v2
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1064GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    global_store_dword v4, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_value_dpp:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1032DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    v_xor_b32_e32 v1, v1, v2
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1032DAGISEL-NEXT:    global_store_dword v0, v3, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_value_dpp:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1032GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    v_xor_b32_e32 v1, v1, v2
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032GISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_value_dpp:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1164DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_xor_b32_e32 v2, v1, v2
+; GFX1164DAGISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_xor_b32_e32 v1, v1, v2
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v4, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_value_dpp:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1164GISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_xor_b32_e32 v2, v1, v2
+; GFX1164GISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_xor_b32_e32 v1, v1, v2
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b32 v4, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_value_dpp:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_xor_b32_e32 v1, v1, v2
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s3
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v3, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_value_dpp:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_xor_b32_e32 v1, v1, v2
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v3, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %result = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 %id.x, i32 2)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @default_stratergy(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: default_stratergy:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX8DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[2:3]
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    s_nop 1
+; GFX8DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8DAGISEL-NEXT:    flat_store_dword v[2:3], v0
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: default_stratergy:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX8GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[2:3]
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    s_nop 1
+; GFX8GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT:    flat_store_dword v[2:3], v0
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: default_stratergy:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[2:3]
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    s_nop 1
+; GFX9DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: default_stratergy:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[2:3]
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    s_nop 1
+; GFX9GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: default_stratergy:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1064DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1064DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1064DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1064DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1064DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_xor_b32_e32 v2, v1, v2
+; GFX1064DAGISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1064DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_xor_b32_e32 v1, v1, v2
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1064DAGISEL-NEXT:    global_store_dword v0, v4, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: default_stratergy:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1064GISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1064GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1064GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1064GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1064GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_xor_b32_e32 v2, v1, v2
+; GFX1064GISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_xor_b32_e32 v1, v1, v2
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1064GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    global_store_dword v4, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: default_stratergy:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1032DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    v_xor_b32_e32 v1, v1, v2
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1032DAGISEL-NEXT:    global_store_dword v0, v3, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: default_stratergy:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1032GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    v_xor_b32_e32 v1, v1, v2
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032GISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: default_stratergy:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1164DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_xor_b32_e32 v2, v1, v2
+; GFX1164DAGISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_xor_b32_e32 v1, v1, v2
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v4, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: default_stratergy:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1164GISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mul_lo_u32 v3, 4, v3
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_xor_b32_e32 v2, v1, v2
+; GFX1164GISEL-NEXT:    ds_permute_b32 v2, v3, v2
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_xor_b32_e32 v1, v1, v2
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b32 v4, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: default_stratergy:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_xor_b32_e32 v1, v1, v2
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s3
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v3, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: default_stratergy:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_xor_b32_e32 v1, v1, v2
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s3, v1, 31
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v3, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %result = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 %id.x, i32 0)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
 define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8DAGISEL-LABEL: divergent_cfg:
 ; GFX8DAGISEL:       ; %bb.0: ; %entry
@@ -450,7 +1304,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8DAGISEL-NEXT:    ; implicit-def: $sgpr2
 ; GFX8DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX8DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -459,24 +1313,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8DAGISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
-; GFX8DAGISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX8DAGISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX8DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX8DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX8DAGISEL-NEXT:    s_xor_b32 s6, s6, s8
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX8DAGISEL-NEXT:  ; %bb.5:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX8DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX8DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX8DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -491,7 +1345,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX8GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -500,20 +1354,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mul_i32 s6, s6, s2
-; GFX8GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX8GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX8GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX8GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX8GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX8GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX8GISEL-NEXT:    s_xor_b32 s6, s6, s8
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX8GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX8GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s6
@@ -529,7 +1383,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9DAGISEL-NEXT:    ; implicit-def: $sgpr2
 ; GFX9DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX9DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -538,24 +1392,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9DAGISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
-; GFX9DAGISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX9DAGISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX9DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX9DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX9DAGISEL-NEXT:    s_xor_b32 s6, s6, s8
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX9DAGISEL-NEXT:  ; %bb.5:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX9DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX9DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -569,7 +1423,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX9GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -578,20 +1432,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mul_i32 s6, s6, s2
-; GFX9GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX9GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX9GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX9GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX9GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX9GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX9GISEL-NEXT:    s_xor_b32 s6, s6, s8
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX9GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX9GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -606,7 +1460,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064DAGISEL-NEXT:    ; implicit-def: $sgpr2
 ; GFX1064DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1064DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -615,24 +1469,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064DAGISEL-NEXT:    s_and_b32 s2, s2, 1
 ; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
-; GFX1064DAGISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1064DAGISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1064DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX1064DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1064DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1064DAGISEL-NEXT:    s_xor_b32 s6, s6, s8
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX1064DAGISEL-NEXT:  ; %bb.5:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1064DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX1064DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX1064DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -646,7 +1500,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -655,20 +1509,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064GISEL-NEXT:    s_and_b32 s2, s2, 1
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mul_i32 s6, s6, s2
-; GFX1064GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1064GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1064GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1064GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1064GISEL-NEXT:    s_xor_b32 s6, s6, s8
 ; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX1064GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1064GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -683,7 +1537,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032DAGISEL-NEXT:    ; implicit-def: $sgpr1
 ; GFX1032DAGISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032DAGISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1032DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032DAGISEL-NEXT:    s_load_dword s1, s[4:5], 0x2c
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
@@ -692,24 +1546,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032DAGISEL-NEXT:    s_and_b32 s2, s2, 1
 ; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    s_mul_i32 s1, s1, s2
-; GFX1032DAGISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1032DAGISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s0, s0
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX1032DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s1, 0
-; GFX1032DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032DAGISEL-NEXT:    s_xor_b32 s1, s1, s6
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX1032DAGISEL-NEXT:  ; %bb.5:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1032DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX1032DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX1032DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -723,7 +1577,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr0
 ; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032GISEL-NEXT:    s_load_dword s0, s[4:5], 0x2c
 ; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
@@ -732,20 +1586,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032GISEL-NEXT:    s_and_b32 s2, s2, 1
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mul_i32 s0, s0, s2
-; GFX1032GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1032GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s1, s1
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1032GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX1032GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032GISEL-NEXT:    s_xor_b32 s0, s0, s6
 ; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX1032GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1032GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1032GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -762,7 +1616,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
 ; GFX1164DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1164DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164DAGISEL-NEXT:    s_load_b32 s6, s[4:5], 0x2c
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -772,25 +1626,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164DAGISEL-NEXT:    s_and_b32 s2, s2, 1
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
-; GFX1164DAGISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1164DAGISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX1164DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1164DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1164DAGISEL-NEXT:    s_xor_b32 s6, s6, s8
 ; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX1164DAGISEL-NEXT:  ; %bb.5:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1164DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX1164DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX1164DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -806,7 +1660,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164GISEL-NEXT:    s_load_b32 s6, s[4:5], 0x2c
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -816,21 +1670,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164GISEL-NEXT:    s_and_b32 s2, s2, 1
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mul_i32 s6, s6, s2
-; GFX1164GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1164GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[0:1], s[0:1]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1164GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1164GISEL-NEXT:    s_xor_b32 s6, s6, s8
 ; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX1164GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1164GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -847,7 +1701,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
 ; GFX1132DAGISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1132DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132DAGISEL-NEXT:    s_load_b32 s1, s[4:5], 0x2c
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
@@ -857,25 +1711,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132DAGISEL-NEXT:    s_and_b32 s2, s2, 1
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    s_mul_i32 s1, s1, s2
-; GFX1132DAGISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1132DAGISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s0, s0
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1132DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX1132DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s1, 0
-; GFX1132DAGISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132DAGISEL-NEXT:    s_xor_b32 s1, s1, s6
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
 ; GFX1132DAGISEL-NEXT:  ; %bb.5:
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1132DAGISEL-NEXT:  .LBB2_6: ; %endif
+; GFX1132DAGISEL-NEXT:  .LBB5_6: ; %endif
 ; GFX1132DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -891,7 +1745,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1132GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB2_2
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x2c
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
@@ -901,21 +1755,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132GISEL-NEXT:    s_and_b32 s2, s2, 1
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mul_i32 s0, s0, s2
-; GFX1132GISEL-NEXT:  .LBB2_2: ; %Flow
+; GFX1132GISEL-NEXT:  .LBB5_2: ; %Flow
 ; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s1, s1
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_5
 ; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX1132GISEL-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132GISEL-NEXT:    s_xor_b32 s0, s0, s6
 ; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB2_4
-; GFX1132GISEL-NEXT:  .LBB2_5: ; %endif
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1132GISEL-NEXT:  .LBB5_5: ; %endif
 ; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1132GISEL-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
@@ -1137,14 +1991,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX8DAGISEL-NEXT:    s_xor_b64 s[4:5], s[4:5], s[8:9]
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX8DAGISEL-NEXT:  ; %bb.2:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1157,14 +2011,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX8GISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX8GISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX8GISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX8GISEL-NEXT:    s_xor_b64 s[4:5], s[4:5], s[8:9]
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX8GISEL-NEXT:  ; %bb.2:
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1177,14 +2031,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX9DAGISEL-NEXT:    s_xor_b64 s[4:5], s[4:5], s[8:9]
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX9DAGISEL-NEXT:  ; %bb.2:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1197,14 +2051,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX9GISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX9GISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX9GISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX9GISEL-NEXT:    s_xor_b64 s[4:5], s[4:5], s[8:9]
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX9GISEL-NEXT:  ; %bb.2:
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1217,14 +2071,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX1064DAGISEL-NEXT:    s_xor_b64 s[4:5], s[4:5], s[8:9]
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1064DAGISEL-NEXT:  ; %bb.2:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1236,14 +2090,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX1064GISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[4:5], s[4:5], s[8:9]
 ; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1064GISEL-NEXT:  ; %bb.2:
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1255,14 +2109,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1032DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s8, v2, s7
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s9, v3, s7
 ; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s6, s7
 ; GFX1032DAGISEL-NEXT:    s_xor_b64 s[4:5], s[4:5], s[8:9]
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1032DAGISEL-NEXT:  ; %bb.2:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1274,14 +2128,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1032GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s8, v2, s7
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s9, v3, s7
 ; GFX1032GISEL-NEXT:    s_bitset0_b32 s6, s7
 ; GFX1032GISEL-NEXT:    s_xor_b64 s[4:5], s[4:5], s[8:9]
 ; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1032GISEL-NEXT:  ; %bb.2:
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1293,7 +2147,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s6, s[2:3]
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s4, v2, s6
@@ -1301,7 +2155,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s6
 ; GFX1164DAGISEL-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1164DAGISEL-NEXT:  ; %bb.2:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -1313,7 +2167,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s6, s[2:3]
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s4, v2, s6
@@ -1321,7 +2175,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s6
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1164GISEL-NEXT:  ; %bb.2:
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -1333,7 +2187,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX1132DAGISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s4, v2, s3
@@ -1341,7 +2195,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132DAGISEL-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1132DAGISEL-NEXT:  ; %bb.2:
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1132DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -1352,7 +2206,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX1132GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s4, v2, s3
@@ -1360,7 +2214,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132GISEL-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1132GISEL-NEXT:  ; %bb.2:
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1132GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -1380,7 +2234,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8DAGISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX8DAGISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX8DAGISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX8DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
@@ -1388,7 +2242,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    s_mul_i32 s6, s2, s7
 ; GFX8DAGISEL-NEXT:    s_mul_i32 s7, s3, s7
-; GFX8DAGISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX8DAGISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[8:9]
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -1416,7 +2270,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX8GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX8GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
@@ -1424,10 +2278,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mul_i32 s6, s2, s7
 ; GFX8GISEL-NEXT:    s_mul_i32 s7, s3, s7
-; GFX8GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX8GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX8GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
@@ -1436,7 +2290,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mul_i32 s6, s4, s7
 ; GFX8GISEL-NEXT:    s_mul_i32 s7, s5, s7
-; GFX8GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX8GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -1453,7 +2307,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9DAGISEL-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX9DAGISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX9DAGISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX9DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
@@ -1461,7 +2315,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    s_mul_i32 s4, s2, s5
 ; GFX9DAGISEL-NEXT:    s_mul_i32 s5, s3, s5
-; GFX9DAGISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX9DAGISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[8:9]
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
@@ -1488,7 +2342,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX9GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX9GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
@@ -1496,10 +2350,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mul_i32 s6, s2, s7
 ; GFX9GISEL-NEXT:    s_mul_i32 s7, s3, s7
-; GFX9GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX9GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX9GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9GISEL-NEXT:    s_mov_b64 s[4:5], exec
@@ -1508,7 +2362,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mul_i32 s6, s6, s4
 ; GFX9GISEL-NEXT:    s_mul_i32 s7, s7, s4
-; GFX9GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX9GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -1525,7 +2379,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064DAGISEL-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX1064DAGISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX1064DAGISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1064DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
@@ -1533,7 +2387,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    s_mul_i32 s4, s2, s5
 ; GFX1064DAGISEL-NEXT:    s_mul_i32 s5, s3, s5
-; GFX1064DAGISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1064DAGISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[8:9]
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
@@ -1560,7 +2414,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
@@ -1568,10 +2422,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mul_i32 s6, s2, s7
 ; GFX1064GISEL-NEXT:    s_mul_i32 s7, s3, s7
-; GFX1064GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1064GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[4:5], exec
@@ -1580,7 +2434,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mul_i32 s6, s6, s4
 ; GFX1064GISEL-NEXT:    s_mul_i32 s7, s7, s4
-; GFX1064GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX1064GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -1597,7 +2451,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032DAGISEL-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX1032DAGISEL-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; GFX1032DAGISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1032DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s4, exec_lo
 ; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s4, s4
@@ -1605,7 +2459,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    s_mul_i32 s4, s2, s5
 ; GFX1032DAGISEL-NEXT:    s_mul_i32 s5, s3, s5
-; GFX1032DAGISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1032DAGISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s2, s8
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
@@ -1632,7 +2486,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; GFX1032GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s6, s6
@@ -1640,10 +2494,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mul_i32 s6, s2, s7
 ; GFX1032GISEL-NEXT:    s_mul_i32 s7, s3, s7
-; GFX1032GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1032GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s2, s8
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1032GISEL-NEXT:    s_mov_b32 s3, exec_lo
@@ -1652,7 +2506,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mul_i32 s6, s6, s3
 ; GFX1032GISEL-NEXT:    s_mul_i32 s7, s7, s3
-; GFX1032GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX1032GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -1671,7 +2525,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
 ; GFX1164DAGISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1164DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -1680,7 +2534,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    s_mul_i32 s6, s2, s7
 ; GFX1164DAGISEL-NEXT:    s_mul_i32 s7, s3, s7
-; GFX1164DAGISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1164DAGISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[8:9]
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -1711,7 +2565,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -1720,10 +2574,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mul_i32 s6, s2, s7
 ; GFX1164GISEL-NEXT:    s_mul_i32 s7, s3, s7
-; GFX1164GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1164GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[2:3], s[8:9]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], exec
@@ -1733,7 +2587,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mul_i32 s6, s4, s7
 ; GFX1164GISEL-NEXT:    s_mul_i32 s7, s5, s7
-; GFX1164GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX1164GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -1752,7 +2606,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
 ; GFX1132DAGISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1132DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -1761,7 +2615,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    s_mul_i32 s6, s2, s7
 ; GFX1132DAGISEL-NEXT:    s_mul_i32 s7, s3, s7
-; GFX1132DAGISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1132DAGISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s2, s8
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
@@ -1791,7 +2645,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1132GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132GISEL-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -1800,10 +2654,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mul_i32 s6, s2, s7
 ; GFX1132GISEL-NEXT:    s_mul_i32 s7, s3, s7
-; GFX1132GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1132GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s2, s8
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_4
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX1132GISEL-NEXT:    s_mov_b32 s3, exec_lo
@@ -1813,7 +2667,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mul_i32 s6, s4, s3
 ; GFX1132GISEL-NEXT:    s_mul_i32 s7, s5, s3
-; GFX1132GISEL-NEXT:  .LBB5_4: ; %endif
+; GFX1132GISEL-NEXT:  .LBB8_4: ; %endif
 ; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sponentry.ll b/llvm/test/CodeGen/AMDGPU/llvm.sponentry.ll
index caedcc5e44ba1..618a4c294cae5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sponentry.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sponentry.ll
@@ -317,34 +317,50 @@ define amdgpu_gfx ptr addrspace(5) @sponentry_gfx_dyn_alloc(i32 %val) #0 {
 ; DAGISEL-NEXT:    s_wait_samplecnt 0x0
 ; DAGISEL-NEXT:    s_wait_bvhcnt 0x0
 ; DAGISEL-NEXT:    s_wait_kmcnt 0x0
-; DAGISEL-NEXT:    v_lshl_add_u32 v1, v0, 2, 15
-; DAGISEL-NEXT:    s_mov_b32 s34, s33
-; DAGISEL-NEXT:    s_mov_b32 s1, exec_lo
-; DAGISEL-NEXT:    s_mov_b32 s0, 0
+; DAGISEL-NEXT:    s_mov_b32 s2, s33
 ; DAGISEL-NEXT:    s_mov_b32 s33, s32
-; DAGISEL-NEXT:    v_and_b32_e32 v1, -16, v1
-; DAGISEL-NEXT:    s_add_co_i32 s32, s32, 16
-; DAGISEL-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
-; DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; DAGISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; DAGISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
+; DAGISEL-NEXT:    scratch_store_b32 off, v1, s33 offset:4
+; DAGISEL-NEXT:    scratch_store_b32 off, v2, s33 offset:8
 ; DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; DAGISEL-NEXT:    v_readlane_b32 s3, v1, s2
-; DAGISEL-NEXT:    s_bitset0_b32 s1, s2
-; DAGISEL-NEXT:    s_max_u32 s0, s0, s3
+; DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; DAGISEL-NEXT:    v_lshl_add_u32 v3, v0, 2, 15
+; DAGISEL-NEXT:    s_add_co_i32 s32, s32, 16
+; DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; DAGISEL-NEXT:    v_and_b32_e32 v3, -16, v3
+; DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
 ; DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; DAGISEL-NEXT:    s_cmp_lg_u32 s1, 0
-; DAGISEL-NEXT:    s_cbranch_scc1 .LBB9_1
-; DAGISEL-NEXT:  ; %bb.2:
-; DAGISEL-NEXT:    s_mov_b32 s1, s32
+; DAGISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v3, s0
+; DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; DAGISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; DAGISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; DAGISEL-NEXT:    s_wait_dscnt 0x0
+; DAGISEL-NEXT:    v_max_u32_e32 v1, v1, v2
+; DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL-NEXT:    v_readlane_b32 s1, v1, 31
+; DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; DAGISEL-NEXT:    s_mov_b32 s0, s32
 ; DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; DAGISEL-NEXT:    v_lshl_add_u32 v1, s0, 5, s1
+; DAGISEL-NEXT:    v_lshl_add_u32 v3, s1, 5, s0
 ; DAGISEL-NEXT:    s_wait_storecnt 0x0
-; DAGISEL-NEXT:    scratch_store_b32 off, v0, s1 scope:SCOPE_SYS
+; DAGISEL-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
 ; DAGISEL-NEXT:    s_wait_storecnt 0x0
 ; DAGISEL-NEXT:    v_mov_b32_e32 v0, s33
-; DAGISEL-NEXT:    v_readfirstlane_b32 s32, v1
+; DAGISEL-NEXT:    v_readfirstlane_b32 s32, v3
 ; DAGISEL-NEXT:    s_mov_b32 s32, s33
-; DAGISEL-NEXT:    s_mov_b32 s33, s34
+; DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; DAGISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
+; DAGISEL-NEXT:    scratch_load_b32 v1, off, s33 offset:4
+; DAGISEL-NEXT:    scratch_load_b32 v2, off, s33 offset:8
+; DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; DAGISEL-NEXT:    s_mov_b32 s33, s2
+; DAGISEL-NEXT:    s_wait_loadcnt 0x0
 ; DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; DAGISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -355,34 +371,51 @@ define amdgpu_gfx ptr addrspace(5) @sponentry_gfx_dyn_alloc(i32 %val) #0 {
 ; GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GISEL-NEXT:    s_wait_kmcnt 0x0
-; GISEL-NEXT:    v_lshl_add_u32 v1, v0, 2, 15
-; GISEL-NEXT:    s_mov_b32 s34, s33
-; GISEL-NEXT:    s_mov_b32 s1, exec_lo
-; GISEL-NEXT:    s_mov_b32 s0, 0
+; GISEL-NEXT:    s_mov_b32 s3, s33
 ; GISEL-NEXT:    s_mov_b32 s33, s32
-; GISEL-NEXT:    v_and_b32_e32 v1, -16, v1
-; GISEL-NEXT:    s_add_co_i32 s32, s32, 16
-; GISEL-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
+; GISEL-NEXT:    scratch_store_b32 off, v1, s33 offset:4
+; GISEL-NEXT:    scratch_store_b32 off, v2, s33 offset:8
 ; GISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GISEL-NEXT:    v_lshl_add_u32 v3, v0, 2, 15
+; GISEL-NEXT:    s_add_co_i32 s32, s32, 16
 ; GISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GISEL-NEXT:    v_readlane_b32 s3, v1, s2
-; GISEL-NEXT:    s_bitset0_b32 s1, s2
-; GISEL-NEXT:    s_max_u32 s0, s0, s3
+; GISEL-NEXT:    s_mov_b32 s0, s32
+; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GISEL-NEXT:    v_and_b32_e32 v3, -16, v3
+; GISEL-NEXT:    s_or_saveexec_b32 s1, -1
 ; GISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GISEL-NEXT:    s_cmp_lg_u32 s1, 0
-; GISEL-NEXT:    s_cbranch_scc1 .LBB9_1
-; GISEL-NEXT:  ; %bb.2:
-; GISEL-NEXT:    s_mov_b32 s1, s32
-; GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v3, s1
+; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GISEL-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GISEL-NEXT:    ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
+; GISEL-NEXT:    s_wait_dscnt 0x0
+; GISEL-NEXT:    v_max_u32_e32 v1, v1, v2
+; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT:    v_readlane_b32 s2, v1, 31
+; GISEL-NEXT:    s_mov_b32 exec_lo, s1
+; GISEL-NEXT:    s_lshl_b32 s1, s2, 5
 ; GISEL-NEXT:    s_wait_storecnt 0x0
-; GISEL-NEXT:    scratch_store_b32 off, v0, s1 scope:SCOPE_SYS
+; GISEL-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
 ; GISEL-NEXT:    s_wait_storecnt 0x0
-; GISEL-NEXT:    v_mov_b32_e32 v0, s33
 ; GISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GISEL-NEXT:    s_add_co_u32 s32, s1, s0
+; GISEL-NEXT:    s_add_co_u32 s32, s0, s1
+; GISEL-NEXT:    v_mov_b32_e32 v0, s33
 ; GISEL-NEXT:    s_mov_b32 s32, s33
-; GISEL-NEXT:    s_mov_b32 s33, s34
+; GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
+; GISEL-NEXT:    scratch_load_b32 v1, off, s33 offset:4
+; GISEL-NEXT:    scratch_load_b32 v2, off, s33 offset:8
+; GISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GISEL-NEXT:    s_mov_b32 s33, s3
+; GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %local = alloca i32, i32 %val, addrspace(5)
@@ -499,34 +532,36 @@ define amdgpu_cs_chain void @sponentry_cs_chain_dyn_alloc(i32 %val) #0 {
 ; DAGISEL-NEXT:    s_wait_samplecnt 0x0
 ; DAGISEL-NEXT:    s_wait_bvhcnt 0x0
 ; DAGISEL-NEXT:    s_wait_kmcnt 0x0
-; DAGISEL-NEXT:    v_lshl_add_u32 v0, v8, 2, 15
-; DAGISEL-NEXT:    s_mov_b32 s1, exec_lo
-; DAGISEL-NEXT:    s_mov_b32 s0, 0
+; DAGISEL-NEXT:    v_lshl_add_u32 v2, v8, 2, 15
 ; DAGISEL-NEXT:    s_mov_b32 s33, s32
 ; DAGISEL-NEXT:    s_add_co_i32 s32, s32, 16
-; DAGISEL-NEXT:    v_and_b32_e32 v0, -16, v0
-; DAGISEL-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
-; DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; DAGISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; DAGISEL-NEXT:    v_and_b32_e32 v2, -16, v2
+; DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
 ; DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; DAGISEL-NEXT:    v_cndmask_b32_e64 v0, 0, v2, s0
+; DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; DAGISEL-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf
+; DAGISEL-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf
+; DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; DAGISEL-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf
+; DAGISEL-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf
+; DAGISEL-NEXT:    ds_swizzle_b32 v1, v0 offset:swizzle(BROADCAST,32,15)
+; DAGISEL-NEXT:    s_wait_dscnt 0x0
+; DAGISEL-NEXT:    v_max_u32_e32 v0, v0, v1
 ; DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; DAGISEL-NEXT:    v_readlane_b32 s3, v0, s2
-; DAGISEL-NEXT:    s_bitset0_b32 s1, s2
-; DAGISEL-NEXT:    s_max_u32 s0, s0, s3
-; DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; DAGISEL-NEXT:    s_cmp_lg_u32 s1, 0
-; DAGISEL-NEXT:    s_cbranch_scc1 .LBB12_1
-; DAGISEL-NEXT:  ; %bb.2:
-; DAGISEL-NEXT:    s_mov_b32 s1, s32
-; DAGISEL-NEXT:    v_mov_b32_e32 v1, s33
+; DAGISEL-NEXT:    v_readlane_b32 s1, v0, 31
+; DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; DAGISEL-NEXT:    s_mov_b32 s0, s32
+; DAGISEL-NEXT:    v_mov_b32_e32 v3, s33
 ; DAGISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; DAGISEL-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
+; DAGISEL-NEXT:    v_lshl_add_u32 v2, s1, 5, s0
 ; DAGISEL-NEXT:    s_wait_storecnt 0x0
-; DAGISEL-NEXT:    scratch_store_b32 off, v8, s1 scope:SCOPE_SYS
+; DAGISEL-NEXT:    scratch_store_b32 off, v8, s0 scope:SCOPE_SYS
 ; DAGISEL-NEXT:    s_wait_storecnt 0x0
-; DAGISEL-NEXT:    scratch_store_b32 off, v1, s1 scope:SCOPE_SYS
+; DAGISEL-NEXT:    scratch_store_b32 off, v3, s0 scope:SCOPE_SYS
 ; DAGISEL-NEXT:    s_wait_storecnt 0x0
-; DAGISEL-NEXT:    v_readfirstlane_b32 s32, v0
+; DAGISEL-NEXT:    v_readfirstlane_b32 s32, v2
 ; DAGISEL-NEXT:    s_alloc_vgpr 0
 ; DAGISEL-NEXT:    s_endpgm
 ;
@@ -537,34 +572,36 @@ define amdgpu_cs_chain void @sponentry_cs_chain_dyn_alloc(i32 %val) #0 {
 ; GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GISEL-NEXT:    s_wait_kmcnt 0x0
-; GISEL-NEXT:    v_lshl_add_u32 v0, v8, 2, 15
-; GISEL-NEXT:    s_mov_b32 s1, exec_lo
-; GISEL-NEXT:    s_mov_b32 s0, 0
+; GISEL-NEXT:    v_lshl_add_u32 v2, v8, 2, 15
 ; GISEL-NEXT:    s_mov_b32 s33, s32
 ; GISEL-NEXT:    s_add_co_i32 s32, s32, 16
-; GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
-; GISEL-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GISEL-NEXT:    s_mov_b32 s0, s32
+; GISEL-NEXT:    v_and_b32_e32 v2, -16, v2
+; GISEL-NEXT:    s_or_saveexec_b32 s1, -1
 ; GISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GISEL-NEXT:    v_readlane_b32 s3, v0, s2
-; GISEL-NEXT:    s_bitset0_b32 s1, s2
-; GISEL-NEXT:    s_max_u32 s0, s0, s3
-; GISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GISEL-NEXT:    s_cmp_lg_u32 s1, 0
-; GISEL-NEXT:    s_cbranch_scc1 .LBB12_1
-; GISEL-NEXT:  ; %bb.2:
-; GISEL-NEXT:    v_mov_b32_e32 v0, s33
-; GISEL-NEXT:    s_mov_b32 s1, s32
-; GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, v2, s1
+; GISEL-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf
+; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf
+; GISEL-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf
+; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GISEL-NEXT:    v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf
+; GISEL-NEXT:    ds_swizzle_b32 v1, v0 offset:swizzle(BROADCAST,32,15)
+; GISEL-NEXT:    s_wait_dscnt 0x0
+; GISEL-NEXT:    v_max_u32_e32 v0, v0, v1
+; GISEL-NEXT:    v_readlane_b32 s2, v0, 31
+; GISEL-NEXT:    s_mov_b32 exec_lo, s1
+; GISEL-NEXT:    v_mov_b32_e32 v2, s33
+; GISEL-NEXT:    s_lshl_b32 s1, s2, 5
 ; GISEL-NEXT:    s_wait_storecnt 0x0
-; GISEL-NEXT:    scratch_store_b32 off, v8, s1 scope:SCOPE_SYS
-; GISEL-NEXT:    s_wait_storecnt 0x0
-; GISEL-NEXT:    scratch_store_b32 off, v0, s1 scope:SCOPE_SYS
+; GISEL-NEXT:    scratch_store_b32 off, v8, s0 scope:SCOPE_SYS
 ; GISEL-NEXT:    s_wait_storecnt 0x0
 ; GISEL-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GISEL-NEXT:    s_add_co_u32 s32, s1, s0
+; GISEL-NEXT:    s_add_co_u32 s32, s0, s1
+; GISEL-NEXT:    scratch_store_b32 off, v2, s0 scope:SCOPE_SYS
+; GISEL-NEXT:    s_wait_storecnt 0x0
 ; GISEL-NEXT:    s_alloc_vgpr 0
 ; GISEL-NEXT:    s_endpgm
   %local = alloca i32, i32 %val, addrspace(5)



More information about the llvm-commits mailing list