[llvm] e55d6f5 - [AMDGPU] Simplify and improve codegen for llvm.amdgcn.set.inactive (#107889)

via llvm-commits llvm-commits at lists.llvm.org
Wed Sep 11 09:16:12 PDT 2024


Author: Jay Foad
Date: 2024-09-11T17:16:06+01:00
New Revision: e55d6f5ea2656bf842973d8bee86c3ace31bc865

URL: https://github.com/llvm/llvm-project/commit/e55d6f5ea2656bf842973d8bee86c3ace31bc865
DIFF: https://github.com/llvm/llvm-project/commit/e55d6f5ea2656bf842973d8bee86c3ace31bc865.diff

LOG: [AMDGPU] Simplify and improve codegen for llvm.amdgcn.set.inactive (#107889)

Always generate v_cndmask_b32 instead of modifying exec around
v_mov_b32. This is expected to be faster because
modifying exec generally causes pipeline stalls.

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/lib/Target/AMDGPU/SIInstrInfo.h
    llvm/lib/Target/AMDGPU/SIInstructions.td
    llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
    llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
    llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
    llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
    llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
    llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
    llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
    llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
    llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
    llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
    llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir
    llvm/test/CodeGen/AMDGPU/wave32.ll
    llvm/test/CodeGen/AMDGPU/wqm.mir
    llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
    llvm/test/CodeGen/AMDGPU/wwm-reserved.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 56f4efda7925f1..e657f668cc656a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5439,6 +5439,8 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
 
   bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                       IID == Intrinsic::amdgcn_permlanex16;
+  bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
+                       IID == Intrinsic::amdgcn_set_inactive_chain_arg;
 
   auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
                                       Register Src2, LLT VT) -> Register {
@@ -5448,6 +5450,8 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
     case Intrinsic::amdgcn_permlane64:
       return LaneOp.getReg(0);
     case Intrinsic::amdgcn_readlane:
+    case Intrinsic::amdgcn_set_inactive:
+    case Intrinsic::amdgcn_set_inactive_chain_arg:
       return LaneOp.addUse(Src1).getReg(0);
     case Intrinsic::amdgcn_writelane:
       return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
@@ -5472,7 +5476,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
   Register Src0 = MI.getOperand(2).getReg();
   Register Src1, Src2;
   if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
-      IsPermLane16) {
+      IsSetInactive || IsPermLane16) {
     Src1 = MI.getOperand(3).getReg();
     if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
       Src2 = MI.getOperand(4).getReg();
@@ -5490,7 +5494,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
   if (Size < 32) {
     Src0 = B.buildAnyExt(S32, Src0).getReg(0);
 
-    if (IsPermLane16)
+    if (IsSetInactive || IsPermLane16)
       Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
 
     if (IID == Intrinsic::amdgcn_writelane)
@@ -5526,7 +5530,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
   MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
   MachineInstrBuilder Src1Parts, Src2Parts;
 
-  if (IsPermLane16)
+  if (IsSetInactive || IsPermLane16)
     Src1Parts = B.buildUnmerge(PartialResTy, Src1);
 
   if (IID == Intrinsic::amdgcn_writelane)
@@ -5535,7 +5539,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
   for (unsigned i = 0; i < NumParts; ++i) {
     Src0 = Src0Parts.getReg(i);
 
-    if (IsPermLane16)
+    if (IsSetInactive || IsPermLane16)
       Src1 = Src1Parts.getReg(i);
 
     if (IID == Intrinsic::amdgcn_writelane)
@@ -7496,6 +7500,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
   case Intrinsic::amdgcn_permlane16:
   case Intrinsic::amdgcn_permlanex16:
   case Intrinsic::amdgcn_permlane64:
+  case Intrinsic::amdgcn_set_inactive:
+  case Intrinsic::amdgcn_set_inactive_chain_arg:
     return legalizeLaneOp(Helper, MI, IntrID);
   case Intrinsic::amdgcn_s_buffer_prefetch_data:
     return legalizeSBufferPrefetch(Helper, MI);

diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 9a6d617e6ee4cf..62e22c15af168e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6102,6 +6102,8 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
   unsigned IID = N->getConstantOperandVal(0);
   bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                       IID == Intrinsic::amdgcn_permlanex16;
+  bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
+                       IID == Intrinsic::amdgcn_set_inactive_chain_arg;
   SDLoc SL(N);
   MVT IntVT = MVT::getIntegerVT(ValSize);
 
@@ -6119,6 +6121,8 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
       Operands.push_back(Src2);
       [[fallthrough]];
     case Intrinsic::amdgcn_readlane:
+    case Intrinsic::amdgcn_set_inactive:
+    case Intrinsic::amdgcn_set_inactive_chain_arg:
       Operands.push_back(Src1);
       [[fallthrough]];
     case Intrinsic::amdgcn_readfirstlane:
@@ -6145,7 +6149,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
   SDValue Src0 = N->getOperand(1);
   SDValue Src1, Src2;
   if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
-      IsPermLane16) {
+      IsSetInactive || IsPermLane16) {
     Src1 = N->getOperand(2);
     if (IID == Intrinsic::amdgcn_writelane || IsPermLane16)
       Src2 = N->getOperand(3);
@@ -6161,7 +6165,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
     Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
                                 SL, MVT::i32);
 
-    if (IsPermLane16) {
+    if (IsSetInactive || IsPermLane16) {
       Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
                                   SL, MVT::i32);
     }
@@ -6237,7 +6241,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
         Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
                                  DAG.getConstant(EltIdx, SL, MVT::i32));
 
-        if (IsPermLane16)
+        if (IsSetInactive || IsPermLane16)
           Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
                                    DAG.getConstant(EltIdx, SL, MVT::i32));
 
@@ -6246,7 +6250,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
                                    DAG.getConstant(EltIdx, SL, MVT::i32));
 
         Pieces.push_back(
-            IsPermLane16
+            IsSetInactive || IsPermLane16
                 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
                 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
         EltIdx += 2;
@@ -6262,7 +6266,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
   MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
   Src0 = DAG.getBitcast(VecVT, Src0);
 
-  if (IsPermLane16)
+  if (IsSetInactive || IsPermLane16)
     Src1 = DAG.getBitcast(VecVT, Src1);
 
   if (IID == Intrinsic::amdgcn_writelane)
@@ -8745,6 +8749,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   case Intrinsic::amdgcn_permlane16:
   case Intrinsic::amdgcn_permlanex16:
   case Intrinsic::amdgcn_permlane64:
+  case Intrinsic::amdgcn_set_inactive:
+  case Intrinsic::amdgcn_set_inactive_chain_arg:
     return lowerLaneOp(*this, Op.getNode(), DAG);
   default:
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =

diff  --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 899d937fb296a1..e4a679f6a3ef8f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2097,21 +2097,6 @@ unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
   }
 }
 
-Register SIInstrInfo::findSetInactiveMask(const MachineInstr &MI) {
-  assert(MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
-         MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64);
-  for (auto &Op : MI.implicit_operands()) {
-    if (Op.isDef())
-      continue;
-    Register OpReg = Op.getReg();
-    if (OpReg == AMDGPU::EXEC || OpReg == AMDGPU::EXEC_LO ||
-        OpReg == AMDGPU::SCC)
-      continue;
-    return OpReg;
-  }
-  return Register();
-}
-
 bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc DL = MBB.findDebugLoc(MI);
@@ -2286,147 +2271,15 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     MI.eraseFromParent();
     break;
   }
-  case AMDGPU::V_SET_INACTIVE_B32:
-  case AMDGPU::V_SET_INACTIVE_B64: {
-    unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
-    unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
-    unsigned VMovOpc = MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64
-                           ? AMDGPU::V_MOV_B64_PSEUDO
-                           : AMDGPU::V_MOV_B32_e32;
-    Register ExecReg = RI.getExec();
+  case AMDGPU::V_SET_INACTIVE_B32: {
+    // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
     Register DstReg = MI.getOperand(0).getReg();
-    MachineOperand &ActiveSrc = MI.getOperand(1);
-    MachineOperand &InactiveSrc = MI.getOperand(2);
-
-    // Find implicit register defining lanes active outside WWM.
-    Register ExecSrcReg = findSetInactiveMask(MI);
-    assert(ExecSrcReg && "V_SET_INACTIVE must be in known WWM region");
-    // Note: default here is set to ExecReg so that functional MIR is still
-    // generated if implicit def is not found and assertions are disabled.
-    if (!ExecSrcReg)
-      ExecSrcReg = ExecReg;
-
-    // Ideally in WWM this operation is lowered to V_CNDMASK; however,
-    // constant bus constraints and the presence of literal constants
-    // present an issue.
-    // Fallback to V_MOV base lowering in all but the common cases.
-    const bool VMov64 = VMovOpc != AMDGPU::V_MOV_B32_e32;
-    MachineFunction *MF = MBB.getParent();
-    MachineRegisterInfo &MRI = MF->getRegInfo();
-    const unsigned Opcode = AMDGPU::V_CNDMASK_B32_e64;
-    const MCInstrDesc &Desc = get(Opcode);
-
-    const APInt ActiveImm(64, ActiveSrc.isImm() ? ActiveSrc.getImm() : 0);
-    const APInt InactiveImm(64, InactiveSrc.isImm() ? InactiveSrc.getImm() : 0);
-    const APInt ActiveImmLo(32, ActiveImm.getLoBits(32).getZExtValue());
-    const APInt ActiveImmHi(32, ActiveImm.getHiBits(32).getZExtValue());
-    const APInt InactiveImmLo(32, InactiveImm.getLoBits(32).getZExtValue());
-    const APInt InactiveImmHi(32, InactiveImm.getHiBits(32).getZExtValue());
-
-    int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
-    int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
-
-    int ConstantBusLimit = ST.getConstantBusLimit(AMDGPU::V_CNDMASK_B32_e64);
-    int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
-    int ConstantBusUses =
-        1 + // Starts at 1 for ExecSrcReg
-        (usesConstantBus(MRI, ActiveSrc, Desc.operands()[Src1Idx]) ? 1 : 0) +
-        (usesConstantBus(MRI, InactiveSrc, Desc.operands()[Src0Idx]) ? 1 : 0);
-    int LiteralConstants =
-        ((ActiveSrc.isReg() ||
-          (ActiveSrc.isImm() && isInlineConstant(ActiveImm)))
-             ? 0
-             : 1) +
-        ((InactiveSrc.isReg() ||
-          (InactiveSrc.isImm() && isInlineConstant(InactiveImm)))
-             ? 0
-             : 1);
-
-    bool UseVCndMask =
-        ConstantBusUses <= ConstantBusLimit && LiteralConstants <= LiteralLimit;
-    if (VMov64 && UseVCndMask) {
-      // Decomposition must not introduce new literals.
-      UseVCndMask &=
-          ActiveSrc.isReg() ||
-          (isInlineConstant(ActiveImmLo) && isInlineConstant(ActiveImmHi)) ||
-          (!isInlineConstant(ActiveImm));
-      UseVCndMask &= InactiveSrc.isReg() ||
-                     (isInlineConstant(InactiveImmLo) &&
-                      isInlineConstant(InactiveImmHi)) ||
-                     (!isInlineConstant(InactiveImm));
-    }
-
-    if (UseVCndMask && VMov64) {
-      // Dual V_CNDMASK_B32
-      MachineOperand ActiveLo = buildExtractSubRegOrImm(
-          MI, MRI, ActiveSrc, nullptr, AMDGPU::sub0, nullptr);
-      MachineOperand ActiveHi = buildExtractSubRegOrImm(
-          MI, MRI, ActiveSrc, nullptr, AMDGPU::sub1, nullptr);
-      MachineOperand InactiveLo = buildExtractSubRegOrImm(
-          MI, MRI, InactiveSrc, nullptr, AMDGPU::sub0, nullptr);
-      MachineOperand InactiveHi = buildExtractSubRegOrImm(
-          MI, MRI, InactiveSrc, nullptr, AMDGPU::sub1, nullptr);
-      if (ActiveSrc.isReg())
-        ActiveHi.setIsKill(ActiveSrc.isKill());
-      if (InactiveSrc.isReg())
-        InactiveHi.setIsKill(InactiveSrc.isKill());
-      BuildMI(MBB, MI, DL, Desc, RI.getSubReg(DstReg, AMDGPU::sub0))
-          .addImm(0)
-          .add(InactiveLo)
-          .addImm(0)
-          .add(ActiveLo)
-          .addReg(ExecSrcReg)
-          .addReg(DstReg, RegState::ImplicitDefine);
-      BuildMI(MBB, MI, DL, Desc, RI.getSubReg(DstReg, AMDGPU::sub1))
-          .addImm(0)
-          .add(InactiveHi)
-          .addImm(0)
-          .add(ActiveHi)
-          .addReg(ExecSrcReg)
-          .addReg(DstReg, RegState::ImplicitDefine);
-    } else if (UseVCndMask) {
-      // Single V_CNDMASK_B32
-      BuildMI(MBB, MI, DL, Desc, DstReg)
-          .addImm(0)
-          .add(InactiveSrc)
-          .addImm(0)
-          .add(ActiveSrc)
-          .addReg(ExecSrcReg);
-    } else {
-      // Fallback V_MOV case.
-      // Avoid unnecessary work if a source VGPR is also the destination.
-      // This can happen if WWM register allocation was efficient.
-      // Note: this assumes WWM execution.
-      bool DstIsActive = ActiveSrc.isReg() && ActiveSrc.getReg() == DstReg;
-      bool DstIsInactive =
-          InactiveSrc.isReg() && InactiveSrc.getReg() == DstReg;
-      if (!DstIsInactive) {
-        // Set exec mask to inactive lanes,
-        // but only if active lanes would be overwritten.
-        if (DstIsActive) {
-          BuildMI(MBB, MI, DL, get(NotOpc), ExecReg)
-              .addReg(ExecSrcReg)
-              .setOperandDead(3); // Dead scc
-        }
-        // Copy inactive lanes
-        MachineInstr *VMov =
-            BuildMI(MBB, MI, DL, get(VMovOpc), DstReg).add(InactiveSrc);
-        if (VMov64)
-          expandPostRAPseudo(*VMov);
-      }
-      if (!DstIsActive) {
-        // Set exec mask to active lanes
-        BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addReg(ExecSrcReg);
-        // Copy active lanes
-        MachineInstr *VMov =
-            BuildMI(MBB, MI, DL, get(VMovOpc), MI.getOperand(0).getReg())
-                .add(ActiveSrc);
-        if (VMov64)
-          expandPostRAPseudo(*VMov);
-      }
-      // Restore WWM
-      BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addImm(-1);
-    }
+    BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+        .add(MI.getOperand(3))
+        .add(MI.getOperand(4))
+        .add(MI.getOperand(1))
+        .add(MI.getOperand(2))
+        .add(MI.getOperand(5));
     MI.eraseFromParent();
     break;
   }

diff  --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 71432510fdee4f..4fd9b4366159be 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1437,8 +1437,6 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
   // This is used if an operand is a 32 bit register but needs to be aligned
   // regardless.
   void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const;
-
-  static Register findSetInactiveMask(const MachineInstr &MI);
 };
 
 /// \brief Returns true if a reg:subreg pair P has a TRC class

diff  --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index f9f07968165fe0..284be72886ccef 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -243,29 +243,16 @@ def : GCNPat <(f32 (fptrunc_round f64:$src0, (i32 SupportedRoundMode:$round))),
 
 // Invert the exec mask and overwrite the inactive lanes of dst with inactive,
 // restoring it after we're done.
-let Defs = [SCC], isConvergent = 1 in {
-def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst),
-  (ins VSrc_b32: $src, VSrc_b32:$inactive), []>;
-
-def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst),
-  (ins VSrc_b64: $src, VSrc_b64:$inactive), []>;
-} // End Defs = [SCC]
+let isConvergent = 1 in
+def V_SET_INACTIVE_B32 : VOP3_Pseudo<"v_set_inactive_b32", VOP2e_I32_I32_I32_I1>;
 
 foreach vt = Reg32Types.types in {
 def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
-     (V_SET_INACTIVE_B32 VSrc_b32:$src, VSrc_b32:$inactive)>;
-}
-
-foreach vt = Reg64Types.types in {
-def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
-     (V_SET_INACTIVE_B64 VSrc_b64:$src, VSrc_b64:$inactive)>;
+     (V_SET_INACTIVE_B32 0, VSrc_b32:$src, 0, VSrc_b32:$inactive, (IMPLICIT_DEF))>;
 }
 
 def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
-    (V_SET_INACTIVE_B32 VGPR_32:$src, VGPR_32:$inactive)>;
-
-def : GCNPat<(i64 (int_amdgcn_set_inactive_chain_arg i64:$src, i64:$inactive)),
-    (V_SET_INACTIVE_B64 VReg_64:$src, VReg_64:$inactive)>;
+    (V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;
 
 let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
   def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),

diff  --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
index 29fef49ee70954..3bf2ea0f9e53ef 100644
--- a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
@@ -215,8 +215,7 @@ bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) {
   for (MachineBasicBlock *MBB : RPOT) {
     bool InWWM = false;
     for (MachineInstr &MI : *MBB) {
-      if (MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
-          MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64)
+      if (MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32)
         RegsAssigned |= processDef(MI.getOperand(0));
 
       if (MI.getOpcode() == AMDGPU::SI_SPILL_S32_TO_VGPR) {

diff  --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 38ebda6cde1e59..8cedc34ca40de7 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -557,26 +557,18 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
         // This avoid unnecessarily marking M0 as requiring WQM.
         III.Needs |= StateStrictWQM;
         GlobalFlags |= StateStrictWQM;
-      } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
-                 Opcode == AMDGPU::V_SET_INACTIVE_B64) {
-        // Ignore these if V_SET_INACTIVE which already has exec src register.
-        // These are generated by an earlier pass which has seperately ensured
-        // WWM and provided a mask of inactive lanes.
-        Register ExecSrc = TII->findSetInactiveMask(MI);
-        if (!ExecSrc) {
-          // Disable strict states; StrictWQM will be added as required later.
-          III.Disabled = StateStrict;
-          MachineOperand &Inactive = MI.getOperand(2);
-          if (Inactive.isReg()) {
-            if (Inactive.isUndef()) {
-              LowerToCopyInstrs.insert(&MI);
-            } else {
-              markOperand(MI, Inactive, StateStrictWWM, Worklist);
-            }
-          }
-          SetInactiveInstrs.push_back(&MI);
-          BBI.NeedsLowering = true;
+      } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32) {
+        // Disable strict states; StrictWQM will be added as required later.
+        III.Disabled = StateStrict;
+        MachineOperand &Inactive = MI.getOperand(4);
+        if (Inactive.isReg()) {
+          if (Inactive.isUndef() && MI.getOperand(3).getImm() == 0)
+            LowerToCopyInstrs.insert(&MI);
+          else
+            markOperand(MI, Inactive, StateStrictWWM, Worklist);
         }
+        SetInactiveInstrs.push_back(&MI);
+        BBI.NeedsLowering = true;
       } else if (TII->isDisableWQM(MI)) {
         BBI.Needs |= StateExact;
         if (!(BBI.InNeeds & StateExact)) {
@@ -1078,10 +1070,11 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
       ActiveLanesReg = 0;
       break;
     case AMDGPU::V_SET_INACTIVE_B32:
-    case AMDGPU::V_SET_INACTIVE_B64:
       if (ActiveLanesReg) {
-        MI.addOperand(*MBB.getParent(),
-                      MachineOperand::CreateReg(ActiveLanesReg, false, true));
+        LiveInterval &LI = LIS->getInterval(MI.getOperand(5).getReg());
+        MRI->constrainRegClass(ActiveLanesReg, TRI->getWaveMaskRegClass());
+        MI.getOperand(5).setReg(ActiveLanesReg);
+        LIS->shrinkToUses(&LI);
       } else {
         assert(State == StateExact || State == StateWQM);
       }
@@ -1527,15 +1520,17 @@ bool SIWholeQuadMode::lowerCopyInstrs() {
   for (MachineInstr *MI : LowerToCopyInstrs) {
     LLVM_DEBUG(dbgs() << "simplify: " << *MI);
 
-    if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
-        MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
-      assert(MI->getNumExplicitOperands() == 3);
+    if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32) {
+      assert(MI->getNumExplicitOperands() == 6);
 
       LiveInterval *RecomputeLI = nullptr;
-      if (MI->getOperand(2).isReg())
-        RecomputeLI = &LIS->getInterval(MI->getOperand(2).getReg());
+      if (MI->getOperand(4).isReg())
+        RecomputeLI = &LIS->getInterval(MI->getOperand(4).getReg());
 
-      MI->removeOperand(2);
+      MI->removeOperand(5);
+      MI->removeOperand(4);
+      MI->removeOperand(3);
+      MI->removeOperand(1);
 
       if (RecomputeLI)
         LIS->shrinkToUses(RecomputeLI);
@@ -1547,12 +1542,6 @@ bool SIWholeQuadMode::lowerCopyInstrs() {
                           ? (unsigned)AMDGPU::COPY
                           : TII->getMovOpcode(TRI->getRegClassForOperandReg(
                                 *MRI, MI->getOperand(0)));
-    int Index = MI->findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
-    while (Index >= 0) {
-      MI->removeOperand(Index);
-      Index = MI->findRegisterUseOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
-    }
-
     MI->setDesc(TII->get(CopyOp));
     LLVM_DEBUG(dbgs() << " -> " << *MI);
   }

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
index 9c634aba348d23..89c3bbc83018e7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
@@ -85,7 +85,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
   ; GFX908-NEXT:   [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
   ; GFX908-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
   ; GFX908-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
-  ; GFX908-NEXT:   [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY11]], implicit-def dead $scc, implicit $exec
+  ; GFX908-NEXT:   [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY11]], [[DEF]], implicit $exec
   ; GFX908-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
   ; GFX908-NEXT:   [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
   ; GFX908-NEXT:   [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
@@ -159,7 +160,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
   ; GFX90A-NEXT:   [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
   ; GFX90A-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
   ; GFX90A-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
-  ; GFX90A-NEXT:   [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY11]], implicit-def dead $scc, implicit $exec
+  ; GFX90A-NEXT:   [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+  ; GFX90A-NEXT:   [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY11]], [[DEF]], implicit $exec
   ; GFX90A-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
   ; GFX90A-NEXT:   [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
   ; GFX90A-NEXT:   [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
@@ -233,7 +235,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
   ; GFX940-NEXT:   [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
   ; GFX940-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
   ; GFX940-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
-  ; GFX940-NEXT:   [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY11]], implicit-def dead $scc, implicit $exec
+  ; GFX940-NEXT:   [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+  ; GFX940-NEXT:   [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY11]], [[DEF]], implicit $exec
   ; GFX940-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
   ; GFX940-NEXT:   [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
   ; GFX940-NEXT:   [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
@@ -300,7 +303,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
   ; GFX11-NEXT:   [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY4]], [[COPY5]], implicit $exec
   ; GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
   ; GFX11-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; GFX11-NEXT:   [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY6]], implicit-def dead $scc, implicit $exec
+  ; GFX11-NEXT:   [[DEF1:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+  ; GFX11-NEXT:   [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY6]], [[DEF1]], implicit $exec
   ; GFX11-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
   ; GFX11-NEXT:   [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY7]], [[V_SET_INACTIVE_B32_]], 353, 15, 15, 0, implicit $exec
   ; GFX11-NEXT:   [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
index fdce9d9258c887..8eef3d4ac4a3d6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
@@ -77,7 +77,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
   ; GFX90A-NEXT:   [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
   ; GFX90A-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
   ; GFX90A-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
-  ; GFX90A-NEXT:   [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY11]], implicit-def dead $scc, implicit $exec
+  ; GFX90A-NEXT:   [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+  ; GFX90A-NEXT:   [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY11]], [[DEF1]], implicit $exec
   ; GFX90A-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
   ; GFX90A-NEXT:   [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
   ; GFX90A-NEXT:   [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
@@ -170,7 +171,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
   ; GFX940-NEXT:   [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
   ; GFX940-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
   ; GFX940-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
-  ; GFX940-NEXT:   [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY11]], implicit-def dead $scc, implicit $exec
+  ; GFX940-NEXT:   [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+  ; GFX940-NEXT:   [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY11]], [[DEF1]], implicit $exec
   ; GFX940-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
   ; GFX940-NEXT:   [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
   ; GFX940-NEXT:   [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
@@ -255,7 +257,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
   ; GFX11-NEXT:   [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY4]], [[COPY5]], implicit $exec
   ; GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
   ; GFX11-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; GFX11-NEXT:   [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY6]], implicit-def dead $scc, implicit $exec
+  ; GFX11-NEXT:   [[DEF1:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+  ; GFX11-NEXT:   [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY6]], [[DEF1]], implicit $exec
   ; GFX11-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
   ; GFX11-NEXT:   [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY7]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
   ; GFX11-NEXT:   [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index 137366a45cbdfc..f015099517902b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -4,15 +4,13 @@
 define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) {
 ; GCN-LABEL: set_inactive:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    v_mov_b32_e32 v0, 42
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    s_mov_b64 exec, -1
+; GCN-NEXT:    v_mov_b32_e32 v1, s4
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 42, v1, s[4:5]
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
@@ -45,17 +43,15 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) {
 ; GCN-LABEL: set_inactive_64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    s_mov_b64 exec, -1
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v2, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, 0, v3, s[4:5]
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v2, v0
-; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[0:3], 0
@@ -89,22 +85,18 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
 ; GCN-LABEL: set_inactive_scc:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x34
+; GCN-NEXT:    s_load_dword s8, s[2:3], 0x2c
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_buffer_load_dword s6, s[4:7], 0x0
-; GCN-NEXT:    s_load_dword s7, s[2:3], 0x2c
+; GCN-NEXT:    s_buffer_load_dword s4, s[4:7], 0x0
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    s_mov_b32 s2, 1
+; GCN-NEXT:    v_mov_b32_e32 v1, s8
+; GCN-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 42, v1, s[2:3]
+; GCN-NEXT:    s_mov_b64 exec, s[2:3]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_cmp_lg_u32 s6, 56
-; GCN-NEXT:    s_cselect_b32 s3, 1, 0
-; GCN-NEXT:    v_mov_b32_e32 v0, 42
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    v_mov_b32_e32 v0, s7
-; GCN-NEXT:    s_mov_b64 exec, -1
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_cmp_lg_u32 s4, 56
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
-; GCN-NEXT:    s_cmp_lg_u32 s3, 0
+; GCN-NEXT:    s_mov_b32 s2, 1
 ; GCN-NEXT:    s_cbranch_scc0 .LBB4_2
 ; GCN-NEXT:  ; %bb.1: ; %.one
 ; GCN-NEXT:    v_add_u32_e32 v2, vcc, 1, v1
@@ -145,15 +137,16 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
 define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
 ; GCN-LABEL: set_inactive_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    s_or_saveexec_b64 s[2:3], -1
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0x40400000
+; GCN-NEXT:    s_mov_b64 exec, s[2:3]
 ; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    s_mov_b64 exec, -1
+; GCN-NEXT:    v_mov_b32_e32 v1, s4
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
@@ -174,13 +167,15 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0x4010cccc
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    s_mov_b64 exec, -1
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[2:3]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[2:3]
+; GCN-NEXT:    s_mov_b64 exec, s[2:3]
 ; GCN-NEXT:    v_mov_b32_e32 v2, v0
-; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    v_mov_b32_e32 v3, v1
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
@@ -193,15 +188,16 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
 define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) {
 ; GCN-LABEL: set_inactive_v2i16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    s_or_saveexec_b64 s[2:3], -1
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0x10001
+; GCN-NEXT:    s_mov_b64 exec, s[2:3]
 ; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    s_mov_b64 exec, -1
+; GCN-NEXT:    v_mov_b32_e32 v1, s4
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
@@ -216,15 +212,16 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %
 define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
 ; GCN-LABEL: set_inactive_v2f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    s_or_saveexec_b64 s[2:3], -1
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0x3c003c00
+; GCN-NEXT:    s_mov_b64 exec, s[2:3]
 ; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    s_mov_b64 exec, -1
+; GCN-NEXT:    v_mov_b32_e32 v1, s4
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
@@ -240,19 +237,15 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %
 ; GCN-LABEL: set_inactive_v2i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    s_mov_b32 s6, 1
-; GCN-NEXT:    s_mov_b32 s7, s6
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    s_mov_b64 exec, -1
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 1, v2, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, 1, v3, s[4:5]
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v2, v0
-; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[0:3], 0
@@ -267,19 +260,15 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
 ; GCN-LABEL: set_inactive_v2f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    s_mov_b32 s6, 1.0
-; GCN-NEXT:    s_mov_b32 s7, s6
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    s_mov_b64 exec, -1
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 1.0, v2, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, 1.0, v3, s[4:5]
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v2, v0
-; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[0:3], 0
@@ -293,15 +282,16 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
 define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) {
 ; GCN-LABEL: set_inactive_v2bf16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    s_or_saveexec_b64 s[2:3], -1
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0x3f803f80
+; GCN-NEXT:    s_mov_b64 exec, s[2:3]
 ; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    s_mov_b64 exec, -1
+; GCN-NEXT:    v_mov_b32_e32 v1, s4
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
@@ -318,21 +308,20 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    s_mov_b32 s6, 0x10001
-; GCN-NEXT:    s_mov_b32 s7, s6
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x10001
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    s_mov_b64 exec, -1
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    v_mov_b32_e32 v2, v0
+; GCN-NEXT:    v_mov_b32_e32 v3, s2
+; GCN-NEXT:    v_mov_b32_e32 v4, s3
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v3, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v0, v4, s[4:5]
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v3, v1
+; GCN-NEXT:    v_mov_b32_e32 v4, v2
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[0:3], 0
+; GCN-NEXT:    buffer_store_dwordx2 v[3:4], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %tmp.0 = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) #0
   %tmp = call <4 x i16> @llvm.amdgcn.strict.wwm.v4i16(<4 x i16> %tmp.0)
@@ -345,21 +334,20 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half>
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    s_mov_b32 s6, 0x3c003c00
-; GCN-NEXT:    s_mov_b32 s7, s6
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x3c003c00
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    s_mov_b64 exec, -1
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    v_mov_b32_e32 v2, v0
+; GCN-NEXT:    v_mov_b32_e32 v3, s2
+; GCN-NEXT:    v_mov_b32_e32 v4, s3
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v3, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v0, v4, s[4:5]
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v3, v1
+; GCN-NEXT:    v_mov_b32_e32 v4, v2
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[0:3], 0
+; GCN-NEXT:    buffer_store_dwordx2 v[3:4], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %tmp.0 = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>) #0
   %tmp = call <4 x half> @llvm.amdgcn.strict.wwm.v4f16(<4 x half> %tmp.0)
@@ -372,21 +360,20 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    s_mov_b32 s6, 0x3f803f80
-; GCN-NEXT:    s_mov_b32 s7, s6
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x3f803f80
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    s_mov_b64 exec, -1
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    v_mov_b32_e32 v2, v0
+; GCN-NEXT:    v_mov_b32_e32 v3, s2
+; GCN-NEXT:    v_mov_b32_e32 v4, s3
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v3, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v0, v4, s[4:5]
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v3, v1
+; GCN-NEXT:    v_mov_b32_e32 v4, v2
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[0:3], 0
+; GCN-NEXT:    buffer_store_dwordx2 v[3:4], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %tmp.0 = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>) #0
   %tmp = call <4 x bfloat> @llvm.amdgcn.strict.wwm.v4bf16(<4 x bfloat> %tmp.0)
@@ -398,17 +385,15 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
 ; GCN-LABEL: set_inactive_p0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    s_mov_b64 exec, -1
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v2, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, 0, v3, s[4:5]
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v2, v0
-; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[0:3], 0
@@ -422,15 +407,13 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
 define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) {
 ; GCN-LABEL: set_inactive_p2:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    s_mov_b64 exec, -1
+; GCN-NEXT:    v_mov_b32_e32 v1, s4
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v1, s[4:5]
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
@@ -445,15 +428,13 @@ define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(
 define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) {
 ; GCN-LABEL: set_inactive_p3:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    s_mov_b64 exec, -1
+; GCN-NEXT:    v_mov_b32_e32 v1, s4
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v1, s[4:5]
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
@@ -468,15 +449,13 @@ define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(
 define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) {
 ; GCN-LABEL: set_inactive_p5:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    s_mov_b64 exec, -1
+; GCN-NEXT:    v_mov_b32_e32 v1, s4
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v1, s[4:5]
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
@@ -491,15 +470,13 @@ define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(
 define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) {
 ; GCN-LABEL: set_inactive_p6:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    s_mov_b64 exec, -1
+; GCN-NEXT:    v_mov_b32_e32 v1, s4
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v1, s[4:5]
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000

diff  --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 5a8df7b84bf2f3..b17dfc7c3754a1 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -2890,68 +2890,65 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX8_DPP-LABEL: add_i64_varying:
 ; GFX8_DPP:       ; %bb.0: ; %entry
 ; GFX8_DPP-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, v0
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v7
 ; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8_DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v7, s[4:5]
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v8, s[4:5]
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s[4:5]
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[4:5]
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
-; GFX8_DPP-NEXT:    v_readlane_b32 s7, v4, 63
-; GFX8_DPP-NEXT:    v_readlane_b32 s6, v3, 63
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
+; GFX8_DPP-NEXT:    v_readlane_b32 s7, v1, 63
+; GFX8_DPP-NEXT:    v_readlane_b32 s6, v4, 63
 ; GFX8_DPP-NEXT:    v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX8_DPP-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
 ; GFX8_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
 ; GFX8_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX8_DPP-NEXT:    s_cbranch_execz .LBB5_2
@@ -2970,8 +2967,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX8_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v8
 ; GFX8_DPP-NEXT:    v_readfirstlane_b32 s5, v7
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, v1
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, v2
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, v2
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, v3
 ; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8_DPP-NEXT:    v_add_u32_e32 v7, vcc, s5, v7
 ; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2984,68 +2981,65 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9_DPP-LABEL: add_i64_varying:
 ; GFX9_DPP:       ; %bb.0: ; %entry
 ; GFX9_DPP-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v0
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v7
 ; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v7, s[0:1]
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
-; GFX9_DPP-NEXT:    v_readlane_b32 s3, v4, 63
-; GFX9_DPP-NEXT:    v_readlane_b32 s2, v3, 63
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
+; GFX9_DPP-NEXT:    v_readlane_b32 s3, v1, 63
+; GFX9_DPP-NEXT:    v_readlane_b32 s2, v4, 63
 ; GFX9_DPP-NEXT:    v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
 ; GFX9_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
 ; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9_DPP-NEXT:    s_cbranch_execz .LBB5_2
@@ -3064,8 +3058,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9_DPP-NEXT:    v_readfirstlane_b32 s0, v8
 ; GFX9_DPP-NEXT:    v_readfirstlane_b32 s1, v7
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v1
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, v2
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v2
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, v3
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9_DPP-NEXT:    v_add_co_u32_e32 v7, vcc, s1, v7
 ; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3077,23 +3071,21 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX1064_DPP-LABEL: add_i64_varying:
 ; GFX1064_DPP:       ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, v0
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v9, s[0:1]
-; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v10, s[0:1]
+; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v5, 0, 0, s[0:1]
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v5
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v4
+; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v5, v6, vcc
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v7
@@ -3184,23 +3176,21 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX1032_DPP-LABEL: add_i64_varying:
 ; GFX1032_DPP:       ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, v0
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v9, s0
-; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v10, s0
+; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s0
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v5, 0, 0, s0
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v5
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v4
+; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v5, v6, vcc_lo
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v7
@@ -3275,31 +3265,30 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v9, 0
-; GFX1164_DPP-NEXT:    v_and_b32_e32 v8, 0x3ff, v0
+; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, v2
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v9, s[0:1]
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, 0, s[0:1]
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, v2
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v0, s[0:1]
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc
-; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc
+; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
-; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc
+; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -3388,53 +3377,53 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT:    v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0
+; GFX1132_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v8, s0
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v3, v2
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v9, s0
-; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, 0, s0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, v2
+; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v0, s0
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
-; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
-; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
 ; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_permlanex16_b32 v6, v3, -1, -1
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132_DPP-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
 ; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s4, v4, 31
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s7, v4, 15
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s8, v3, 15
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s5, v3, 31
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s6
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s6, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v1, s7, 16
@@ -3478,31 +3467,30 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1264_DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1264_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1264_DPP-NEXT:    v_mov_b32_e32 v9, 0
-; GFX1264_DPP-NEXT:    v_and_b32_e32 v8, 0x3ff, v0
+; GFX1264_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1264_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1264_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX1264_DPP-NEXT:    v_mov_b32_e32 v3, v2
-; GFX1264_DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v9, s[0:1]
-; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1264_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, 0, s[0:1]
+; GFX1264_DPP-NEXT:    v_mov_b32_e32 v4, v2
+; GFX1264_DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v0, s[0:1]
 ; GFX1264_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1264_DPP-NEXT:    v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc
-; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1264_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1264_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc
+; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1264_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
 ; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
 ; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1264_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1264_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
-; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc
+; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1264_DPP-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1264_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -3594,29 +3582,30 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1232_DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1232_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1232_DPP-NEXT:    v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0
+; GFX1232_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1232_DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1232_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v8, s0
-; GFX1232_DPP-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v6, v2
+; GFX1232_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1232_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, 0, s0
+; GFX1232_DPP-NEXT:    v_mov_b32_e32 v4, v2
+; GFX1232_DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v0, s0
+; GFX1232_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1232_DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v9, s0
-; GFX1232_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232_DPP-NEXT:    v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1232_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
-; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1232_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1232_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX1232_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1232_DPP-NEXT:    v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1232_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX1232_DPP-NEXT:    v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT:    v_mov_b32_e32 v4, v2
+; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1232_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX1232_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
+; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1232_DPP-NEXT:    v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1232_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1232_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1232_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1232_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
-; GFX1232_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1232_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX1232_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1232_DPP-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1232_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -6611,68 +6600,65 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX8_DPP-LABEL: sub_i64_varying:
 ; GFX8_DPP:       ; %bb.0: ; %entry
 ; GFX8_DPP-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, v0
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v7
 ; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8_DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v7, s[4:5]
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v8, s[4:5]
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s[4:5]
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[4:5]
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
-; GFX8_DPP-NEXT:    v_readlane_b32 s7, v4, 63
-; GFX8_DPP-NEXT:    v_readlane_b32 s6, v3, 63
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
+; GFX8_DPP-NEXT:    v_readlane_b32 s7, v1, 63
+; GFX8_DPP-NEXT:    v_readlane_b32 s6, v4, 63
 ; GFX8_DPP-NEXT:    v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX8_DPP-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
 ; GFX8_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
 ; GFX8_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX8_DPP-NEXT:    s_cbranch_execz .LBB11_2
@@ -6691,8 +6677,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX8_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v8
 ; GFX8_DPP-NEXT:    v_readfirstlane_b32 s5, v7
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, v1
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, v2
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, v2
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, v3
 ; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8_DPP-NEXT:    v_sub_u32_e32 v7, vcc, s5, v7
 ; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6705,68 +6691,65 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9_DPP-LABEL: sub_i64_varying:
 ; GFX9_DPP:       ; %bb.0: ; %entry
 ; GFX9_DPP-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v0
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v7
 ; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v7, s[0:1]
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
-; GFX9_DPP-NEXT:    v_readlane_b32 s3, v4, 63
-; GFX9_DPP-NEXT:    v_readlane_b32 s2, v3, 63
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
+; GFX9_DPP-NEXT:    v_readlane_b32 s3, v1, 63
+; GFX9_DPP-NEXT:    v_readlane_b32 s2, v4, 63
 ; GFX9_DPP-NEXT:    v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
 ; GFX9_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
 ; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9_DPP-NEXT:    s_cbranch_execz .LBB11_2
@@ -6785,8 +6768,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9_DPP-NEXT:    v_readfirstlane_b32 s0, v8
 ; GFX9_DPP-NEXT:    v_readfirstlane_b32 s1, v7
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v1
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, v2
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v2
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, v3
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9_DPP-NEXT:    v_sub_co_u32_e32 v7, vcc, s1, v7
 ; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6798,23 +6781,21 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX1064_DPP-LABEL: sub_i64_varying:
 ; GFX1064_DPP:       ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, v0
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v9, s[0:1]
-; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v10, s[0:1]
+; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v5, 0, 0, s[0:1]
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v5
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v4
+; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v5, v6, vcc
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v7
@@ -6905,23 +6886,21 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX1032_DPP-LABEL: sub_i64_varying:
 ; GFX1032_DPP:       ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, v0
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v9, s0
-; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v10, s0
+; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s0
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v5, 0, 0, s0
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v5
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v4
+; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v5, v6, vcc_lo
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v7
@@ -6996,31 +6975,30 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v9, 0
-; GFX1164_DPP-NEXT:    v_and_b32_e32 v8, 0x3ff, v0
+; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, v2
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v9, s[0:1]
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, 0, s[0:1]
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, v2
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v0, s[0:1]
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc
-; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc
+; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
-; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc
+; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -7109,53 +7087,53 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT:    v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0
+; GFX1132_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v8, s0
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v3, v2
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v9, s0
-; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, 0, s0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, v2
+; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v0, s0
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
-; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
-; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
 ; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_permlanex16_b32 v6, v3, -1, -1
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132_DPP-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
 ; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s4, v4, 31
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s7, v4, 15
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s8, v3, 15
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s5, v3, 31
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s6
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s6, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v1, s7, 16
@@ -7199,31 +7177,30 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1264_DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1264_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1264_DPP-NEXT:    v_mov_b32_e32 v9, 0
-; GFX1264_DPP-NEXT:    v_and_b32_e32 v8, 0x3ff, v0
+; GFX1264_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1264_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1264_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX1264_DPP-NEXT:    v_mov_b32_e32 v3, v2
-; GFX1264_DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v9, s[0:1]
-; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1264_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, 0, s[0:1]
+; GFX1264_DPP-NEXT:    v_mov_b32_e32 v4, v2
+; GFX1264_DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v0, s[0:1]
 ; GFX1264_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1264_DPP-NEXT:    v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc
-; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1264_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1264_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc
+; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1264_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
 ; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
 ; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1264_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1264_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
-; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc
+; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1264_DPP-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1264_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -7315,29 +7292,30 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1232_DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1232_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1232_DPP-NEXT:    v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0
+; GFX1232_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1232_DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1232_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v8, s0
-; GFX1232_DPP-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v6, v2
+; GFX1232_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1232_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, 0, s0
+; GFX1232_DPP-NEXT:    v_mov_b32_e32 v4, v2
+; GFX1232_DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v0, s0
+; GFX1232_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1232_DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v9, s0
-; GFX1232_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232_DPP-NEXT:    v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1232_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
-; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1232_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1232_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX1232_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1232_DPP-NEXT:    v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1232_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX1232_DPP-NEXT:    v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT:    v_mov_b32_e32 v4, v2
+; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1232_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX1232_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
+; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1232_DPP-NEXT:    v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1232_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1232_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1232_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1232_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
-; GFX1232_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1232_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX1232_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1232_DPP-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1232_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf

diff  --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 6bf03a202c1434..988bc8eec6e517 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -2361,84 +2361,82 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ;
 ; GFX8_DPP-LABEL: add_i64_varying:
 ; GFX8_DPP:       ; %bb.0: ; %entry
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, v0
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v9, 0
+; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v7
 ; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8_DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v7, s[0:1]
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
-; GFX8_DPP-NEXT:    v_readlane_b32 s5, v4, 63
-; GFX8_DPP-NEXT:    v_readlane_b32 s4, v3, 63
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
+; GFX8_DPP-NEXT:    v_readlane_b32 s5, v1, 63
+; GFX8_DPP-NEXT:    v_readlane_b32 s4, v4, 63
 ; GFX8_DPP-NEXT:    v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX8_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
+; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX8_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
 ; GFX8_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8_DPP-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX8_DPP-NEXT:  ; %bb.1:
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v10, s5
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v9, s4
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, s5
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, s4
 ; GFX8_DPP-NEXT:    s_mov_b32 m0, -1
-; GFX8_DPP-NEXT:    ds_add_rtn_u64 v[9:10], v8, v[9:10]
+; GFX8_DPP-NEXT:    ds_add_rtn_u64 v[7:8], v9, v[7:8]
 ; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8_DPP-NEXT:  .LBB6_2:
 ; GFX8_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8_DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, v1
-; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v10
-; GFX8_DPP-NEXT:    v_readfirstlane_b32 s5, v9
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, v2
+; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v8
+; GFX8_DPP-NEXT:    v_readfirstlane_b32 s5, v7
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, v2
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, v3
 ; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8_DPP-NEXT:    v_add_u32_e32 v7, vcc, s5, v7
 ; GFX8_DPP-NEXT:    s_mov_b32 s3, 0xf000
@@ -2450,83 +2448,81 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ;
 ; GFX9_DPP-LABEL: add_i64_varying:
 ; GFX9_DPP:       ; %bb.0: ; %entry
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v0
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v9, 0
+; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v7
 ; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v7, s[0:1]
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
-; GFX9_DPP-NEXT:    v_readlane_b32 s5, v4, 63
-; GFX9_DPP-NEXT:    v_readlane_b32 s4, v3, 63
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
+; GFX9_DPP-NEXT:    v_readlane_b32 s5, v1, 63
+; GFX9_DPP-NEXT:    v_readlane_b32 s4, v4, 63
 ; GFX9_DPP-NEXT:    v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
+; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX9_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
 ; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9_DPP-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX9_DPP-NEXT:  ; %bb.1:
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v10, s5
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v9, s4
-; GFX9_DPP-NEXT:    ds_add_rtn_u64 v[9:10], v8, v[9:10]
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, s5
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, s4
+; GFX9_DPP-NEXT:    ds_add_rtn_u64 v[7:8], v9, v[7:8]
 ; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9_DPP-NEXT:  .LBB6_2:
 ; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9_DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v1
-; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v10
-; GFX9_DPP-NEXT:    v_readfirstlane_b32 s5, v9
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, v2
+; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v8
+; GFX9_DPP-NEXT:    v_readfirstlane_b32 s5, v7
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v2
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, v3
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9_DPP-NEXT:    v_add_co_u32_e32 v7, vcc, s5, v7
 ; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
@@ -2538,23 +2534,21 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1064_DPP-LABEL: add_i64_varying:
 ; GFX1064_DPP:       ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v10, 0
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, v0
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v9, s[0:1]
-; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v10, s[0:1]
+; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v5, 0, 0, s[0:1]
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v5
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v4
+; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v5, v6, vcc
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v7
@@ -2605,57 +2599,56 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v2, s6, 32
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v1, s7, 32
 ; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v9, exec_hi, v0
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX1064_DPP-NEXT:    s_mov_b64 s[0:1], s[4:5]
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v2, s9, 48
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v1, s8, 48
 ; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
 ; GFX1064_DPP-NEXT:    s_mov_b32 s6, -1
-; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr11_vgpr12
+; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
 ; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1064_DPP-NEXT:  ; %bb.1:
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v12, s1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v11, s0
-; GFX1064_DPP-NEXT:    ds_add_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v10, s1
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, s0
+; GFX1064_DPP-NEXT:    ds_add_rtn_u64 v[9:10], v0, v[9:10]
 ; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064_DPP-NEXT:    buffer_gl0_inv
 ; GFX1064_DPP-NEXT:  .LBB6_2:
 ; GFX1064_DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064_DPP-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s0, v11
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, v1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v10, v2
-; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s1, v12
+; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s0, v9
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v11, v1
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v12, v2
+; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s1, v10
 ; GFX1064_DPP-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1064_DPP-NEXT:    v_add_co_u32 v9, vcc, s0, v9
-; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v10, vcc, s1, v10, vcc
+; GFX1064_DPP-NEXT:    v_add_co_u32 v9, vcc, s0, v11
+; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v10, vcc, s1, v12, vcc
 ; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064_DPP-NEXT:    buffer_store_dwordx2 v[9:10], off, s[4:7], 0
 ; GFX1064_DPP-NEXT:    s_endpgm
 ;
 ; GFX1032_DPP-LABEL: add_i64_varying:
 ; GFX1032_DPP:       ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v10, 0
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, v0
 ; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v9, s4
-; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v10, s4
+; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s4
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v5, 0, 0, s4
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v5
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v4
+; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v5, v6, vcc_lo
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v7
@@ -2685,33 +2678,34 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v4, 31
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v2, s6, 16
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v1, s5, 16
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v9
 ; GFX1032_DPP-NEXT:    s_mov_b32 s6, -1
-; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr11_vgpr12
+; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
 ; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1032_DPP-NEXT:  ; %bb.1:
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v12, s1
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v11, s0
-; GFX1032_DPP-NEXT:    ds_add_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v10, s1
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, s0
+; GFX1032_DPP-NEXT:    ds_add_rtn_u64 v[9:10], v0, v[9:10]
 ; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032_DPP-NEXT:    buffer_gl0_inv
 ; GFX1032_DPP-NEXT:  .LBB6_2:
 ; GFX1032_DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1032_DPP-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s0, v11
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, v1
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v10, v2
-; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s1, v12
+; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s0, v9
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v11, v1
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v12, v2
+; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s1, v10
 ; GFX1032_DPP-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1032_DPP-NEXT:    v_add_co_u32 v9, vcc_lo, s0, v9
-; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, s1, v10, vcc_lo
+; GFX1032_DPP-NEXT:    v_add_co_u32 v9, vcc_lo, s0, v11
+; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, s1, v12, vcc_lo
 ; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032_DPP-NEXT:    buffer_store_dwordx2 v[9:10], off, s[4:7], 0
 ; GFX1032_DPP-NEXT:    s_endpgm
@@ -2722,31 +2716,30 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v9, 0
-; GFX1164_DPP-NEXT:    v_and_b32_e32 v8, 0x3ff, v0
+; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, v2
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v9, s[0:1]
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, 0, s[0:1]
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, v2
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v0, s[0:1]
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc
-; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc
+; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
-; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc
+; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -2790,34 +2783,35 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_writelane_b32 v2, s7, 32
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX1164_DPP-NEXT:    s_mov_b64 s[0:1], s[4:5]
 ; GFX1164_DPP-NEXT:    v_writelane_b32 v1, s8, 48
 ; GFX1164_DPP-NEXT:    v_writelane_b32 v2, s9, 48
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX1164_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v8
 ; GFX1164_DPP-NEXT:    s_mov_b32 s6, -1
-; GFX1164_DPP-NEXT:    s_mov_b64 s[4:5], exec
-; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr10_vgpr11
-; GFX1164_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; GFX1164_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1164_DPP-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1164_DPP-NEXT:  ; %bb.1:
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v11, s1
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v10, s0
-; GFX1164_DPP-NEXT:    ds_add_rtn_u64 v[10:11], v9, v[10:11]
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v9, s1
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, s0
+; GFX1164_DPP-NEXT:    ds_add_rtn_u64 v[8:9], v0, v[8:9]
 ; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_DPP-NEXT:    buffer_gl0_inv
 ; GFX1164_DPP-NEXT:  .LBB6_2:
 ; GFX1164_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1164_DPP-NEXT:    s_load_b64 s[4:5], s[2:3], 0x24
-; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s0, v10
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, v1
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v9, v2
-; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s1, v11
+; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s0, v8
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v10, v1
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v11, v2
+; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s1, v9
 ; GFX1164_DPP-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_add_co_u32 v8, vcc, s0, v8
-; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v9, vcc, s1, v9, vcc
+; GFX1164_DPP-NEXT:    v_add_co_u32 v8, vcc, s0, v10
+; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v9, vcc, s1, v11, vcc
 ; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[4:7], 0
 ; GFX1164_DPP-NEXT:    s_nop 0
@@ -2830,77 +2824,77 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT:    v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0
+; GFX1132_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v8, s4
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v3, v2
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v9, s4
-; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, 0, s4
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, v2
+; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v0, s4
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
-; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
-; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
 ; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_permlanex16_b32 v6, v3, -1, -1
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s0, v4, 31
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s5, v4, 15
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v3, 15
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v3, 31
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v1, s5, 16
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v2, s6, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v8
 ; GFX1132_DPP-NEXT:    s_mov_b32 s6, -1
-; GFX1132_DPP-NEXT:    s_mov_b32 s4, exec_lo
-; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr10_vgpr11
-; GFX1132_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
-; GFX1132_DPP-NEXT:    v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
-; GFX1132_DPP-NEXT:    ds_add_rtn_u64 v[10:11], v9, v[10:11]
+; GFX1132_DPP-NEXT:    v_dual_mov_b32 v9, s1 :: v_dual_mov_b32 v8, s0
+; GFX1132_DPP-NEXT:    ds_add_rtn_u64 v[8:9], v0, v[8:9]
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_gl0_inv
 ; GFX1132_DPP-NEXT:  .LBB6_2:
 ; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1132_DPP-NEXT:    s_load_b64 s[4:5], s[2:3], 0x24
-; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s0, v10
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v8, v1
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v9, v2
-; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s1, v11
+; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s0, v8
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v10, v1
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v11, v2
+; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s1, v9
 ; GFX1132_DPP-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT:    v_add_co_u32 v8, vcc_lo, s0, v8
-; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, s1, v9, vcc_lo
+; GFX1132_DPP-NEXT:    v_add_co_u32 v8, vcc_lo, s0, v10
+; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, s1, v11, vcc_lo
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[4:7], 0
 ; GFX1132_DPP-NEXT:    s_nop 0
@@ -3163,164 +3157,158 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
 ;
 ; GFX8_DPP-LABEL: add_i64_varying_nouse:
 ; GFX8_DPP:       ; %bb.0: ; %entry
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, v0
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, 0
+; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v8, exec_hi, v8
 ; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8_DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v7, s[0:1]
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
-; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v1, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
 ; GFX8_DPP-NEXT:    s_nop 0
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX8_DPP-NEXT:    v_mov_b32_dpp v2, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
-; GFX8_DPP-NEXT:    v_addc_u32_e32 v2, vcc, v4, v2, vcc
-; GFX8_DPP-NEXT:    v_readlane_b32 s3, v2, 63
-; GFX8_DPP-NEXT:    v_readlane_b32 s2, v1, 63
+; GFX8_DPP-NEXT:    s_nop 0
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT:    v_add_u32_e32 v2, vcc, v4, v2
+; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8_DPP-NEXT:    v_readlane_b32 s3, v1, 63
+; GFX8_DPP-NEXT:    v_readlane_b32 s2, v2, 63
 ; GFX8_DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX8_DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
-; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v8
 ; GFX8_DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX8_DPP-NEXT:    s_cbranch_execz .LBB7_2
 ; GFX8_DPP-NEXT:  ; %bb.1:
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v10, s1
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v9, s0
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v9, s1
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, s0
 ; GFX8_DPP-NEXT:    s_mov_b32 m0, -1
-; GFX8_DPP-NEXT:    ds_add_u64 v8, v[9:10]
+; GFX8_DPP-NEXT:    ds_add_u64 v7, v[8:9]
 ; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8_DPP-NEXT:  .LBB7_2:
 ; GFX8_DPP-NEXT:    s_endpgm
 ;
 ; GFX9_DPP-LABEL: add_i64_varying_nouse:
 ; GFX9_DPP:       ; %bb.0: ; %entry
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v0
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, 0
+; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v8, exec_hi, v8
 ; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v7, s[0:1]
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
-; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v1, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
 ; GFX9_DPP-NEXT:    s_nop 0
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX9_DPP-NEXT:    v_mov_b32_dpp v2, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT:    v_add_co_u32_e32 v1, vcc, v3, v1
-; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v2, vcc, v4, v2, vcc
-; GFX9_DPP-NEXT:    v_readlane_b32 s3, v2, 63
-; GFX9_DPP-NEXT:    v_readlane_b32 s2, v1, 63
+; GFX9_DPP-NEXT:    s_nop 0
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v2
+; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9_DPP-NEXT:    v_readlane_b32 s3, v1, 63
+; GFX9_DPP-NEXT:    v_readlane_b32 s2, v2, 63
 ; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9_DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
-; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v8
 ; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9_DPP-NEXT:    s_cbranch_execz .LBB7_2
 ; GFX9_DPP-NEXT:  ; %bb.1:
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v10, s1
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v9, s0
-; GFX9_DPP-NEXT:    ds_add_u64 v8, v[9:10]
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v9, s1
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, s0
+; GFX9_DPP-NEXT:    ds_add_u64 v7, v[8:9]
 ; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9_DPP-NEXT:  .LBB7_2:
 ; GFX9_DPP-NEXT:    s_endpgm
 ;
 ; GFX1064_DPP-LABEL: add_i64_varying_nouse:
 ; GFX1064_DPP:       ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v10, 0
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, v0
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v9, s[0:1]
-; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v10, s[0:1]
+; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v5, 0, 0, s[0:1]
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v5
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v4
+; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v5, v6, vcc
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v7
@@ -3345,16 +3333,17 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v1, 32
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s5, v2, 32
 ; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v9, exec_hi, v0
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1064_DPP-NEXT:    s_add_u32 s0, s3, s4
 ; GFX1064_DPP-NEXT:    s_addc_u32 s1, s2, s5
-; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
 ; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB7_2
 ; GFX1064_DPP-NEXT:  ; %bb.1:
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v12, s1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v11, s0
-; GFX1064_DPP-NEXT:    ds_add_u64 v10, v[11:12]
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v10, s1
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, s0
+; GFX1064_DPP-NEXT:    ds_add_u64 v0, v[9:10]
 ; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064_DPP-NEXT:    buffer_gl0_inv
 ; GFX1064_DPP-NEXT:  .LBB7_2:
@@ -3362,25 +3351,21 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
 ;
 ; GFX1032_DPP-LABEL: add_i64_varying_nouse:
 ; GFX1032_DPP:       ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, v0
 ; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v10, 0
-; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v9, s0
-; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v10, s0
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s0
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v5, 0, 0, s0
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v1
-; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v5
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v4
+; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v5, v6, vcc_lo
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v7
@@ -3398,14 +3383,15 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
 ; GFX1032_DPP-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v3
 ; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v11, v1
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v12, v2
-; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v11, exec_lo, 0
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, v1
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v10, v2
+; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v11
 ; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB7_2
 ; GFX1032_DPP-NEXT:  ; %bb.1:
-; GFX1032_DPP-NEXT:    ds_add_u64 v10, v[11:12]
+; GFX1032_DPP-NEXT:    ds_add_u64 v0, v[9:10]
 ; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032_DPP-NEXT:    buffer_gl0_inv
 ; GFX1032_DPP-NEXT:  .LBB7_2:
@@ -3417,30 +3403,29 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164_DPP-NEXT:    v_and_b32_e32 v6, 0x3ff, v0
+; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v6, s[0:1]
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v7, s[0:1]
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, v2
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s[0:1]
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v2
-; GFX1164_DPP-NEXT:    v_mov_b32_dpp v1, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v4, v1, vcc
-; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, v2
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc
+; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v5, vcc
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc
-; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc
+; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v4, v4 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc
@@ -3460,15 +3445,16 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
 ; GFX1164_DPP-NEXT:    v_add_co_u32 v2, vcc, v2, v3
 ; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v1, v4, vcc
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v9, v3
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, v3
 ; GFX1164_DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1164_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v8
 ; GFX1164_DPP-NEXT:    s_cbranch_execz .LBB7_2
 ; GFX1164_DPP-NEXT:  ; %bb.1:
-; GFX1164_DPP-NEXT:    ds_add_u64 v7, v[8:9]
+; GFX1164_DPP-NEXT:    ds_add_u64 v0, v[6:7]
 ; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_DPP-NEXT:    buffer_gl0_inv
 ; GFX1164_DPP-NEXT:  .LBB7_2:
@@ -3480,49 +3466,49 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT:    v_dual_mov_b32 v7, 0 :: v_dual_and_b32 v6, 0x3ff, v0
+; GFX1132_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v6, s0
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v7, s0
-; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v3, v2
+; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s0
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT:    v_mov_b32_dpp v1, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v4, v1, vcc_lo
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
-; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc_lo, v4, v4 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v3, 0, 0
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_permlanex16_b32 v4, v1, 0, 0
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_add_co_u32 v2, vcc_lo, v3, v2
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v1, v4, vcc_lo
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v9, v3
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132_DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v7, v3
 ; GFX1132_DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v8
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB7_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
-; GFX1132_DPP-NEXT:    ds_add_u64 v7, v[8:9]
+; GFX1132_DPP-NEXT:    ds_add_u64 v0, v[6:7]
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_gl0_inv
 ; GFX1132_DPP-NEXT:  .LBB7_2:
@@ -5909,84 +5895,82 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ;
 ; GFX8_DPP-LABEL: sub_i64_varying:
 ; GFX8_DPP:       ; %bb.0: ; %entry
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, v0
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v9, 0
+; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v7
 ; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8_DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v7, s[0:1]
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
-; GFX8_DPP-NEXT:    v_readlane_b32 s5, v4, 63
-; GFX8_DPP-NEXT:    v_readlane_b32 s4, v3, 63
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
+; GFX8_DPP-NEXT:    v_readlane_b32 s5, v1, 63
+; GFX8_DPP-NEXT:    v_readlane_b32 s4, v4, 63
 ; GFX8_DPP-NEXT:    v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX8_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
+; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX8_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
 ; GFX8_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8_DPP-NEXT:    s_cbranch_execz .LBB14_2
 ; GFX8_DPP-NEXT:  ; %bb.1:
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v10, s5
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v9, s4
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, s5
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, s4
 ; GFX8_DPP-NEXT:    s_mov_b32 m0, -1
-; GFX8_DPP-NEXT:    ds_sub_rtn_u64 v[9:10], v8, v[9:10]
+; GFX8_DPP-NEXT:    ds_sub_rtn_u64 v[7:8], v9, v[7:8]
 ; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8_DPP-NEXT:  .LBB14_2:
 ; GFX8_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8_DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, v1
-; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v10
-; GFX8_DPP-NEXT:    v_readfirstlane_b32 s5, v9
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, v2
+; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v8
+; GFX8_DPP-NEXT:    v_readfirstlane_b32 s5, v7
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, v2
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, v3
 ; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8_DPP-NEXT:    v_sub_u32_e32 v7, vcc, s5, v7
 ; GFX8_DPP-NEXT:    s_mov_b32 s3, 0xf000
@@ -5998,83 +5982,81 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ;
 ; GFX9_DPP-LABEL: sub_i64_varying:
 ; GFX9_DPP:       ; %bb.0: ; %entry
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v0
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v9, 0
+; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v7
 ; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v7, s[0:1]
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
-; GFX9_DPP-NEXT:    v_readlane_b32 s5, v4, 63
-; GFX9_DPP-NEXT:    v_readlane_b32 s4, v3, 63
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
+; GFX9_DPP-NEXT:    v_readlane_b32 s5, v1, 63
+; GFX9_DPP-NEXT:    v_readlane_b32 s4, v4, 63
 ; GFX9_DPP-NEXT:    v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
+; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX9_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
 ; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9_DPP-NEXT:    s_cbranch_execz .LBB14_2
 ; GFX9_DPP-NEXT:  ; %bb.1:
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v10, s5
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v9, s4
-; GFX9_DPP-NEXT:    ds_sub_rtn_u64 v[9:10], v8, v[9:10]
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, s5
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, s4
+; GFX9_DPP-NEXT:    ds_sub_rtn_u64 v[7:8], v9, v[7:8]
 ; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9_DPP-NEXT:  .LBB14_2:
 ; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9_DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v1
-; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v10
-; GFX9_DPP-NEXT:    v_readfirstlane_b32 s5, v9
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, v2
+; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v8
+; GFX9_DPP-NEXT:    v_readfirstlane_b32 s5, v7
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v2
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, v3
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9_DPP-NEXT:    v_sub_co_u32_e32 v7, vcc, s5, v7
 ; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
@@ -6086,23 +6068,21 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1064_DPP-LABEL: sub_i64_varying:
 ; GFX1064_DPP:       ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v10, 0
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, v0
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v9, s[0:1]
-; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v10, s[0:1]
+; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v5, 0, 0, s[0:1]
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v5
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v4
+; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v5, v6, vcc
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v7
@@ -6153,57 +6133,56 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v2, s6, 32
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v1, s7, 32
 ; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v9, exec_hi, v0
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX1064_DPP-NEXT:    s_mov_b64 s[0:1], s[4:5]
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v2, s9, 48
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v1, s8, 48
 ; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
 ; GFX1064_DPP-NEXT:    s_mov_b32 s6, -1
-; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr11_vgpr12
+; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
 ; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB14_2
 ; GFX1064_DPP-NEXT:  ; %bb.1:
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v12, s1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v11, s0
-; GFX1064_DPP-NEXT:    ds_sub_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v10, s1
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, s0
+; GFX1064_DPP-NEXT:    ds_sub_rtn_u64 v[9:10], v0, v[9:10]
 ; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064_DPP-NEXT:    buffer_gl0_inv
 ; GFX1064_DPP-NEXT:  .LBB14_2:
 ; GFX1064_DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064_DPP-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s0, v11
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, v1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v10, v2
-; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s1, v12
+; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s0, v9
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v11, v1
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v12, v2
+; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s1, v10
 ; GFX1064_DPP-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1064_DPP-NEXT:    v_sub_co_u32 v9, vcc, s0, v9
-; GFX1064_DPP-NEXT:    v_sub_co_ci_u32_e32 v10, vcc, s1, v10, vcc
+; GFX1064_DPP-NEXT:    v_sub_co_u32 v9, vcc, s0, v11
+; GFX1064_DPP-NEXT:    v_sub_co_ci_u32_e32 v10, vcc, s1, v12, vcc
 ; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064_DPP-NEXT:    buffer_store_dwordx2 v[9:10], off, s[4:7], 0
 ; GFX1064_DPP-NEXT:    s_endpgm
 ;
 ; GFX1032_DPP-LABEL: sub_i64_varying:
 ; GFX1032_DPP:       ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v10, 0
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, v0
 ; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v9, s4
-; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v10, s4
+; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s4
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v5, 0, 0, s4
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v5
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v4
+; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v5, v6, vcc_lo
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v7
@@ -6233,33 +6212,34 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v4, 31
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v2, s6, 16
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v1, s5, 16
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v9
 ; GFX1032_DPP-NEXT:    s_mov_b32 s6, -1
-; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr11_vgpr12
+; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
 ; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB14_2
 ; GFX1032_DPP-NEXT:  ; %bb.1:
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v12, s1
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v11, s0
-; GFX1032_DPP-NEXT:    ds_sub_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v10, s1
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, s0
+; GFX1032_DPP-NEXT:    ds_sub_rtn_u64 v[9:10], v0, v[9:10]
 ; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032_DPP-NEXT:    buffer_gl0_inv
 ; GFX1032_DPP-NEXT:  .LBB14_2:
 ; GFX1032_DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1032_DPP-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s0, v11
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, v1
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v10, v2
-; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s1, v12
+; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s0, v9
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v11, v1
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v12, v2
+; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s1, v10
 ; GFX1032_DPP-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1032_DPP-NEXT:    v_sub_co_u32 v9, vcc_lo, s0, v9
-; GFX1032_DPP-NEXT:    v_sub_co_ci_u32_e32 v10, vcc_lo, s1, v10, vcc_lo
+; GFX1032_DPP-NEXT:    v_sub_co_u32 v9, vcc_lo, s0, v11
+; GFX1032_DPP-NEXT:    v_sub_co_ci_u32_e32 v10, vcc_lo, s1, v12, vcc_lo
 ; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032_DPP-NEXT:    buffer_store_dwordx2 v[9:10], off, s[4:7], 0
 ; GFX1032_DPP-NEXT:    s_endpgm
@@ -6270,31 +6250,30 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v9, 0
-; GFX1164_DPP-NEXT:    v_and_b32_e32 v8, 0x3ff, v0
+; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, v2
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v9, s[0:1]
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, 0, s[0:1]
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, v2
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v0, s[0:1]
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc
-; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc
+; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
-; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc
+; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -6338,34 +6317,35 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_writelane_b32 v2, s7, 32
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX1164_DPP-NEXT:    s_mov_b64 s[0:1], s[4:5]
 ; GFX1164_DPP-NEXT:    v_writelane_b32 v1, s8, 48
 ; GFX1164_DPP-NEXT:    v_writelane_b32 v2, s9, 48
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX1164_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v8
 ; GFX1164_DPP-NEXT:    s_mov_b32 s6, -1
-; GFX1164_DPP-NEXT:    s_mov_b64 s[4:5], exec
-; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr10_vgpr11
-; GFX1164_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; GFX1164_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1164_DPP-NEXT:    s_cbranch_execz .LBB14_2
 ; GFX1164_DPP-NEXT:  ; %bb.1:
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v11, s1
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v10, s0
-; GFX1164_DPP-NEXT:    ds_sub_rtn_u64 v[10:11], v9, v[10:11]
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v9, s1
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, s0
+; GFX1164_DPP-NEXT:    ds_sub_rtn_u64 v[8:9], v0, v[8:9]
 ; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_DPP-NEXT:    buffer_gl0_inv
 ; GFX1164_DPP-NEXT:  .LBB14_2:
 ; GFX1164_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1164_DPP-NEXT:    s_load_b64 s[4:5], s[2:3], 0x24
-; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s0, v10
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, v1
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v9, v2
-; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s1, v11
+; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s0, v8
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v10, v1
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v11, v2
+; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s1, v9
 ; GFX1164_DPP-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_sub_co_u32 v8, vcc, s0, v8
-; GFX1164_DPP-NEXT:    v_sub_co_ci_u32_e32 v9, vcc, s1, v9, vcc
+; GFX1164_DPP-NEXT:    v_sub_co_u32 v8, vcc, s0, v10
+; GFX1164_DPP-NEXT:    v_sub_co_ci_u32_e32 v9, vcc, s1, v11, vcc
 ; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[4:7], 0
 ; GFX1164_DPP-NEXT:    s_nop 0
@@ -6378,77 +6358,77 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT:    v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0
+; GFX1132_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v8, s4
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v3, v2
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v9, s4
-; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, 0, s4
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, v2
+; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v0, s4
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
-; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
-; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
 ; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_permlanex16_b32 v6, v3, -1, -1
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s0, v4, 31
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s5, v4, 15
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v3, 15
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v3, 31
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v1, s5, 16
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v2, s6, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v8
 ; GFX1132_DPP-NEXT:    s_mov_b32 s6, -1
-; GFX1132_DPP-NEXT:    s_mov_b32 s4, exec_lo
-; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr10_vgpr11
-; GFX1132_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB14_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
-; GFX1132_DPP-NEXT:    v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
-; GFX1132_DPP-NEXT:    ds_sub_rtn_u64 v[10:11], v9, v[10:11]
+; GFX1132_DPP-NEXT:    v_dual_mov_b32 v9, s1 :: v_dual_mov_b32 v8, s0
+; GFX1132_DPP-NEXT:    ds_sub_rtn_u64 v[8:9], v0, v[8:9]
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_gl0_inv
 ; GFX1132_DPP-NEXT:  .LBB14_2:
 ; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1132_DPP-NEXT:    s_load_b64 s[4:5], s[2:3], 0x24
-; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s0, v10
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v8, v1
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v9, v2
-; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s1, v11
+; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s0, v8
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v10, v1
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v11, v2
+; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s1, v9
 ; GFX1132_DPP-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT:    v_sub_co_u32 v8, vcc_lo, s0, v8
-; GFX1132_DPP-NEXT:    v_sub_co_ci_u32_e32 v9, vcc_lo, s1, v9, vcc_lo
+; GFX1132_DPP-NEXT:    v_sub_co_u32 v8, vcc_lo, s0, v10
+; GFX1132_DPP-NEXT:    v_sub_co_ci_u32_e32 v9, vcc_lo, s1, v11, vcc_lo
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[4:7], 0
 ; GFX1132_DPP-NEXT:    s_nop 0
@@ -7432,129 +7412,121 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
 ;
 ; GFX8_DPP-LABEL: and_i64_varying:
 ; GFX8_DPP:       ; %bb.0: ; %entry
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v0
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v5, exec_lo, 0
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, 0
+; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v5, exec_hi, v5
 ; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v3, -1, v5, s[0:1]
-; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v4, -1, v6, s[0:1]
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v1, -1
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v2, -1
-; GFX8_DPP-NEXT:    v_and_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_and_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_and_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_and_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_and_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_and_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_and_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_and_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v1, -1, 0, s[0:1]
+; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v2, -1, v0, s[0:1]
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, -1
+; GFX8_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX8_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, -1
+; GFX8_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_and_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT:    v_and_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_and_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT:    v_and_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT:    v_readlane_b32 s5, v4, 63
-; GFX8_DPP-NEXT:    v_readlane_b32 s4, v3, 63
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT:    s_nop 0
+; GFX8_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT:    v_readlane_b32 s5, v1, 63
+; GFX8_DPP-NEXT:    v_readlane_b32 s4, v2, 63
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX8_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
+; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
+; GFX8_DPP-NEXT:    ; implicit-def: $vgpr5_vgpr6
 ; GFX8_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8_DPP-NEXT:    s_cbranch_execz .LBB16_2
 ; GFX8_DPP-NEXT:  ; %bb.1:
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, s5
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, s4
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, s5
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, s4
 ; GFX8_DPP-NEXT:    s_mov_b32 m0, -1
-; GFX8_DPP-NEXT:    ds_and_rtn_b64 v[7:8], v6, v[7:8]
+; GFX8_DPP-NEXT:    ds_and_rtn_b64 v[5:6], v7, v[5:6]
 ; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8_DPP-NEXT:  .LBB16_2:
 ; GFX8_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX8_DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v8
-; GFX8_DPP-NEXT:    v_readfirstlane_b32 s5, v7
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT:    s_mov_b32 s3, 0xf000
-; GFX8_DPP-NEXT:    s_mov_b32 s2, -1
-; GFX8_DPP-NEXT:    v_and_b32_e32 v6, s4, v6
-; GFX8_DPP-NEXT:    v_and_b32_e32 v5, s5, v5
+; GFX8_DPP-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX8_DPP-NEXT:    v_readfirstlane_b32 s0, v6
+; GFX8_DPP-NEXT:    v_readfirstlane_b32 s1, v5
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8_DPP-NEXT:    s_mov_b32 s7, 0xf000
+; GFX8_DPP-NEXT:    v_and_b32_e32 v6, s0, v6
+; GFX8_DPP-NEXT:    v_and_b32_e32 v5, s1, v5
 ; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8_DPP-NEXT:    buffer_store_dwordx2 v[5:6], off, s[0:3], 0
+; GFX8_DPP-NEXT:    buffer_store_dwordx2 v[5:6], off, s[4:7], 0
 ; GFX8_DPP-NEXT:    s_endpgm
 ;
 ; GFX9_DPP-LABEL: and_i64_varying:
 ; GFX9_DPP:       ; %bb.0: ; %entry
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v0
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v5, exec_lo, 0
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, 0
+; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v5, exec_hi, v5
 ; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v3, -1, v5, s[0:1]
-; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v4, -1, v6, s[0:1]
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v1, -1
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v2, -1
-; GFX9_DPP-NEXT:    v_and_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_and_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_and_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_and_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_and_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_and_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v1, -1, 0, s[0:1]
+; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v2, -1, v0, s[0:1]
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, -1
+; GFX9_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX9_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, -1
+; GFX9_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_and_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_and_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_and_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT:    v_and_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_and_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT:    v_and_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT:    v_readlane_b32 s5, v4, 63
-; GFX9_DPP-NEXT:    v_readlane_b32 s4, v3, 63
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT:    v_readlane_b32 s5, v1, 63
+; GFX9_DPP-NEXT:    v_readlane_b32 s4, v2, 63
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
+; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
+; GFX9_DPP-NEXT:    ; implicit-def: $vgpr5_vgpr6
 ; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9_DPP-NEXT:    s_cbranch_execz .LBB16_2
 ; GFX9_DPP-NEXT:  ; %bb.1:
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, s5
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, s4
-; GFX9_DPP-NEXT:    ds_and_rtn_b64 v[7:8], v6, v[7:8]
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, s5
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, s4
+; GFX9_DPP-NEXT:    ds_and_rtn_b64 v[5:6], v7, v[5:6]
 ; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9_DPP-NEXT:  .LBB16_2:
 ; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX9_DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v8
-; GFX9_DPP-NEXT:    v_readfirstlane_b32 s5, v7
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9_DPP-NEXT:    s_mov_b32 s2, -1
-; GFX9_DPP-NEXT:    v_and_b32_e32 v6, s4, v6
-; GFX9_DPP-NEXT:    v_and_b32_e32 v5, s5, v5
+; GFX9_DPP-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX9_DPP-NEXT:    v_readfirstlane_b32 s0, v6
+; GFX9_DPP-NEXT:    v_readfirstlane_b32 s1, v5
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9_DPP-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9_DPP-NEXT:    v_and_b32_e32 v6, s0, v6
+; GFX9_DPP-NEXT:    v_and_b32_e32 v5, s1, v5
 ; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9_DPP-NEXT:    buffer_store_dwordx2 v[5:6], off, s[0:3], 0
+; GFX9_DPP-NEXT:    buffer_store_dwordx2 v[5:6], off, s[4:7], 0
 ; GFX9_DPP-NEXT:    s_endpgm
 ;
 ; GFX1064_DPP-LABEL: and_i64_varying:
 ; GFX1064_DPP:       ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v0
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v1, -1, v7, s[0:1]
-; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v2, -1, v8, s[0:1]
-; GFX1064_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v1, -1, 0, s[0:1]
+; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v2, -1, v0, s[0:1]
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, -1
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, -1
+; GFX1064_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -7575,63 +7547,62 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v2, 15
-; GFX1064_DPP-NEXT:    v_readlane_b32 s5, v1, 15
-; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v2, 31
-; GFX1064_DPP-NEXT:    v_readlane_b32 s7, v1, 31
-; GFX1064_DPP-NEXT:    v_readlane_b32 s8, v1, 47
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v1, 15
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_readlane_b32 s5, v2, 15
+; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v1, 31
+; GFX1064_DPP-NEXT:    v_readlane_b32 s7, v2, 31
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v6, s4, 16
+; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v2, 63
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v5, s5, 16
-; GFX1064_DPP-NEXT:    v_readlane_b32 s5, v2, 63
-; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1064_DPP-NEXT:    v_readlane_b32 s5, v1, 63
+; GFX1064_DPP-NEXT:    v_readlane_b32 s8, v1, 47
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s9, v2, 47
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v6, s6, 32
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v5, s7, 32
 ; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX1064_DPP-NEXT:    s_mov_b64 s[0:1], s[4:5]
-; GFX1064_DPP-NEXT:    v_writelane_b32 v6, s9, 48
-; GFX1064_DPP-NEXT:    v_writelane_b32 v5, s8, 48
+; GFX1064_DPP-NEXT:    v_writelane_b32 v6, s8, 48
+; GFX1064_DPP-NEXT:    v_writelane_b32 v5, s9, 48
 ; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
 ; GFX1064_DPP-NEXT:    s_mov_b32 s6, -1
-; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
+; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
 ; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB16_2
 ; GFX1064_DPP-NEXT:  ; %bb.1:
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v10, s1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, s0
-; GFX1064_DPP-NEXT:    ds_and_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, s1
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, s0
+; GFX1064_DPP-NEXT:    ds_and_rtn_b64 v[7:8], v0, v[7:8]
 ; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064_DPP-NEXT:    buffer_gl0_inv
 ; GFX1064_DPP-NEXT:  .LBB16_2:
 ; GFX1064_DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064_DPP-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v5
-; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s0, v10
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, v6
-; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s1, v9
+; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s0, v8
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, v5
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, v6
+; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s1, v7
 ; GFX1064_DPP-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1064_DPP-NEXT:    v_and_b32_e32 v8, s0, v8
-; GFX1064_DPP-NEXT:    v_and_b32_e32 v7, s1, v7
+; GFX1064_DPP-NEXT:    v_and_b32_e32 v9, s0, v9
+; GFX1064_DPP-NEXT:    v_and_b32_e32 v8, s1, v8
 ; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[4:7], 0
+; GFX1064_DPP-NEXT:    buffer_store_dwordx2 v[8:9], off, s[4:7], 0
 ; GFX1064_DPP-NEXT:    s_endpgm
 ;
 ; GFX1032_DPP-LABEL: and_i64_varying:
 ; GFX1032_DPP:       ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v0
 ; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s4, -1
-; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v1, -1, v7, s4
-; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v2, -1, v8, s4
-; GFX1032_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v1, -1, 0, s4
+; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v2, -1, v0, s4
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, -1
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, -1
+; GFX1032_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -7643,199 +7614,201 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    v_permlanex16_b32 v4, v2, -1, -1
 ; GFX1032_DPP-NEXT:    v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032_DPP-NEXT:    v_readlane_b32 s0, v1, 31
-; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT:    v_readlane_b32 s5, v1, 15
+; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v1, 31
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s6, v2, 15
-; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v2, 31
-; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_readlane_b32 s0, v2, 31
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_readlane_b32 s5, v1, 15
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s4, -1
-; GFX1032_DPP-NEXT:    v_writelane_b32 v6, s6, 16
-; GFX1032_DPP-NEXT:    v_writelane_b32 v5, s5, 16
+; GFX1032_DPP-NEXT:    v_writelane_b32 v6, s5, 16
+; GFX1032_DPP-NEXT:    v_writelane_b32 v5, s6, 16
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
 ; GFX1032_DPP-NEXT:    s_mov_b32 s6, -1
-; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
+; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
 ; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB16_2
 ; GFX1032_DPP-NEXT:  ; %bb.1:
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v10, s1
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, s0
-; GFX1032_DPP-NEXT:    ds_and_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, s1
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, s0
+; GFX1032_DPP-NEXT:    ds_and_rtn_b64 v[7:8], v0, v[7:8]
 ; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032_DPP-NEXT:    buffer_gl0_inv
 ; GFX1032_DPP-NEXT:  .LBB16_2:
 ; GFX1032_DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1032_DPP-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v5
-; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s0, v10
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, v6
-; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s1, v9
+; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s0, v8
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, v5
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, v6
+; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s1, v7
 ; GFX1032_DPP-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1032_DPP-NEXT:    v_and_b32_e32 v8, s0, v8
-; GFX1032_DPP-NEXT:    v_and_b32_e32 v7, s1, v7
+; GFX1032_DPP-NEXT:    v_and_b32_e32 v9, s0, v9
+; GFX1032_DPP-NEXT:    v_and_b32_e32 v8, s1, v8
 ; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[4:7], 0
+; GFX1032_DPP-NEXT:    buffer_store_dwordx2 v[8:9], off, s[4:7], 0
 ; GFX1032_DPP-NEXT:    s_endpgm
 ;
 ; GFX1164_DPP-LABEL: and_i64_varying:
 ; GFX1164_DPP:       ; %bb.0: ; %entry
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX1164_DPP-NEXT:    v_and_b32_e32 v7, 0x3ff, v0
+; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v1, -1, v7, s[0:1]
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v2, -1, v8, s[0:1]
-; GFX1164_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v1, -1, 0, s[0:1]
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v2, -1, v0, s[0:1]
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, -1
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, -1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v3, v1, -1, -1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v3, v1, -1, -1
 ; GFX1164_DPP-NEXT:    v_permlanex16_b32 v4, v2, -1, -1
-; GFX1164_DPP-NEXT:    v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v1, 31
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v1, 31
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s5, v2, 31
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, s5
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164_DPP-NEXT:    v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164_DPP-NEXT:    v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v1, 15
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v2, 15
-; GFX1164_DPP-NEXT:    v_readlane_b32 s5, v1, 15
-; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v2, 31
-; GFX1164_DPP-NEXT:    v_readlane_b32 s7, v1, 31
-; GFX1164_DPP-NEXT:    v_readlane_b32 s8, v1, 47
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    v_readlane_b32 s5, v2, 15
+; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v1, 31
 ; GFX1164_DPP-NEXT:    v_writelane_b32 v6, s4, 16
+; GFX1164_DPP-NEXT:    v_readlane_b32 s7, v2, 31
+; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v2, 63
 ; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s5, 16
-; GFX1164_DPP-NEXT:    v_readlane_b32 s5, v2, 63
-; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v1, 63
-; GFX1164_DPP-NEXT:    v_readlane_b32 s9, v2, 47
+; GFX1164_DPP-NEXT:    v_readlane_b32 s5, v1, 63
+; GFX1164_DPP-NEXT:    v_readlane_b32 s8, v1, 47
 ; GFX1164_DPP-NEXT:    v_writelane_b32 v6, s6, 32
+; GFX1164_DPP-NEXT:    v_readlane_b32 s9, v2, 47
 ; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s7, 32
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX1164_DPP-NEXT:    s_mov_b64 s[0:1], s[4:5]
-; GFX1164_DPP-NEXT:    v_writelane_b32 v6, s9, 48
-; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s8, 48
+; GFX1164_DPP-NEXT:    v_writelane_b32 v6, s8, 48
+; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s9, 48
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX1164_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
 ; GFX1164_DPP-NEXT:    s_mov_b32 s6, -1
-; GFX1164_DPP-NEXT:    s_mov_b64 s[4:5], exec
-; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
-; GFX1164_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
+; GFX1164_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1164_DPP-NEXT:    s_cbranch_execz .LBB16_2
 ; GFX1164_DPP-NEXT:  ; %bb.1:
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v10, s1
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v9, s0
-; GFX1164_DPP-NEXT:    ds_and_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, s1
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, s0
+; GFX1164_DPP-NEXT:    ds_and_rtn_b64 v[7:8], v0, v[7:8]
 ; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_DPP-NEXT:    buffer_gl0_inv
 ; GFX1164_DPP-NEXT:  .LBB16_2:
 ; GFX1164_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1164_DPP-NEXT:    s_load_b64 s[4:5], s[2:3], 0x24
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, v5
-; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s0, v10
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, v6
-; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s1, v9
+; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s0, v8
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, v5
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v9, v6
+; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s1, v7
 ; GFX1164_DPP-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_and_b32_e32 v8, s0, v8
-; GFX1164_DPP-NEXT:    v_and_b32_e32 v7, s1, v7
+; GFX1164_DPP-NEXT:    v_and_b32_e32 v9, s0, v9
+; GFX1164_DPP-NEXT:    v_and_b32_e32 v8, s1, v8
 ; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164_DPP-NEXT:    buffer_store_b64 v[7:8], off, s[4:7], 0
+; GFX1164_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[4:7], 0
 ; GFX1164_DPP-NEXT:    s_nop 0
 ; GFX1164_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_DPP-NEXT:    s_endpgm
 ;
 ; GFX1132_DPP-LABEL: and_i64_varying:
 ; GFX1132_DPP:       ; %bb.0: ; %entry
-; GFX1132_DPP-NEXT:    v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0
+; GFX1132_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v1, -1, v7, s4
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v2, -1, v8, s4
-; GFX1132_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v1, -1, 0, s4
+; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v2, -1, v0, s4
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v5, -1
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v6, -1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v3, v1, -1, -1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v3, v1, -1, -1
 ; GFX1132_DPP-NEXT:    v_permlanex16_b32 v4, v2, -1, -1
-; GFX1132_DPP-NEXT:    v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT:    v_readlane_b32 s0, v1, 31
-; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    v_readlane_b32 s5, v1, 15
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v1, 31
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v2, 15
-; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v2, 31
-; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    v_readlane_b32 s0, v2, 31
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    v_readlane_b32 s5, v1, 15
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT:    v_writelane_b32 v6, s6, 16
-; GFX1132_DPP-NEXT:    v_writelane_b32 v5, s5, 16
+; GFX1132_DPP-NEXT:    v_writelane_b32 v6, s5, 16
+; GFX1132_DPP-NEXT:    v_writelane_b32 v5, s6, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
 ; GFX1132_DPP-NEXT:    s_mov_b32 s6, -1
-; GFX1132_DPP-NEXT:    s_mov_b32 s4, exec_lo
-; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
-; GFX1132_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
+; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB16_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
-; GFX1132_DPP-NEXT:    v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0
-; GFX1132_DPP-NEXT:    ds_and_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1132_DPP-NEXT:    v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0
+; GFX1132_DPP-NEXT:    ds_and_rtn_b64 v[7:8], v0, v[7:8]
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_gl0_inv
 ; GFX1132_DPP-NEXT:  .LBB16_2:
 ; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1132_DPP-NEXT:    s_load_b64 s[4:5], s[2:3], 0x24
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v7, v5
-; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s0, v10
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v8, v6
-; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s1, v9
+; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s0, v8
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v8, v5
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v9, v6
+; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s1, v7
 ; GFX1132_DPP-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT:    v_and_b32_e32 v8, s0, v8
-; GFX1132_DPP-NEXT:    v_and_b32_e32 v7, s1, v7
+; GFX1132_DPP-NEXT:    v_and_b32_e32 v9, s0, v9
+; GFX1132_DPP-NEXT:    v_and_b32_e32 v8, s1, v8
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132_DPP-NEXT:    buffer_store_b64 v[7:8], off, s[4:7], 0
+; GFX1132_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[4:7], 0
 ; GFX1132_DPP-NEXT:    s_nop 0
 ; GFX1132_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_DPP-NEXT:    s_endpgm
@@ -8816,54 +8789,52 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
 ;
 ; GFX8_DPP-LABEL: or_i64_varying:
 ; GFX8_DPP:       ; %bb.0: ; %entry
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v0
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v5, exec_lo, 0
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, 0
+; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v5, exec_hi, v5
 ; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v5, s[0:1]
-; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v6, s[0:1]
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT:    v_or_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8_DPP-NEXT:    v_or_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_or_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8_DPP-NEXT:    v_or_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[0:1]
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX8_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_or_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8_DPP-NEXT:    v_or_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_or_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8_DPP-NEXT:    v_or_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_or_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT:    v_or_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_or_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT:    v_or_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT:    v_readlane_b32 s5, v4, 63
-; GFX8_DPP-NEXT:    v_readlane_b32 s4, v3, 63
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT:    v_readlane_b32 s5, v1, 63
+; GFX8_DPP-NEXT:    v_readlane_b32 s4, v2, 63
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX8_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
+; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
+; GFX8_DPP-NEXT:    ; implicit-def: $vgpr5_vgpr6
 ; GFX8_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8_DPP-NEXT:    s_cbranch_execz .LBB18_2
 ; GFX8_DPP-NEXT:  ; %bb.1:
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, s5
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, s4
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, s5
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, s4
 ; GFX8_DPP-NEXT:    s_mov_b32 m0, -1
-; GFX8_DPP-NEXT:    ds_or_rtn_b64 v[7:8], v6, v[7:8]
+; GFX8_DPP-NEXT:    ds_or_rtn_b64 v[5:6], v7, v[5:6]
 ; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8_DPP-NEXT:  .LBB18_2:
 ; GFX8_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8_DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v8
-; GFX8_DPP-NEXT:    v_readfirstlane_b32 s5, v7
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v6
+; GFX8_DPP-NEXT:    v_readfirstlane_b32 s5, v5
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8_DPP-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8_DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX8_DPP-NEXT:    v_or_b32_e32 v6, s4, v6
@@ -8874,53 +8845,51 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
 ;
 ; GFX9_DPP-LABEL: or_i64_varying:
 ; GFX9_DPP:       ; %bb.0: ; %entry
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v0
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v5, exec_lo, 0
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, 0
+; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v5, exec_hi, v5
 ; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v5, s[0:1]
-; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v6, s[0:1]
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT:    v_or_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9_DPP-NEXT:    v_or_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_or_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9_DPP-NEXT:    v_or_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[0:1]
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_or_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9_DPP-NEXT:    v_or_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_or_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9_DPP-NEXT:    v_or_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_or_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT:    v_or_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_or_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT:    v_or_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT:    v_readlane_b32 s5, v4, 63
-; GFX9_DPP-NEXT:    v_readlane_b32 s4, v3, 63
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT:    v_readlane_b32 s5, v1, 63
+; GFX9_DPP-NEXT:    v_readlane_b32 s4, v2, 63
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
+; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
+; GFX9_DPP-NEXT:    ; implicit-def: $vgpr5_vgpr6
 ; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9_DPP-NEXT:    s_cbranch_execz .LBB18_2
 ; GFX9_DPP-NEXT:  ; %bb.1:
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, s5
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, s4
-; GFX9_DPP-NEXT:    ds_or_rtn_b64 v[7:8], v6, v[7:8]
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, s5
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, s4
+; GFX9_DPP-NEXT:    ds_or_rtn_b64 v[5:6], v7, v[5:6]
 ; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9_DPP-NEXT:  .LBB18_2:
 ; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9_DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v8
-; GFX9_DPP-NEXT:    v_readfirstlane_b32 s5, v7
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v6
+; GFX9_DPP-NEXT:    v_readfirstlane_b32 s5, v5
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9_DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX9_DPP-NEXT:    v_or_b32_e32 v6, s4, v6
@@ -8931,14 +8900,12 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1064_DPP-LABEL: or_i64_varying:
 ; GFX1064_DPP:       ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v0
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v7, s[0:1]
-; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v8, s[0:1]
-; GFX1064_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[0:1]
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1064_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1064_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1064_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1064_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -8959,63 +8926,62 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v2, 15
-; GFX1064_DPP-NEXT:    v_readlane_b32 s5, v1, 15
-; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v2, 31
-; GFX1064_DPP-NEXT:    v_readlane_b32 s7, v1, 31
-; GFX1064_DPP-NEXT:    v_readlane_b32 s8, v1, 47
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v1, 15
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_readlane_b32 s5, v2, 15
+; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v1, 31
+; GFX1064_DPP-NEXT:    v_readlane_b32 s7, v2, 31
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v6, s4, 16
+; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v2, 63
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v5, s5, 16
-; GFX1064_DPP-NEXT:    v_readlane_b32 s5, v2, 63
-; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1064_DPP-NEXT:    v_readlane_b32 s5, v1, 63
+; GFX1064_DPP-NEXT:    v_readlane_b32 s8, v1, 47
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s9, v2, 47
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v6, s6, 32
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v5, s7, 32
 ; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX1064_DPP-NEXT:    s_mov_b64 s[0:1], s[4:5]
-; GFX1064_DPP-NEXT:    v_writelane_b32 v6, s9, 48
-; GFX1064_DPP-NEXT:    v_writelane_b32 v5, s8, 48
+; GFX1064_DPP-NEXT:    v_writelane_b32 v6, s8, 48
+; GFX1064_DPP-NEXT:    v_writelane_b32 v5, s9, 48
 ; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
 ; GFX1064_DPP-NEXT:    s_mov_b32 s6, -1
-; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
+; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
 ; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB18_2
 ; GFX1064_DPP-NEXT:  ; %bb.1:
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v10, s1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, s0
-; GFX1064_DPP-NEXT:    ds_or_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, s1
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, s0
+; GFX1064_DPP-NEXT:    ds_or_rtn_b64 v[7:8], v0, v[7:8]
 ; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064_DPP-NEXT:    buffer_gl0_inv
 ; GFX1064_DPP-NEXT:  .LBB18_2:
 ; GFX1064_DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064_DPP-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v5
-; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s0, v10
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, v6
-; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s1, v9
+; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s0, v8
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, v5
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, v6
+; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s1, v7
 ; GFX1064_DPP-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1064_DPP-NEXT:    v_or_b32_e32 v8, s0, v8
-; GFX1064_DPP-NEXT:    v_or_b32_e32 v7, s1, v7
+; GFX1064_DPP-NEXT:    v_or_b32_e32 v9, s0, v9
+; GFX1064_DPP-NEXT:    v_or_b32_e32 v8, s1, v8
 ; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[4:7], 0
+; GFX1064_DPP-NEXT:    buffer_store_dwordx2 v[8:9], off, s[4:7], 0
 ; GFX1064_DPP-NEXT:    s_endpgm
 ;
 ; GFX1032_DPP-LABEL: or_i64_varying:
 ; GFX1032_DPP:       ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v0
 ; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s4, -1
-; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v7, s4
-; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v8, s4
-; GFX1032_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s4
+; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s4
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1032_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1032_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1032_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1032_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -9027,199 +8993,201 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    v_permlanex16_b32 v4, v2, -1, -1
 ; GFX1032_DPP-NEXT:    v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032_DPP-NEXT:    v_readlane_b32 s0, v1, 31
-; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT:    v_readlane_b32 s5, v1, 15
+; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v1, 31
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s6, v2, 15
-; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v2, 31
-; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_readlane_b32 s0, v2, 31
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_readlane_b32 s5, v1, 15
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s4, -1
-; GFX1032_DPP-NEXT:    v_writelane_b32 v6, s6, 16
-; GFX1032_DPP-NEXT:    v_writelane_b32 v5, s5, 16
+; GFX1032_DPP-NEXT:    v_writelane_b32 v6, s5, 16
+; GFX1032_DPP-NEXT:    v_writelane_b32 v5, s6, 16
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
 ; GFX1032_DPP-NEXT:    s_mov_b32 s6, -1
-; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
+; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
 ; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB18_2
 ; GFX1032_DPP-NEXT:  ; %bb.1:
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v10, s1
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, s0
-; GFX1032_DPP-NEXT:    ds_or_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, s1
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, s0
+; GFX1032_DPP-NEXT:    ds_or_rtn_b64 v[7:8], v0, v[7:8]
 ; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032_DPP-NEXT:    buffer_gl0_inv
 ; GFX1032_DPP-NEXT:  .LBB18_2:
 ; GFX1032_DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1032_DPP-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v5
-; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s0, v10
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, v6
-; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s1, v9
+; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s0, v8
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, v5
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, v6
+; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s1, v7
 ; GFX1032_DPP-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1032_DPP-NEXT:    v_or_b32_e32 v8, s0, v8
-; GFX1032_DPP-NEXT:    v_or_b32_e32 v7, s1, v7
+; GFX1032_DPP-NEXT:    v_or_b32_e32 v9, s0, v9
+; GFX1032_DPP-NEXT:    v_or_b32_e32 v8, s1, v8
 ; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[4:7], 0
+; GFX1032_DPP-NEXT:    buffer_store_dwordx2 v[8:9], off, s[4:7], 0
 ; GFX1032_DPP-NEXT:    s_endpgm
 ;
 ; GFX1164_DPP-LABEL: or_i64_varying:
 ; GFX1164_DPP:       ; %bb.0: ; %entry
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX1164_DPP-NEXT:    v_and_b32_e32 v7, 0x3ff, v0
+; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v7, s[0:1]
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v8, s[0:1]
-; GFX1164_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[0:1]
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v3, v1, -1, -1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v3, v1, -1, -1
 ; GFX1164_DPP-NEXT:    v_permlanex16_b32 v4, v2, -1, -1
-; GFX1164_DPP-NEXT:    v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v1, 31
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v1, 31
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s5, v2, 31
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, s5
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164_DPP-NEXT:    v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164_DPP-NEXT:    v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v1, 15
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v2, 15
-; GFX1164_DPP-NEXT:    v_readlane_b32 s5, v1, 15
-; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v2, 31
-; GFX1164_DPP-NEXT:    v_readlane_b32 s7, v1, 31
-; GFX1164_DPP-NEXT:    v_readlane_b32 s8, v1, 47
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    v_readlane_b32 s5, v2, 15
+; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v1, 31
 ; GFX1164_DPP-NEXT:    v_writelane_b32 v6, s4, 16
+; GFX1164_DPP-NEXT:    v_readlane_b32 s7, v2, 31
+; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v2, 63
 ; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s5, 16
-; GFX1164_DPP-NEXT:    v_readlane_b32 s5, v2, 63
-; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v1, 63
-; GFX1164_DPP-NEXT:    v_readlane_b32 s9, v2, 47
+; GFX1164_DPP-NEXT:    v_readlane_b32 s5, v1, 63
+; GFX1164_DPP-NEXT:    v_readlane_b32 s8, v1, 47
 ; GFX1164_DPP-NEXT:    v_writelane_b32 v6, s6, 32
+; GFX1164_DPP-NEXT:    v_readlane_b32 s9, v2, 47
 ; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s7, 32
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX1164_DPP-NEXT:    s_mov_b64 s[0:1], s[4:5]
-; GFX1164_DPP-NEXT:    v_writelane_b32 v6, s9, 48
-; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s8, 48
+; GFX1164_DPP-NEXT:    v_writelane_b32 v6, s8, 48
+; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s9, 48
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX1164_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
 ; GFX1164_DPP-NEXT:    s_mov_b32 s6, -1
-; GFX1164_DPP-NEXT:    s_mov_b64 s[4:5], exec
-; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
-; GFX1164_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
+; GFX1164_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1164_DPP-NEXT:    s_cbranch_execz .LBB18_2
 ; GFX1164_DPP-NEXT:  ; %bb.1:
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v10, s1
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v9, s0
-; GFX1164_DPP-NEXT:    ds_or_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, s1
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, s0
+; GFX1164_DPP-NEXT:    ds_or_rtn_b64 v[7:8], v0, v[7:8]
 ; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_DPP-NEXT:    buffer_gl0_inv
 ; GFX1164_DPP-NEXT:  .LBB18_2:
 ; GFX1164_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1164_DPP-NEXT:    s_load_b64 s[4:5], s[2:3], 0x24
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, v5
-; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s0, v10
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, v6
-; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s1, v9
+; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s0, v8
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, v5
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v9, v6
+; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s1, v7
 ; GFX1164_DPP-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_or_b32_e32 v8, s0, v8
-; GFX1164_DPP-NEXT:    v_or_b32_e32 v7, s1, v7
+; GFX1164_DPP-NEXT:    v_or_b32_e32 v9, s0, v9
+; GFX1164_DPP-NEXT:    v_or_b32_e32 v8, s1, v8
 ; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164_DPP-NEXT:    buffer_store_b64 v[7:8], off, s[4:7], 0
+; GFX1164_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[4:7], 0
 ; GFX1164_DPP-NEXT:    s_nop 0
 ; GFX1164_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_DPP-NEXT:    s_endpgm
 ;
 ; GFX1132_DPP-LABEL: or_i64_varying:
 ; GFX1132_DPP:       ; %bb.0: ; %entry
-; GFX1132_DPP-NEXT:    v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0
+; GFX1132_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v7, s4
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v8, s4
-; GFX1132_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s4
+; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s4
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v3, v1, -1, -1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v3, v1, -1, -1
 ; GFX1132_DPP-NEXT:    v_permlanex16_b32 v4, v2, -1, -1
-; GFX1132_DPP-NEXT:    v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT:    v_readlane_b32 s0, v1, 31
-; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    v_readlane_b32 s5, v1, 15
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v1, 31
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v2, 15
-; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v2, 31
-; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    v_readlane_b32 s0, v2, 31
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    v_readlane_b32 s5, v1, 15
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT:    v_writelane_b32 v6, s6, 16
-; GFX1132_DPP-NEXT:    v_writelane_b32 v5, s5, 16
+; GFX1132_DPP-NEXT:    v_writelane_b32 v6, s5, 16
+; GFX1132_DPP-NEXT:    v_writelane_b32 v5, s6, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
 ; GFX1132_DPP-NEXT:    s_mov_b32 s6, -1
-; GFX1132_DPP-NEXT:    s_mov_b32 s4, exec_lo
-; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
-; GFX1132_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
+; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB18_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
-; GFX1132_DPP-NEXT:    v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0
-; GFX1132_DPP-NEXT:    ds_or_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1132_DPP-NEXT:    v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0
+; GFX1132_DPP-NEXT:    ds_or_rtn_b64 v[7:8], v0, v[7:8]
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_gl0_inv
 ; GFX1132_DPP-NEXT:  .LBB18_2:
 ; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1132_DPP-NEXT:    s_load_b64 s[4:5], s[2:3], 0x24
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v7, v5
-; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s0, v10
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v8, v6
-; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s1, v9
+; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s0, v8
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v8, v5
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v9, v6
+; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s1, v7
 ; GFX1132_DPP-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT:    v_or_b32_e32 v8, s0, v8
-; GFX1132_DPP-NEXT:    v_or_b32_e32 v7, s1, v7
+; GFX1132_DPP-NEXT:    v_or_b32_e32 v9, s0, v9
+; GFX1132_DPP-NEXT:    v_or_b32_e32 v8, s1, v8
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132_DPP-NEXT:    buffer_store_b64 v[7:8], off, s[4:7], 0
+; GFX1132_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[4:7], 0
 ; GFX1132_DPP-NEXT:    s_nop 0
 ; GFX1132_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_DPP-NEXT:    s_endpgm
@@ -10200,54 +10168,52 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
 ;
 ; GFX8_DPP-LABEL: xor_i64_varying:
 ; GFX8_DPP:       ; %bb.0: ; %entry
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v0
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v5, exec_lo, 0
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, 0
+; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v5, exec_hi, v5
 ; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v5, s[0:1]
-; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v6, s[0:1]
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT:    v_xor_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8_DPP-NEXT:    v_xor_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_xor_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8_DPP-NEXT:    v_xor_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[0:1]
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX8_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_xor_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8_DPP-NEXT:    v_xor_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_xor_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8_DPP-NEXT:    v_xor_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_xor_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT:    v_xor_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_xor_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT:    v_xor_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT:    v_readlane_b32 s5, v4, 63
-; GFX8_DPP-NEXT:    v_readlane_b32 s4, v3, 63
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT:    v_readlane_b32 s5, v1, 63
+; GFX8_DPP-NEXT:    v_readlane_b32 s4, v2, 63
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX8_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
+; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
+; GFX8_DPP-NEXT:    ; implicit-def: $vgpr5_vgpr6
 ; GFX8_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8_DPP-NEXT:    s_cbranch_execz .LBB20_2
 ; GFX8_DPP-NEXT:  ; %bb.1:
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, s5
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, s4
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, s5
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, s4
 ; GFX8_DPP-NEXT:    s_mov_b32 m0, -1
-; GFX8_DPP-NEXT:    ds_xor_rtn_b64 v[7:8], v6, v[7:8]
+; GFX8_DPP-NEXT:    ds_xor_rtn_b64 v[5:6], v7, v[5:6]
 ; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8_DPP-NEXT:  .LBB20_2:
 ; GFX8_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8_DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v8
-; GFX8_DPP-NEXT:    v_readfirstlane_b32 s5, v7
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v6
+; GFX8_DPP-NEXT:    v_readfirstlane_b32 s5, v5
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8_DPP-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8_DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX8_DPP-NEXT:    v_xor_b32_e32 v6, s4, v6
@@ -10258,53 +10224,51 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
 ;
 ; GFX9_DPP-LABEL: xor_i64_varying:
 ; GFX9_DPP:       ; %bb.0: ; %entry
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v0
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v5, exec_lo, 0
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, 0
+; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v5, exec_hi, v5
 ; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v5, s[0:1]
-; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v6, s[0:1]
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT:    v_xor_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9_DPP-NEXT:    v_xor_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_xor_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9_DPP-NEXT:    v_xor_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[0:1]
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_xor_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9_DPP-NEXT:    v_xor_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_xor_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9_DPP-NEXT:    v_xor_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_xor_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT:    v_xor_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_xor_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT:    v_xor_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT:    v_readlane_b32 s5, v4, 63
-; GFX9_DPP-NEXT:    v_readlane_b32 s4, v3, 63
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT:    v_readlane_b32 s5, v1, 63
+; GFX9_DPP-NEXT:    v_readlane_b32 s4, v2, 63
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
+; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
+; GFX9_DPP-NEXT:    ; implicit-def: $vgpr5_vgpr6
 ; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9_DPP-NEXT:    s_cbranch_execz .LBB20_2
 ; GFX9_DPP-NEXT:  ; %bb.1:
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, s5
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, s4
-; GFX9_DPP-NEXT:    ds_xor_rtn_b64 v[7:8], v6, v[7:8]
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, s5
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, s4
+; GFX9_DPP-NEXT:    ds_xor_rtn_b64 v[5:6], v7, v[5:6]
 ; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9_DPP-NEXT:  .LBB20_2:
 ; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9_DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v8
-; GFX9_DPP-NEXT:    v_readfirstlane_b32 s5, v7
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v6
+; GFX9_DPP-NEXT:    v_readfirstlane_b32 s5, v5
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9_DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX9_DPP-NEXT:    v_xor_b32_e32 v6, s4, v6
@@ -10315,14 +10279,12 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1064_DPP-LABEL: xor_i64_varying:
 ; GFX1064_DPP:       ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v0
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v7, s[0:1]
-; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v8, s[0:1]
-; GFX1064_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[0:1]
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1064_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1064_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1064_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1064_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -10343,63 +10305,62 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v2, 15
-; GFX1064_DPP-NEXT:    v_readlane_b32 s5, v1, 15
-; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v2, 31
-; GFX1064_DPP-NEXT:    v_readlane_b32 s7, v1, 31
-; GFX1064_DPP-NEXT:    v_readlane_b32 s8, v1, 47
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v1, 15
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_readlane_b32 s5, v2, 15
+; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v1, 31
+; GFX1064_DPP-NEXT:    v_readlane_b32 s7, v2, 31
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v6, s4, 16
+; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v2, 63
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v5, s5, 16
-; GFX1064_DPP-NEXT:    v_readlane_b32 s5, v2, 63
-; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX1064_DPP-NEXT:    v_readlane_b32 s5, v1, 63
+; GFX1064_DPP-NEXT:    v_readlane_b32 s8, v1, 47
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s9, v2, 47
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v6, s6, 32
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v5, s7, 32
 ; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX1064_DPP-NEXT:    s_mov_b64 s[0:1], s[4:5]
-; GFX1064_DPP-NEXT:    v_writelane_b32 v6, s9, 48
-; GFX1064_DPP-NEXT:    v_writelane_b32 v5, s8, 48
+; GFX1064_DPP-NEXT:    v_writelane_b32 v6, s8, 48
+; GFX1064_DPP-NEXT:    v_writelane_b32 v5, s9, 48
 ; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
 ; GFX1064_DPP-NEXT:    s_mov_b32 s6, -1
-; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
+; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
 ; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB20_2
 ; GFX1064_DPP-NEXT:  ; %bb.1:
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v10, s1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, s0
-; GFX1064_DPP-NEXT:    ds_xor_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, s1
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, s0
+; GFX1064_DPP-NEXT:    ds_xor_rtn_b64 v[7:8], v0, v[7:8]
 ; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064_DPP-NEXT:    buffer_gl0_inv
 ; GFX1064_DPP-NEXT:  .LBB20_2:
 ; GFX1064_DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064_DPP-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v5
-; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s0, v10
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, v6
-; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s1, v9
+; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s0, v8
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, v5
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, v6
+; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s1, v7
 ; GFX1064_DPP-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1064_DPP-NEXT:    v_xor_b32_e32 v8, s0, v8
-; GFX1064_DPP-NEXT:    v_xor_b32_e32 v7, s1, v7
+; GFX1064_DPP-NEXT:    v_xor_b32_e32 v9, s0, v9
+; GFX1064_DPP-NEXT:    v_xor_b32_e32 v8, s1, v8
 ; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[4:7], 0
+; GFX1064_DPP-NEXT:    buffer_store_dwordx2 v[8:9], off, s[4:7], 0
 ; GFX1064_DPP-NEXT:    s_endpgm
 ;
 ; GFX1032_DPP-LABEL: xor_i64_varying:
 ; GFX1032_DPP:       ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v0
 ; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s4, -1
-; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v7, s4
-; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v8, s4
-; GFX1032_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s4
+; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s4
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1032_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1032_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1032_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1032_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -10411,199 +10372,201 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    v_permlanex16_b32 v4, v2, -1, -1
 ; GFX1032_DPP-NEXT:    v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032_DPP-NEXT:    v_readlane_b32 s0, v1, 31
-; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT:    v_readlane_b32 s5, v1, 15
+; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v1, 31
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s6, v2, 15
-; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v2, 31
-; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_readlane_b32 s0, v2, 31
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_readlane_b32 s5, v1, 15
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s4, -1
-; GFX1032_DPP-NEXT:    v_writelane_b32 v6, s6, 16
-; GFX1032_DPP-NEXT:    v_writelane_b32 v5, s5, 16
+; GFX1032_DPP-NEXT:    v_writelane_b32 v6, s5, 16
+; GFX1032_DPP-NEXT:    v_writelane_b32 v5, s6, 16
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
 ; GFX1032_DPP-NEXT:    s_mov_b32 s6, -1
-; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
+; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
 ; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB20_2
 ; GFX1032_DPP-NEXT:  ; %bb.1:
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v10, s1
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, s0
-; GFX1032_DPP-NEXT:    ds_xor_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, s1
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, s0
+; GFX1032_DPP-NEXT:    ds_xor_rtn_b64 v[7:8], v0, v[7:8]
 ; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032_DPP-NEXT:    buffer_gl0_inv
 ; GFX1032_DPP-NEXT:  .LBB20_2:
 ; GFX1032_DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1032_DPP-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v5
-; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s0, v10
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, v6
-; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s1, v9
+; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s0, v8
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, v5
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, v6
+; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s1, v7
 ; GFX1032_DPP-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1032_DPP-NEXT:    v_xor_b32_e32 v8, s0, v8
-; GFX1032_DPP-NEXT:    v_xor_b32_e32 v7, s1, v7
+; GFX1032_DPP-NEXT:    v_xor_b32_e32 v9, s0, v9
+; GFX1032_DPP-NEXT:    v_xor_b32_e32 v8, s1, v8
 ; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[4:7], 0
+; GFX1032_DPP-NEXT:    buffer_store_dwordx2 v[8:9], off, s[4:7], 0
 ; GFX1032_DPP-NEXT:    s_endpgm
 ;
 ; GFX1164_DPP-LABEL: xor_i64_varying:
 ; GFX1164_DPP:       ; %bb.0: ; %entry
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX1164_DPP-NEXT:    v_and_b32_e32 v7, 0x3ff, v0
+; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v7, s[0:1]
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v8, s[0:1]
-; GFX1164_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[0:1]
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v3, v1, -1, -1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v3, v1, -1, -1
 ; GFX1164_DPP-NEXT:    v_permlanex16_b32 v4, v2, -1, -1
-; GFX1164_DPP-NEXT:    v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v1, 31
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v1, 31
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s5, v2, 31
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, s5
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164_DPP-NEXT:    v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164_DPP-NEXT:    v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v1, 15
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v2, 15
-; GFX1164_DPP-NEXT:    v_readlane_b32 s5, v1, 15
-; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v2, 31
-; GFX1164_DPP-NEXT:    v_readlane_b32 s7, v1, 31
-; GFX1164_DPP-NEXT:    v_readlane_b32 s8, v1, 47
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    v_readlane_b32 s5, v2, 15
+; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v1, 31
 ; GFX1164_DPP-NEXT:    v_writelane_b32 v6, s4, 16
+; GFX1164_DPP-NEXT:    v_readlane_b32 s7, v2, 31
+; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v2, 63
 ; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s5, 16
-; GFX1164_DPP-NEXT:    v_readlane_b32 s5, v2, 63
-; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v1, 63
-; GFX1164_DPP-NEXT:    v_readlane_b32 s9, v2, 47
+; GFX1164_DPP-NEXT:    v_readlane_b32 s5, v1, 63
+; GFX1164_DPP-NEXT:    v_readlane_b32 s8, v1, 47
 ; GFX1164_DPP-NEXT:    v_writelane_b32 v6, s6, 32
+; GFX1164_DPP-NEXT:    v_readlane_b32 s9, v2, 47
 ; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s7, 32
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX1164_DPP-NEXT:    s_mov_b64 s[0:1], s[4:5]
-; GFX1164_DPP-NEXT:    v_writelane_b32 v6, s9, 48
-; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s8, 48
+; GFX1164_DPP-NEXT:    v_writelane_b32 v6, s8, 48
+; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s9, 48
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX1164_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
 ; GFX1164_DPP-NEXT:    s_mov_b32 s6, -1
-; GFX1164_DPP-NEXT:    s_mov_b64 s[4:5], exec
-; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
-; GFX1164_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
+; GFX1164_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1164_DPP-NEXT:    s_cbranch_execz .LBB20_2
 ; GFX1164_DPP-NEXT:  ; %bb.1:
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v10, s1
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v9, s0
-; GFX1164_DPP-NEXT:    ds_xor_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, s1
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, s0
+; GFX1164_DPP-NEXT:    ds_xor_rtn_b64 v[7:8], v0, v[7:8]
 ; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_DPP-NEXT:    buffer_gl0_inv
 ; GFX1164_DPP-NEXT:  .LBB20_2:
 ; GFX1164_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1164_DPP-NEXT:    s_load_b64 s[4:5], s[2:3], 0x24
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, v5
-; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s0, v10
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, v6
-; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s1, v9
+; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s0, v8
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, v5
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v9, v6
+; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s1, v7
 ; GFX1164_DPP-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_xor_b32_e32 v8, s0, v8
-; GFX1164_DPP-NEXT:    v_xor_b32_e32 v7, s1, v7
+; GFX1164_DPP-NEXT:    v_xor_b32_e32 v9, s0, v9
+; GFX1164_DPP-NEXT:    v_xor_b32_e32 v8, s1, v8
 ; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164_DPP-NEXT:    buffer_store_b64 v[7:8], off, s[4:7], 0
+; GFX1164_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[4:7], 0
 ; GFX1164_DPP-NEXT:    s_nop 0
 ; GFX1164_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_DPP-NEXT:    s_endpgm
 ;
 ; GFX1132_DPP-LABEL: xor_i64_varying:
 ; GFX1132_DPP:       ; %bb.0: ; %entry
-; GFX1132_DPP-NEXT:    v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0
+; GFX1132_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v7, s4
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v8, s4
-; GFX1132_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s4
+; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s4
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v3, v1, -1, -1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v3, v1, -1, -1
 ; GFX1132_DPP-NEXT:    v_permlanex16_b32 v4, v2, -1, -1
-; GFX1132_DPP-NEXT:    v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT:    v_readlane_b32 s0, v1, 31
-; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    v_readlane_b32 s5, v1, 15
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v1, 31
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v2, 15
-; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v2, 31
-; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    v_readlane_b32 s0, v2, 31
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    v_readlane_b32 s5, v1, 15
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT:    v_writelane_b32 v6, s6, 16
-; GFX1132_DPP-NEXT:    v_writelane_b32 v5, s5, 16
+; GFX1132_DPP-NEXT:    v_writelane_b32 v6, s5, 16
+; GFX1132_DPP-NEXT:    v_writelane_b32 v5, s6, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
 ; GFX1132_DPP-NEXT:    s_mov_b32 s6, -1
-; GFX1132_DPP-NEXT:    s_mov_b32 s4, exec_lo
-; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
-; GFX1132_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
+; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB20_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
-; GFX1132_DPP-NEXT:    v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0
-; GFX1132_DPP-NEXT:    ds_xor_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1132_DPP-NEXT:    v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0
+; GFX1132_DPP-NEXT:    ds_xor_rtn_b64 v[7:8], v0, v[7:8]
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_gl0_inv
 ; GFX1132_DPP-NEXT:  .LBB20_2:
 ; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1132_DPP-NEXT:    s_load_b64 s[4:5], s[2:3], 0x24
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v7, v5
-; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s0, v10
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v8, v6
-; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s1, v9
+; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s0, v8
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v8, v5
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v9, v6
+; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s1, v7
 ; GFX1132_DPP-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT:    v_xor_b32_e32 v8, s0, v8
-; GFX1132_DPP-NEXT:    v_xor_b32_e32 v7, s1, v7
+; GFX1132_DPP-NEXT:    v_xor_b32_e32 v9, s0, v9
+; GFX1132_DPP-NEXT:    v_xor_b32_e32 v8, s1, v8
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132_DPP-NEXT:    buffer_store_b64 v[7:8], off, s[4:7], 0
+; GFX1132_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[4:7], 0
 ; GFX1132_DPP-NEXT:    s_nop 0
 ; GFX1132_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_DPP-NEXT:    s_endpgm
@@ -10934,11 +10897,8 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
 ; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
 ; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX8_DPP-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX8_DPP-NEXT:    v_bfrev_b32_e32 v2, 1
-; GFX8_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX8_DPP-NEXT:    s_mov_b64 exec, -1
-; GFX8_DPP-NEXT:    s_nop 0
+; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; GFX8_DPP-NEXT:    s_nop 1
 ; GFX8_DPP-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX8_DPP-NEXT:    s_nop 1
 ; GFX8_DPP-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -10982,11 +10942,8 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
 ; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
 ; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX9_DPP-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX9_DPP-NEXT:    v_bfrev_b32_e32 v2, 1
-; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX9_DPP-NEXT:    s_mov_b64 exec, -1
-; GFX9_DPP-NEXT:    s_nop 0
+; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; GFX9_DPP-NEXT:    s_nop 1
 ; GFX9_DPP-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX9_DPP-NEXT:    s_nop 1
 ; GFX9_DPP-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -11885,20 +11842,16 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX8_DPP-LABEL: max_i64_varying:
 ; GFX8_DPP:       ; %bb.0: ; %entry
 ; GFX8_DPP-NEXT:    s_mov_b32 s0, 0
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, v0
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, 0
+; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX8_DPP-NEXT:    s_brev_b32 s1, 1
-; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v7
 ; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v1, s0
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX8_DPP-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v1, v7
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v2, v8
-; GFX8_DPP-NEXT:    s_mov_b64 exec, -1
 ; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, s1
+; GFX8_DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, s0
+; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v2, v1, 0, s[4:5]
+; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[4:5]
 ; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8_DPP-NEXT:    s_nop 0
@@ -11953,22 +11906,22 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX8_DPP-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX8_DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
+; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX8_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
 ; GFX8_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8_DPP-NEXT:    s_cbranch_execz .LBB23_2
 ; GFX8_DPP-NEXT:  ; %bb.1:
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v10, s5
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v9, s4
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, s5
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, s4
 ; GFX8_DPP-NEXT:    s_mov_b32 m0, -1
-; GFX8_DPP-NEXT:    ds_max_rtn_i64 v[9:10], v8, v[9:10]
+; GFX8_DPP-NEXT:    ds_max_rtn_i64 v[7:8], v9, v[7:8]
 ; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8_DPP-NEXT:  .LBB23_2:
 ; GFX8_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX8_DPP-NEXT:    v_readfirstlane_b32 s5, v8
+; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v7
 ; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, v3
 ; GFX8_DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX8_DPP-NEXT:    v_readfirstlane_b32 s5, v10
-; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v9
 ; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX8_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[7:8]
 ; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s5
@@ -11984,20 +11937,16 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_DPP-LABEL: max_i64_varying:
 ; GFX9_DPP:       ; %bb.0: ; %entry
 ; GFX9_DPP-NEXT:    s_mov_b32 s0, 0
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v0
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, 0
+; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX9_DPP-NEXT:    s_brev_b32 s1, 1
-; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v7
 ; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9_DPP-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v1, v7
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v2, v8
-; GFX9_DPP-NEXT:    s_mov_b64 exec, -1
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, s1
+; GFX9_DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, s0
+; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v2, v1, 0, s[4:5]
+; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[4:5]
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX9_DPP-NEXT:    s_nop 0
@@ -12052,21 +12001,21 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX9_DPP-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9_DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
+; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX9_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
 ; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9_DPP-NEXT:    s_cbranch_execz .LBB23_2
 ; GFX9_DPP-NEXT:  ; %bb.1:
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v10, s5
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v9, s4
-; GFX9_DPP-NEXT:    ds_max_rtn_i64 v[9:10], v8, v[9:10]
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, s5
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, s4
+; GFX9_DPP-NEXT:    ds_max_rtn_i64 v[7:8], v9, v[7:8]
 ; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9_DPP-NEXT:  .LBB23_2:
 ; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9_DPP-NEXT:    v_readfirstlane_b32 s5, v8
+; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v7
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v3
 ; GFX9_DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX9_DPP-NEXT:    v_readfirstlane_b32 s5, v10
-; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v9
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX9_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[7:8]
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s5
@@ -12081,15 +12030,13 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1064_DPP-LABEL: max_i64_varying:
 ; GFX1064_DPP:       ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v10, 0
+; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064_DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1064_DPP-NEXT:    s_brev_b32 s5, 1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, v0
-; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, s5
-; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v3, s4, v9, s[0:1]
-; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v4, s5, v10, s[0:1]
+; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v4, 0x80000000, 0, s[0:1]
+; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s[0:1]
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v1
@@ -12156,30 +12103,31 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v2, s6, 32
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v1, s7, 32
 ; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v9, exec_hi, v0
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX1064_DPP-NEXT:    s_mov_b64 s[0:1], s[4:5]
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v2, s8, 48
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v1, s9, 48
 ; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
 ; GFX1064_DPP-NEXT:    s_mov_b32 s6, -1
-; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr11_vgpr12
+; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
 ; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB23_2
 ; GFX1064_DPP-NEXT:  ; %bb.1:
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v12, s1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v11, s0
-; GFX1064_DPP-NEXT:    ds_max_rtn_i64 v[11:12], v10, v[11:12]
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v10, s1
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, s0
+; GFX1064_DPP-NEXT:    ds_max_rtn_i64 v[9:10], v0, v[9:10]
 ; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064_DPP-NEXT:    buffer_gl0_inv
 ; GFX1064_DPP-NEXT:  .LBB23_2:
 ; GFX1064_DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064_DPP-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s1, v10
+; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s0, v9
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, v1
-; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s1, v12
-; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s0, v11
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v10, v2
 ; GFX1064_DPP-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1064_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, s[0:1], v[9:10]
@@ -12191,15 +12139,13 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1032_DPP-LABEL: max_i64_varying:
 ; GFX1032_DPP:       ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v10, 0
+; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX1032_DPP-NEXT:    s_mov_b32 s0, 0
 ; GFX1032_DPP-NEXT:    s_brev_b32 s1, 1
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, v0
-; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v4, 0x80000000, 0, s4
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v1, s0
-; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v3, s0, v9, s4
-; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v4, s1, v10, s4
+; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s4
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v1
@@ -12244,29 +12190,30 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s5, v4, 15
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v2, s5, 16
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v1, s6, 16
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v9
 ; GFX1032_DPP-NEXT:    s_mov_b32 s6, -1
-; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr11_vgpr12
+; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
 ; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB23_2
 ; GFX1032_DPP-NEXT:  ; %bb.1:
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v12, s1
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v11, s0
-; GFX1032_DPP-NEXT:    ds_max_rtn_i64 v[11:12], v10, v[11:12]
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v10, s1
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, s0
+; GFX1032_DPP-NEXT:    ds_max_rtn_i64 v[9:10], v0, v[9:10]
 ; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032_DPP-NEXT:    buffer_gl0_inv
 ; GFX1032_DPP-NEXT:  .LBB23_2:
 ; GFX1032_DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1032_DPP-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s1, v10
+; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s0, v9
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, v1
-; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s1, v12
-; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s0, v11
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v10, v2
 ; GFX1032_DPP-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1032_DPP-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[9:10]
@@ -12278,79 +12225,79 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1164_DPP-LABEL: max_i64_varying:
 ; GFX1164_DPP:       ; %bb.0: ; %entry
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v10, 0
-; GFX1164_DPP-NEXT:    s_mov_b32 s0, 0
-; GFX1164_DPP-NEXT:    s_brev_b32 s1, 1
-; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v1, s0
-; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX1164_DPP-NEXT:    v_and_b32_e32 v9, 0x3ff, v0
-; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v5, s0, v9, s[4:5]
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v6, s1, v10, s[4:5]
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, v2
+; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT:    s_mov_b32 s4, 0
+; GFX1164_DPP-NEXT:    s_brev_b32 s5, 1
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v4, 0x80000000, 0, s[0:1]
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s[0:1]
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[5:6], v[3:4]
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
 ; GFX1164_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8]
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, v2
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
 ; GFX1164_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1164_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8]
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
 ; GFX1164_DPP-NEXT:    v_permlanex16_b32 v8, v3, -1, -1
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
 ; GFX1164_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT:    v_readlane_b32 s0, v4, 31
-; GFX1164_DPP-NEXT:    v_readlane_b32 s1, v3, 31
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, s0
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, s1
+; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v4, 31
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_readlane_b32 s5, v3, 31
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, s4
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, s5
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
 ; GFX1164_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -12369,29 +12316,30 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_writelane_b32 v1, s7, 32
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v9, exec_hi, v0
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX1164_DPP-NEXT:    s_mov_b64 s[0:1], s[4:5]
 ; GFX1164_DPP-NEXT:    v_writelane_b32 v2, s8, 48
 ; GFX1164_DPP-NEXT:    v_writelane_b32 v1, s9, 48
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX1164_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
 ; GFX1164_DPP-NEXT:    s_mov_b32 s6, -1
-; GFX1164_DPP-NEXT:    s_mov_b64 s[4:5], exec
-; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr11_vgpr12
-; GFX1164_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
+; GFX1164_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1164_DPP-NEXT:    s_cbranch_execz .LBB23_2
 ; GFX1164_DPP-NEXT:  ; %bb.1:
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v12, s1
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v11, s0
-; GFX1164_DPP-NEXT:    ds_max_rtn_i64 v[11:12], v10, v[11:12]
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v10, s1
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v9, s0
+; GFX1164_DPP-NEXT:    ds_max_rtn_i64 v[9:10], v0, v[9:10]
 ; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_DPP-NEXT:    buffer_gl0_inv
 ; GFX1164_DPP-NEXT:  .LBB23_2:
 ; GFX1164_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1164_DPP-NEXT:    s_load_b64 s[4:5], s[2:3], 0x24
+; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s1, v10
+; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s0, v9
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v9, v1
-; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s1, v12
-; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s0, v11
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v10, v2
 ; GFX1164_DPP-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -12406,55 +12354,54 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1132_DPP-LABEL: max_i64_varying:
 ; GFX1132_DPP:       ; %bb.0: ; %entry
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v10, 0
+; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX1132_DPP-NEXT:    s_mov_b32 s0, 0
 ; GFX1132_DPP-NEXT:    s_brev_b32 s1, 1
-; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT:    v_and_b32_e32 v9, 0x3ff, v0
+; GFX1132_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v5, s0, v9, s4
-; GFX1132_DPP-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2
-; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v6, s1, v10, s4
+; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v4, 0x80000000, 0, s4
+; GFX1132_DPP-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s4
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT:    v_cmp_gt_i64_e32 vcc_lo, v[5:6], v[3:4]
-; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6]
+; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8]
 ; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6]
 ; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8]
 ; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v8, v3, -1, -1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v8, v3, -1, -1
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6]
 ; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v4, 31
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v3, 15
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s0, v3, 31
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -12462,27 +12409,28 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s4
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v2, s5, 16
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v1, s6, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v9
 ; GFX1132_DPP-NEXT:    s_mov_b32 s6, -1
-; GFX1132_DPP-NEXT:    s_mov_b32 s4, exec_lo
-; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr11_vgpr12
-; GFX1132_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
+; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB23_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
-; GFX1132_DPP-NEXT:    v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0
-; GFX1132_DPP-NEXT:    ds_max_rtn_i64 v[11:12], v10, v[11:12]
+; GFX1132_DPP-NEXT:    v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0
+; GFX1132_DPP-NEXT:    ds_max_rtn_i64 v[9:10], v0, v[9:10]
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_gl0_inv
 ; GFX1132_DPP-NEXT:  .LBB23_2:
 ; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1132_DPP-NEXT:    s_load_b64 s[4:5], s[2:3], 0x24
+; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s1, v10
+; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s0, v9
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v9, v1
-; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s1, v12
-; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s0, v11
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v10, v2
 ; GFX1132_DPP-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -12821,11 +12769,8 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
 ; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
 ; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX8_DPP-NEXT:    v_bfrev_b32_e32 v1, -2
-; GFX8_DPP-NEXT:    v_bfrev_b32_e32 v2, -2
-; GFX8_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX8_DPP-NEXT:    s_mov_b64 exec, -1
-; GFX8_DPP-NEXT:    s_nop 0
+; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; GFX8_DPP-NEXT:    s_nop 1
 ; GFX8_DPP-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX8_DPP-NEXT:    s_nop 1
 ; GFX8_DPP-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -12869,11 +12814,8 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
 ; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
 ; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX9_DPP-NEXT:    v_bfrev_b32_e32 v1, -2
-; GFX9_DPP-NEXT:    v_bfrev_b32_e32 v2, -2
-; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX9_DPP-NEXT:    s_mov_b64 exec, -1
-; GFX9_DPP-NEXT:    s_nop 0
+; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; GFX9_DPP-NEXT:    s_nop 1
 ; GFX9_DPP-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX9_DPP-NEXT:    s_nop 1
 ; GFX9_DPP-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -13771,21 +13713,17 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ;
 ; GFX8_DPP-LABEL: min_i64_varying:
 ; GFX8_DPP:       ; %bb.0: ; %entry
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, v0
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v9, 0
+; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v7
 ; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX8_DPP-NEXT:    s_mov_b32 s6, -1
 ; GFX8_DPP-NEXT:    s_brev_b32 s7, -2
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v1, s6
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v2, s7
-; GFX8_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v1, v7
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v2, v8
-; GFX8_DPP-NEXT:    s_mov_b64 exec, -1
 ; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX8_DPP-NEXT:    v_bfrev_b32_e32 v1, -2
 ; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, s7
+; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v2, v1, 0, s[0:1]
+; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[0:1]
 ; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8_DPP-NEXT:    s_nop 0
@@ -13839,22 +13777,22 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX8_DPP-NEXT:    v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX8_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
+; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX8_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
 ; GFX8_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8_DPP-NEXT:    s_cbranch_execz .LBB26_2
 ; GFX8_DPP-NEXT:  ; %bb.1:
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v10, s5
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v9, s4
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, s5
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, s4
 ; GFX8_DPP-NEXT:    s_mov_b32 m0, -1
-; GFX8_DPP-NEXT:    ds_min_rtn_i64 v[9:10], v8, v[9:10]
+; GFX8_DPP-NEXT:    ds_min_rtn_i64 v[7:8], v9, v[7:8]
 ; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8_DPP-NEXT:  .LBB26_2:
 ; GFX8_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX8_DPP-NEXT:    v_readfirstlane_b32 s1, v8
+; GFX8_DPP-NEXT:    v_readfirstlane_b32 s0, v7
 ; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, v3
 ; GFX8_DPP-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX8_DPP-NEXT:    v_readfirstlane_b32 s1, v10
-; GFX8_DPP-NEXT:    v_readfirstlane_b32 s0, v9
 ; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX8_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[7:8]
 ; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s1
@@ -13868,21 +13806,17 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ;
 ; GFX9_DPP-LABEL: min_i64_varying:
 ; GFX9_DPP:       ; %bb.0: ; %entry
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v0
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v9, 0
+; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v7
 ; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX9_DPP-NEXT:    s_mov_b32 s6, -1
 ; GFX9_DPP-NEXT:    s_brev_b32 s7, -2
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v2, s7
-; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v1, v7
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v2, v8
-; GFX9_DPP-NEXT:    s_mov_b64 exec, -1
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX9_DPP-NEXT:    v_bfrev_b32_e32 v1, -2
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, s7
+; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v2, v1, 0, s[0:1]
+; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[0:1]
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX9_DPP-NEXT:    s_nop 0
@@ -13936,21 +13870,21 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_DPP-NEXT:    v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
+; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX9_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
 ; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9_DPP-NEXT:    s_cbranch_execz .LBB26_2
 ; GFX9_DPP-NEXT:  ; %bb.1:
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v10, s5
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v9, s4
-; GFX9_DPP-NEXT:    ds_min_rtn_i64 v[9:10], v8, v[9:10]
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, s5
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, s4
+; GFX9_DPP-NEXT:    ds_min_rtn_i64 v[7:8], v9, v[7:8]
 ; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9_DPP-NEXT:  .LBB26_2:
 ; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9_DPP-NEXT:    v_readfirstlane_b32 s1, v8
+; GFX9_DPP-NEXT:    v_readfirstlane_b32 s0, v7
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v3
 ; GFX9_DPP-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX9_DPP-NEXT:    v_readfirstlane_b32 s1, v10
-; GFX9_DPP-NEXT:    v_readfirstlane_b32 s0, v9
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX9_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[7:8]
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s1
@@ -13964,15 +13898,13 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1064_DPP-LABEL: min_i64_varying:
 ; GFX1064_DPP:       ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v10, 0
+; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064_DPP-NEXT:    s_mov_b32 s6, -1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, v0
 ; GFX1064_DPP-NEXT:    s_brev_b32 s7, -2
-; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, s7
-; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v3, s6, v9, s[0:1]
-; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v4, s7, v10, s[0:1]
+; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v4, 0x7fffffff, 0, s[0:1]
+; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v3, -1, v0, s[0:1]
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v1
@@ -14039,29 +13971,30 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v2, s7, 32
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v1, s8, 32
 ; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v9, exec_hi, v0
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; GFX1064_DPP-NEXT:    s_mov_b64 s[0:1], s[4:5]
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v2, s10, 48
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v1, s11, 48
 ; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[8:9]
-; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr11_vgpr12
+; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
+; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
 ; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB26_2
 ; GFX1064_DPP-NEXT:  ; %bb.1:
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v12, s1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v11, s0
-; GFX1064_DPP-NEXT:    ds_min_rtn_i64 v[11:12], v10, v[11:12]
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v10, s1
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, s0
+; GFX1064_DPP-NEXT:    ds_min_rtn_i64 v[9:10], v0, v[9:10]
 ; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064_DPP-NEXT:    buffer_gl0_inv
 ; GFX1064_DPP-NEXT:  .LBB26_2:
 ; GFX1064_DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064_DPP-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s1, v10
+; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s0, v9
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, v1
-; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s1, v12
-; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s0, v11
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v10, v2
 ; GFX1064_DPP-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1064_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[9:10]
@@ -14073,15 +14006,13 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1032_DPP-LABEL: min_i64_varying:
 ; GFX1032_DPP:       ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v10, 0
+; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX1032_DPP-NEXT:    s_mov_b32 s6, -1
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, v0
 ; GFX1032_DPP-NEXT:    s_brev_b32 s7, -2
-; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v2, s7
-; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v3, s6, v9, s4
-; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v4, s7, v10, s4
+; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v4, 0x7fffffff, 0, s4
+; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v3, -1, v0, s4
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v1
@@ -14126,28 +14057,29 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s7, v3, 15
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v2, s5, 16
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v1, s7, 16
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr11_vgpr12
+; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v9
+; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
 ; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB26_2
 ; GFX1032_DPP-NEXT:  ; %bb.1:
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v12, s1
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v11, s0
-; GFX1032_DPP-NEXT:    ds_min_rtn_i64 v[11:12], v10, v[11:12]
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v10, s1
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, s0
+; GFX1032_DPP-NEXT:    ds_min_rtn_i64 v[9:10], v0, v[9:10]
 ; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032_DPP-NEXT:    buffer_gl0_inv
 ; GFX1032_DPP-NEXT:  .LBB26_2:
 ; GFX1032_DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1032_DPP-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s1, v10
+; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s0, v9
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, v1
-; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s1, v12
-; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s0, v11
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v10, v2
 ; GFX1032_DPP-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1032_DPP-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[0:1], v[9:10]
@@ -14159,79 +14091,79 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1164_DPP-LABEL: min_i64_varying:
 ; GFX1164_DPP:       ; %bb.0: ; %entry
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v10, 0
+; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164_DPP-NEXT:    s_mov_b32 s6, -1
 ; GFX1164_DPP-NEXT:    s_brev_b32 s7, -2
-; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT:    v_and_b32_e32 v9, 0x3ff, v0
+; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v5, s6, v9, s[0:1]
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v6, s7, v10, s[0:1]
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, v2
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v4, 0x7fffffff, 0, s[0:1]
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v3, -1, v0, s[0:1]
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[5:6], v[3:4]
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6]
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
 ; GFX1164_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8]
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, v2
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
 ; GFX1164_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6]
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1164_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8]
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
 ; GFX1164_DPP-NEXT:    v_permlanex16_b32 v8, v3, -1, -1
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
 ; GFX1164_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6]
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v4, 31
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s5, v3, 31
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, s4
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, s5
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, s5
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
 ; GFX1164_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6]
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -14250,29 +14182,29 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_writelane_b32 v1, s8, 32
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v9, exec_hi, v0
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; GFX1164_DPP-NEXT:    s_mov_b64 s[0:1], s[4:5]
 ; GFX1164_DPP-NEXT:    v_writelane_b32 v2, s10, 48
 ; GFX1164_DPP-NEXT:    v_writelane_b32 v1, s11, 48
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[8:9]
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT:    s_mov_b64 s[4:5], exec
-; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr11_vgpr12
-; GFX1164_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
+; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
+; GFX1164_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1164_DPP-NEXT:    s_cbranch_execz .LBB26_2
 ; GFX1164_DPP-NEXT:  ; %bb.1:
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v12, s1
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v11, s0
-; GFX1164_DPP-NEXT:    ds_min_rtn_i64 v[11:12], v10, v[11:12]
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v10, s1
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v9, s0
+; GFX1164_DPP-NEXT:    ds_min_rtn_i64 v[9:10], v0, v[9:10]
 ; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_DPP-NEXT:    buffer_gl0_inv
 ; GFX1164_DPP-NEXT:  .LBB26_2:
 ; GFX1164_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1164_DPP-NEXT:    s_load_b64 s[4:5], s[2:3], 0x24
+; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s1, v10
+; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s0, v9
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v9, v1
-; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s1, v12
-; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s0, v11
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v10, v2
 ; GFX1164_DPP-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -14287,82 +14219,82 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1132_DPP-LABEL: min_i64_varying:
 ; GFX1132_DPP:       ; %bb.0: ; %entry
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v10, 0
-; GFX1132_DPP-NEXT:    s_mov_b32 s6, -1
-; GFX1132_DPP-NEXT:    s_brev_b32 s7, -2
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132_DPP-NEXT:    s_brev_b32 s7, -2
+; GFX1132_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT:    v_and_b32_e32 v9, 0x3ff, v0
+; GFX1132_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v5, s6, v9, s4
-; GFX1132_DPP-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2
-; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v6, s7, v10, s4
+; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v4, 0x7fffffff, 0, s4
+; GFX1132_DPP-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v3, -1, v0, s4
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[5:6], v[3:4]
-; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6]
+; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8]
 ; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6]
 ; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8]
 ; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v8, v3, -1, -1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v8, v3, -1, -1
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6]
 ; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v4, 31
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s0, v3, 31
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s5, v4, 15
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s7, v3, 15
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v2, s5, 16
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v1, s7, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT:    s_mov_b32 s4, exec_lo
-; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr11_vgpr12
-; GFX1132_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v9
+; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
+; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB26_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
-; GFX1132_DPP-NEXT:    v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0
-; GFX1132_DPP-NEXT:    ds_min_rtn_i64 v[11:12], v10, v[11:12]
+; GFX1132_DPP-NEXT:    v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0
+; GFX1132_DPP-NEXT:    ds_min_rtn_i64 v[9:10], v0, v[9:10]
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_gl0_inv
 ; GFX1132_DPP-NEXT:  .LBB26_2:
 ; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1132_DPP-NEXT:    s_load_b64 s[4:5], s[2:3], 0x24
+; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s1, v10
+; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s0, v9
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v9, v1
-; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s1, v12
-; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s0, v11
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v10, v2
 ; GFX1132_DPP-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -15634,85 +15566,84 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ;
 ; GFX8_DPP-LABEL: umax_i64_varying:
 ; GFX8_DPP:       ; %bb.0: ; %entry
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, v0
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v9, 0
+; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v7
 ; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v7, s[0:1]
-; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, 0, s[0:1]
+; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8_DPP-NEXT:    v_readlane_b32 s5, v4, 63
-; GFX8_DPP-NEXT:    v_readlane_b32 s4, v3, 63
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT:    v_readlane_b32 s5, v2, 63
+; GFX8_DPP-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX8_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
+; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX8_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
 ; GFX8_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8_DPP-NEXT:    s_cbranch_execz .LBB29_2
 ; GFX8_DPP-NEXT:  ; %bb.1:
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v10, s5
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v9, s4
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, s5
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, s4
 ; GFX8_DPP-NEXT:    s_mov_b32 m0, -1
-; GFX8_DPP-NEXT:    ds_max_rtn_u64 v[9:10], v8, v[9:10]
+; GFX8_DPP-NEXT:    ds_max_rtn_u64 v[7:8], v9, v[7:8]
 ; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8_DPP-NEXT:  .LBB29_2:
 ; GFX8_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, v1
+; GFX8_DPP-NEXT:    v_readfirstlane_b32 s5, v8
+; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v7
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, v3
 ; GFX8_DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX8_DPP-NEXT:    v_readfirstlane_b32 s5, v10
-; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v9
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, v2
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX8_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[7:8]
 ; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v8, v8, v0, vcc
@@ -15726,84 +15657,83 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ;
 ; GFX9_DPP-LABEL: umax_i64_varying:
 ; GFX9_DPP:       ; %bb.0: ; %entry
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v0
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v9, 0
+; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v7
 ; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v7, s[0:1]
-; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, 0, s[0:1]
+; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9_DPP-NEXT:    v_readlane_b32 s5, v4, 63
-; GFX9_DPP-NEXT:    v_readlane_b32 s4, v3, 63
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT:    v_readlane_b32 s5, v2, 63
+; GFX9_DPP-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
+; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX9_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
 ; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9_DPP-NEXT:    s_cbranch_execz .LBB29_2
 ; GFX9_DPP-NEXT:  ; %bb.1:
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v10, s5
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v9, s4
-; GFX9_DPP-NEXT:    ds_max_rtn_u64 v[9:10], v8, v[9:10]
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, s5
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, s4
+; GFX9_DPP-NEXT:    ds_max_rtn_u64 v[7:8], v9, v[7:8]
 ; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9_DPP-NEXT:  .LBB29_2:
 ; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v1
+; GFX9_DPP-NEXT:    v_readfirstlane_b32 s5, v8
+; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v7
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v3
 ; GFX9_DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX9_DPP-NEXT:    v_readfirstlane_b32 s5, v10
-; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v9
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, v2
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX9_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[7:8]
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v8, v8, v0, vcc
@@ -15817,18 +15747,14 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1064_DPP-LABEL: umax_i64_varying:
 ; GFX1064_DPP:       ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, v0
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v10, 0
-; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v9, s[0:1]
-; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v10, s[0:1]
+; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, 0, s[0:1]
+; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s[0:1]
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, v2
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -15892,30 +15818,31 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v2, s6, 32
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v1, s7, 32
 ; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v9, exec_hi, v0
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX1064_DPP-NEXT:    s_mov_b64 s[0:1], s[4:5]
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v2, s8, 48
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v1, s9, 48
 ; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
 ; GFX1064_DPP-NEXT:    s_mov_b32 s6, -1
-; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr11_vgpr12
+; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
 ; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB29_2
 ; GFX1064_DPP-NEXT:  ; %bb.1:
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v12, s1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v11, s0
-; GFX1064_DPP-NEXT:    ds_max_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v10, s1
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, s0
+; GFX1064_DPP-NEXT:    ds_max_rtn_u64 v[9:10], v0, v[9:10]
 ; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064_DPP-NEXT:    buffer_gl0_inv
 ; GFX1064_DPP-NEXT:  .LBB29_2:
 ; GFX1064_DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064_DPP-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s1, v10
+; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s0, v9
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, v1
-; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s1, v12
-; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s0, v11
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v10, v2
 ; GFX1064_DPP-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1064_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[9:10]
@@ -15927,13 +15854,11 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1032_DPP-LABEL: umax_i64_varying:
 ; GFX1032_DPP:       ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v10, 0
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, v0
 ; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v9, s4
-; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v10, s4
+; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, 0, s4
+; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s4
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, v2
@@ -15978,29 +15903,30 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s5, v4, 15
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v2, s5, 16
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v1, s6, 16
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v9
 ; GFX1032_DPP-NEXT:    s_mov_b32 s6, -1
-; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr11_vgpr12
+; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
 ; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB29_2
 ; GFX1032_DPP-NEXT:  ; %bb.1:
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v12, s1
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v11, s0
-; GFX1032_DPP-NEXT:    ds_max_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v10, s1
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, s0
+; GFX1032_DPP-NEXT:    ds_max_rtn_u64 v[9:10], v0, v[9:10]
 ; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032_DPP-NEXT:    buffer_gl0_inv
 ; GFX1032_DPP-NEXT:  .LBB29_2:
 ; GFX1032_DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1032_DPP-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s1, v10
+; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s0, v9
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, v1
-; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s1, v12
-; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s0, v11
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v10, v2
 ; GFX1032_DPP-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1032_DPP-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[9:10]
@@ -16016,73 +15942,73 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v10, 0
-; GFX1164_DPP-NEXT:    v_and_b32_e32 v9, 0x3ff, v0
+; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v9, s[0:1]
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v6, 0, v10, s[0:1]
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, v2
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, 0, s[0:1]
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s[0:1]
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[5:6], v[3:4]
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
 ; GFX1164_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8]
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, v2
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
 ; GFX1164_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1164_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8]
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
 ; GFX1164_DPP-NEXT:    v_permlanex16_b32 v8, v3, -1, -1
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
 ; GFX1164_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v4, 31
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s5, v3, 31
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, s4
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, s5
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, s5
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
 ; GFX1164_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -16101,29 +16027,30 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_writelane_b32 v1, s7, 32
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v9, exec_hi, v0
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX1164_DPP-NEXT:    s_mov_b64 s[0:1], s[4:5]
 ; GFX1164_DPP-NEXT:    v_writelane_b32 v2, s8, 48
 ; GFX1164_DPP-NEXT:    v_writelane_b32 v1, s9, 48
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX1164_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
 ; GFX1164_DPP-NEXT:    s_mov_b32 s6, -1
-; GFX1164_DPP-NEXT:    s_mov_b64 s[4:5], exec
-; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr11_vgpr12
-; GFX1164_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
+; GFX1164_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1164_DPP-NEXT:    s_cbranch_execz .LBB29_2
 ; GFX1164_DPP-NEXT:  ; %bb.1:
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v12, s1
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v11, s0
-; GFX1164_DPP-NEXT:    ds_max_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v10, s1
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v9, s0
+; GFX1164_DPP-NEXT:    ds_max_rtn_u64 v[9:10], v0, v[9:10]
 ; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_DPP-NEXT:    buffer_gl0_inv
 ; GFX1164_DPP-NEXT:  .LBB29_2:
 ; GFX1164_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1164_DPP-NEXT:    s_load_b64 s[4:5], s[2:3], 0x24
+; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s1, v10
+; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s0, v9
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v9, v1
-; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s1, v12
-; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s0, v11
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v10, v2
 ; GFX1164_DPP-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -16142,49 +16069,49 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT:    v_dual_mov_b32 v10, 0 :: v_dual_and_b32 v9, 0x3ff, v0
+; GFX1132_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v9, s4
-; GFX1132_DPP-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v6, 0, v10, s4
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, 0, s4
+; GFX1132_DPP-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s4
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
-; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    v_cmp_gt_u64_e32 vcc_lo, v[5:6], v[3:4]
-; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6]
+; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8]
 ; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6]
 ; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8]
 ; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_permlanex16_b32 v8, v3, -1, -1
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6]
 ; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v4, 31
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v3, 15
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s0, v3, 31
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -16192,27 +16119,28 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s4
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v2, s5, 16
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v1, s6, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v9
 ; GFX1132_DPP-NEXT:    s_mov_b32 s6, -1
-; GFX1132_DPP-NEXT:    s_mov_b32 s4, exec_lo
-; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr11_vgpr12
-; GFX1132_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
+; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB29_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
-; GFX1132_DPP-NEXT:    v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0
-; GFX1132_DPP-NEXT:    ds_max_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1132_DPP-NEXT:    v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0
+; GFX1132_DPP-NEXT:    ds_max_rtn_u64 v[9:10], v0, v[9:10]
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_gl0_inv
 ; GFX1132_DPP-NEXT:  .LBB29_2:
 ; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1132_DPP-NEXT:    s_load_b64 s[4:5], s[2:3], 0x24
+; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s1, v10
+; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s0, v9
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v9, v1
-; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s1, v12
-; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s0, v11
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v10, v2
 ; GFX1132_DPP-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -17485,201 +17413,193 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ;
 ; GFX8_DPP-LABEL: umin_i64_varying:
 ; GFX8_DPP:       ; %bb.0: ; %entry
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, v0
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v9, 0
+; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v7
 ; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v1, -1
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v2, -1
-; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v3, -1, v7, s[0:1]
-; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v4, -1, v8, s[0:1]
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, -1
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, -1
+; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v2, -1, 0, s[0:1]
+; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[0:1]
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8_DPP-NEXT:    s_nop 0
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8_DPP-NEXT:    v_readlane_b32 s5, v4, 63
-; GFX8_DPP-NEXT:    v_readlane_b32 s4, v3, 63
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT:    v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT:    v_readlane_b32 s5, v2, 63
+; GFX8_DPP-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX8_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
+; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX8_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
 ; GFX8_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8_DPP-NEXT:    s_cbranch_execz .LBB32_2
 ; GFX8_DPP-NEXT:  ; %bb.1:
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v10, s5
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v9, s4
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, s5
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, s4
 ; GFX8_DPP-NEXT:    s_mov_b32 m0, -1
-; GFX8_DPP-NEXT:    ds_min_rtn_u64 v[9:10], v8, v[9:10]
+; GFX8_DPP-NEXT:    ds_min_rtn_u64 v[7:8], v9, v[7:8]
 ; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8_DPP-NEXT:  .LBB32_2:
 ; GFX8_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, v1
-; GFX8_DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX8_DPP-NEXT:    v_readfirstlane_b32 s5, v10
-; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v9
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX8_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[7:8]
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX8_DPP-NEXT:    v_readfirstlane_b32 s1, v8
+; GFX8_DPP-NEXT:    v_readfirstlane_b32 s0, v7
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX8_DPP-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX8_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[7:8]
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v8, v8, v0, vcc
-; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8_DPP-NEXT:    s_mov_b32 s3, 0xf000
-; GFX8_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8_DPP-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v7, v7, v0, vcc
 ; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[0:3], 0
+; GFX8_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[4:7], 0
 ; GFX8_DPP-NEXT:    s_endpgm
 ;
 ; GFX9_DPP-LABEL: umin_i64_varying:
 ; GFX9_DPP:       ; %bb.0: ; %entry
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v0
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v9, 0
+; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v7
 ; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v1, -1
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v2, -1
-; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v3, -1, v7, s[0:1]
-; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v4, -1, v8, s[0:1]
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, -1
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, -1
+; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v2, -1, 0, s[0:1]
+; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[0:1]
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX9_DPP-NEXT:    s_nop 0
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9_DPP-NEXT:    v_readlane_b32 s5, v4, 63
-; GFX9_DPP-NEXT:    v_readlane_b32 s4, v3, 63
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT:    v_readlane_b32 s5, v2, 63
+; GFX9_DPP-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
+; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX9_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
 ; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9_DPP-NEXT:    s_cbranch_execz .LBB32_2
 ; GFX9_DPP-NEXT:  ; %bb.1:
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v10, s5
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v9, s4
-; GFX9_DPP-NEXT:    ds_min_rtn_u64 v[9:10], v8, v[9:10]
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, s5
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, s4
+; GFX9_DPP-NEXT:    ds_min_rtn_u64 v[7:8], v9, v[7:8]
 ; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9_DPP-NEXT:  .LBB32_2:
 ; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v1
-; GFX9_DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX9_DPP-NEXT:    v_readfirstlane_b32 s5, v10
-; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v9
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX9_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[7:8]
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX9_DPP-NEXT:    v_readfirstlane_b32 s1, v8
+; GFX9_DPP-NEXT:    v_readfirstlane_b32 s0, v7
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9_DPP-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[7:8]
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v8, v8, v0, vcc
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9_DPP-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v7, v7, v0, vcc
 ; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[0:3], 0
+; GFX9_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[4:7], 0
 ; GFX9_DPP-NEXT:    s_endpgm
 ;
 ; GFX1064_DPP-LABEL: umin_i64_varying:
 ; GFX1064_DPP:       ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, v0
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v1, -1
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, -1
-; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v10, 0
-; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v3, -1, v9, s[0:1]
-; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v4, -1, v10, s[0:1]
+; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v4, -1, 0, s[0:1]
+; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v3, -1, v0, s[0:1]
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, v2
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -17743,30 +17663,31 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v2, s6, 32
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v1, s7, 32
 ; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v9, exec_hi, v0
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX1064_DPP-NEXT:    s_mov_b64 s[0:1], s[4:5]
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v2, s8, 48
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v1, s9, 48
 ; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
 ; GFX1064_DPP-NEXT:    s_mov_b32 s6, -1
-; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr11_vgpr12
+; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
 ; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB32_2
 ; GFX1064_DPP-NEXT:  ; %bb.1:
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v12, s1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v11, s0
-; GFX1064_DPP-NEXT:    ds_min_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v10, s1
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, s0
+; GFX1064_DPP-NEXT:    ds_min_rtn_u64 v[9:10], v0, v[9:10]
 ; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064_DPP-NEXT:    buffer_gl0_inv
 ; GFX1064_DPP-NEXT:  .LBB32_2:
 ; GFX1064_DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064_DPP-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s1, v10
+; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s0, v9
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, v1
-; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s1, v12
-; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s0, v11
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v10, v2
 ; GFX1064_DPP-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1064_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[9:10]
@@ -17778,13 +17699,11 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1032_DPP-LABEL: umin_i64_varying:
 ; GFX1032_DPP:       ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v10, 0
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, v0
 ; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v1, -1
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v2, -1
-; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v3, -1, v9, s4
-; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v4, -1, v10, s4
+; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v4, -1, 0, s4
+; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v3, -1, v0, s4
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, v2
@@ -17829,29 +17748,30 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s5, v4, 15
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v2, s5, 16
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v1, s6, 16
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v9
 ; GFX1032_DPP-NEXT:    s_mov_b32 s6, -1
-; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr11_vgpr12
+; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
 ; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB32_2
 ; GFX1032_DPP-NEXT:  ; %bb.1:
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v12, s1
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v11, s0
-; GFX1032_DPP-NEXT:    ds_min_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v10, s1
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, s0
+; GFX1032_DPP-NEXT:    ds_min_rtn_u64 v[9:10], v0, v[9:10]
 ; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032_DPP-NEXT:    buffer_gl0_inv
 ; GFX1032_DPP-NEXT:  .LBB32_2:
 ; GFX1032_DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1032_DPP-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s1, v10
+; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s0, v9
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, v1
-; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s1, v12
-; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s0, v11
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v10, v2
 ; GFX1032_DPP-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1032_DPP-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[9:10]
@@ -17867,73 +17787,73 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v1, -1
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, -1
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v10, 0
-; GFX1164_DPP-NEXT:    v_and_b32_e32 v9, 0x3ff, v0
+; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v5, -1, v9, s[0:1]
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v6, -1, v10, s[0:1]
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, v2
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v4, -1, 0, s[0:1]
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v3, -1, v0, s[0:1]
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[5:6], v[3:4]
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
 ; GFX1164_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8]
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, v2
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
 ; GFX1164_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1164_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8]
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
 ; GFX1164_DPP-NEXT:    v_permlanex16_b32 v8, v3, -1, -1
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
 ; GFX1164_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v4, 31
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s5, v3, 31
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, s4
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, s5
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, s5
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
 ; GFX1164_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -17952,29 +17872,30 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_writelane_b32 v1, s7, 32
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v9, exec_hi, v0
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX1164_DPP-NEXT:    s_mov_b64 s[0:1], s[4:5]
 ; GFX1164_DPP-NEXT:    v_writelane_b32 v2, s8, 48
 ; GFX1164_DPP-NEXT:    v_writelane_b32 v1, s9, 48
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX1164_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
 ; GFX1164_DPP-NEXT:    s_mov_b32 s6, -1
-; GFX1164_DPP-NEXT:    s_mov_b64 s[4:5], exec
-; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr11_vgpr12
-; GFX1164_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
+; GFX1164_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1164_DPP-NEXT:    s_cbranch_execz .LBB32_2
 ; GFX1164_DPP-NEXT:  ; %bb.1:
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v12, s1
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v11, s0
-; GFX1164_DPP-NEXT:    ds_min_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v10, s1
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v9, s0
+; GFX1164_DPP-NEXT:    ds_min_rtn_u64 v[9:10], v0, v[9:10]
 ; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_DPP-NEXT:    buffer_gl0_inv
 ; GFX1164_DPP-NEXT:  .LBB32_2:
 ; GFX1164_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1164_DPP-NEXT:    s_load_b64 s[4:5], s[2:3], 0x24
+; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s1, v10
+; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s0, v9
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v9, v1
-; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s1, v12
-; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s0, v11
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v10, v2
 ; GFX1164_DPP-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -17993,49 +17914,49 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v1, -1
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v2, -1
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT:    v_dual_mov_b32 v10, 0 :: v_dual_and_b32 v9, 0x3ff, v0
+; GFX1132_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v5, -1, v9, s4
-; GFX1132_DPP-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v6, -1, v10, s4
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v4, -1, 0, s4
+; GFX1132_DPP-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v3, -1, v0, s4
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
-; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[5:6], v[3:4]
-; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6]
+; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8]
 ; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6]
 ; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8]
 ; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_permlanex16_b32 v8, v3, -1, -1
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6]
 ; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v4, 31
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v3, 15
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s0, v3, 31
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -18043,27 +17964,28 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s4
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v2, s5, 16
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v1, s6, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v9
 ; GFX1132_DPP-NEXT:    s_mov_b32 s6, -1
-; GFX1132_DPP-NEXT:    s_mov_b32 s4, exec_lo
-; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr11_vgpr12
-; GFX1132_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
+; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB32_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
-; GFX1132_DPP-NEXT:    v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0
-; GFX1132_DPP-NEXT:    ds_min_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1132_DPP-NEXT:    v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0
+; GFX1132_DPP-NEXT:    ds_min_rtn_u64 v[9:10], v0, v[9:10]
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_gl0_inv
 ; GFX1132_DPP-NEXT:  .LBB32_2:
 ; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1132_DPP-NEXT:    s_load_b64 s[4:5], s[2:3], 0x24
+; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s1, v10
+; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s0, v9
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v9, v1
-; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s1, v12
-; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s0, v11
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v10, v2
 ; GFX1132_DPP-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)

diff  --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
index 6bd0b11acc3ea2..d62ff3795d8b8c 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
@@ -59,7 +59,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
   ; GFX908-NEXT:   [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY6]], [[COPY7]], implicit $exec
   ; GFX908-NEXT:   [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY5]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
   ; GFX908-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec
-  ; GFX908-NEXT:   [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def dead $scc, implicit $exec
+  ; GFX908-NEXT:   [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+  ; GFX908-NEXT:   [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY]], 0, [[V_MOV_B32_e32_]], killed [[DEF]], implicit $exec
   ; GFX908-NEXT:   [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
   ; GFX908-NEXT:   [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
   ; GFX908-NEXT:   [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec
@@ -120,7 +121,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
   ; GFX90A_GFX940-NEXT:   [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY6]], [[COPY7]], implicit $exec
   ; GFX90A_GFX940-NEXT:   [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY5]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
   ; GFX90A_GFX940-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec
-  ; GFX90A_GFX940-NEXT:   [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def dead $scc, implicit $exec
+  ; GFX90A_GFX940-NEXT:   [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+  ; GFX90A_GFX940-NEXT:   [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY]], 0, [[V_MOV_B32_e32_]], killed [[DEF]], implicit $exec
   ; GFX90A_GFX940-NEXT:   [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
   ; GFX90A_GFX940-NEXT:   [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
   ; GFX90A_GFX940-NEXT:   [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec
@@ -177,7 +179,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
   ; GFX11_GFX12-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GFX11_GFX12-NEXT:   [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY4]], [[S_MOV_B32_]], implicit $exec
   ; GFX11_GFX12-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec
-  ; GFX11_GFX12-NEXT:   [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def dead $scc, implicit $exec
+  ; GFX11_GFX12-NEXT:   [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+  ; GFX11_GFX12-NEXT:   [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY]], 0, [[V_MOV_B32_e32_]], killed [[DEF]], implicit $exec
   ; GFX11_GFX12-NEXT:   [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 353, 15, 15, 0, implicit $exec
   ; GFX11_GFX12-NEXT:   [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
   ; GFX11_GFX12-NEXT:   [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 354, 15, 15, 0, implicit $exec
@@ -186,8 +189,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
   ; GFX11_GFX12-NEXT:   [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, killed [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec
   ; GFX11_GFX12-NEXT:   [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_2]], 360, 15, 15, 0, implicit $exec
   ; GFX11_GFX12-NEXT:   [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, killed [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec
-  ; GFX11_GFX12-NEXT:   [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; GFX11_GFX12-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
+  ; GFX11_GFX12-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; GFX11_GFX12-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF1]]
   ; GFX11_GFX12-NEXT:   [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_]], 0, [[S_MOV_B32_]], [[COPY5]], 0, implicit $exec
   ; GFX11_GFX12-NEXT:   [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, killed [[V_PERMLANEX16_B32_e64_]], 0, 0, implicit $mode, implicit $exec
   ; GFX11_GFX12-NEXT:   early-clobber %1:vgpr_32 = STRICT_WWM killed [[V_ADD_F32_e64_4]], implicit $exec

diff  --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
index 6766c0c1fdaeb0..946ee9eb9c0653 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
@@ -61,7 +61,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
   ; GFX90A-NEXT:   [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY6]], [[COPY7]], implicit $exec
   ; GFX90A-NEXT:   [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY5]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
   ; GFX90A-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec
-  ; GFX90A-NEXT:   [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def dead $scc, implicit $exec
+  ; GFX90A-NEXT:   [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+  ; GFX90A-NEXT:   [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY]], 0, [[V_MOV_B32_e32_]], killed [[DEF1]], implicit $exec
   ; GFX90A-NEXT:   [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
   ; GFX90A-NEXT:   [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
   ; GFX90A-NEXT:   [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec
@@ -80,7 +81,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
   ; GFX90A-NEXT:   early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec
   ; GFX90A-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
   ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]]
-  ; GFX90A-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
   ; GFX90A-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GFX90A-NEXT:   S_BRANCH %bb.2
   ; GFX90A-NEXT: {{  $}}
@@ -102,11 +103,11 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
   ; GFX90A-NEXT: bb.4 (%ir-block.35):
   ; GFX90A-NEXT:   successors: %bb.3(0x80000000)
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF1]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2
+  ; GFX90A-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF2]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2
   ; GFX90A-NEXT:   SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GFX90A-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec
-  ; GFX90A-NEXT:   early-clobber %45:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec
-  ; GFX90A-NEXT:   [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %45, 0, 0, implicit $mode, implicit $exec
+  ; GFX90A-NEXT:   early-clobber %46:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec
+  ; GFX90A-NEXT:   [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %46, 0, 0, implicit $mode, implicit $exec
   ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_64_xexec = COPY [[COPY8]]
   ; GFX90A-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX90A-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_6]], 0, [[COPY11]], [[COPY10]], implicit $exec
@@ -142,7 +143,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
   ; GFX940-NEXT:   [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY6]], [[COPY7]], implicit $exec
   ; GFX940-NEXT:   [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY5]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
   ; GFX940-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec
-  ; GFX940-NEXT:   [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def dead $scc, implicit $exec
+  ; GFX940-NEXT:   [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+  ; GFX940-NEXT:   [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY]], 0, [[V_MOV_B32_e32_]], killed [[DEF1]], implicit $exec
   ; GFX940-NEXT:   [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
   ; GFX940-NEXT:   [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
   ; GFX940-NEXT:   [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec
@@ -161,7 +163,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
   ; GFX940-NEXT:   early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec
   ; GFX940-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
   ; GFX940-NEXT:   [[COPY8:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]]
-  ; GFX940-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; GFX940-NEXT:   [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
   ; GFX940-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GFX940-NEXT:   S_BRANCH %bb.2
   ; GFX940-NEXT: {{  $}}
@@ -183,11 +185,11 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
   ; GFX940-NEXT: bb.4 (%ir-block.35):
   ; GFX940-NEXT:   successors: %bb.3(0x80000000)
   ; GFX940-NEXT: {{  $}}
-  ; GFX940-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF1]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2
+  ; GFX940-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF2]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2
   ; GFX940-NEXT:   SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GFX940-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec
-  ; GFX940-NEXT:   early-clobber %44:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec
-  ; GFX940-NEXT:   [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %44, 0, 0, implicit $mode, implicit $exec
+  ; GFX940-NEXT:   early-clobber %45:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec
+  ; GFX940-NEXT:   [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %45, 0, 0, implicit $mode, implicit $exec
   ; GFX940-NEXT:   [[COPY10:%[0-9]+]]:sreg_64_xexec = COPY [[COPY8]]
   ; GFX940-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX940-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_6]], 0, [[COPY11]], [[COPY10]], implicit $exec
@@ -219,7 +221,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
   ; GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GFX11-NEXT:   [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY4]], [[S_MOV_B32_]], implicit $exec
   ; GFX11-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec
-  ; GFX11-NEXT:   [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def dead $scc, implicit $exec
+  ; GFX11-NEXT:   [[DEF1:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+  ; GFX11-NEXT:   [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY]], 0, [[V_MOV_B32_e32_]], killed [[DEF1]], implicit $exec
   ; GFX11-NEXT:   [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
   ; GFX11-NEXT:   [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
   ; GFX11-NEXT:   [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec
@@ -229,8 +232,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
   ; GFX11-NEXT:   [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec
   ; GFX11-NEXT:   [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, killed [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec
   ; GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-  ; GFX11-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; GFX11-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF1]]
+  ; GFX11-NEXT:   [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; GFX11-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
   ; GFX11-NEXT:   [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_1]], 0, [[S_MOV_B32_1]], [[COPY5]], 0, implicit $exec
   ; GFX11-NEXT:   [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], killed [[V_PERMLANEX16_B32_e64_]], 228, 10, 15, 0, implicit $exec
   ; GFX11-NEXT:   [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, killed [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec
@@ -244,7 +247,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
   ; GFX11-NEXT:   early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_1]], implicit $exec
   ; GFX11-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_LO_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
   ; GFX11-NEXT:   [[COPY6:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]]
-  ; GFX11-NEXT:   [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; GFX11-NEXT:   [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
   ; GFX11-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GFX11-NEXT:   S_BRANCH %bb.2
   ; GFX11-NEXT: {{  $}}
@@ -266,11 +269,11 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
   ; GFX11-NEXT: bb.4 (%ir-block.32):
   ; GFX11-NEXT:   successors: %bb.3(0x80000000)
   ; GFX11-NEXT: {{  $}}
-  ; GFX11-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF2]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2
+  ; GFX11-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF3]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2
   ; GFX11-NEXT:   SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GFX11-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec
-  ; GFX11-NEXT:   early-clobber %46:vgpr_32 = STRICT_WWM [[V_WRITELANE_B32_]], implicit $exec
-  ; GFX11-NEXT:   [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %46, 0, 0, implicit $mode, implicit $exec
+  ; GFX11-NEXT:   early-clobber %47:vgpr_32 = STRICT_WWM [[V_WRITELANE_B32_]], implicit $exec
+  ; GFX11-NEXT:   [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %47, 0, 0, implicit $mode, implicit $exec
   ; GFX11-NEXT:   [[COPY8:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY6]]
   ; GFX11-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_5]], 0, [[V_READFIRSTLANE_B32_]], [[COPY8]], implicit $exec
   ; GFX11-NEXT:   S_BRANCH %bb.3

diff  --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 311c6092918863..2b18f472c8c402 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -816,10 +816,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT:    s_mov_b64 exec, -1
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v4, v3, v0, s[0:1]
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
 ; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -2027,10 +2024,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT:    s_mov_b64 exec, -1
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v4, v3, v0, s[0:1]
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
 ; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -3298,10 +3292,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT:    s_mov_b64 exec, -1
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v4, v3, v0, s[0:1]
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
 ; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -4065,10 +4056,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT:    s_mov_b64 exec, -1
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v4, v3, v0, s[0:1]
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
 ; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -5361,10 +5349,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT:    s_mov_b64 exec, -1
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v4, v3, v0, s[0:1]
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
 ; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -7330,44 +7315,45 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v9, 1
-; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v10, v8, v0, s[0:1]
-; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v11, v9, v1, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v10, 0
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v8, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v11, 1
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v9, v8, v1, s[0:1]
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v8, 0, v0, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v9 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v8 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v9 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v8 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v9 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v8 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v9 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[10:11], v[10:11], v[12:13]
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v8 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v9 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[12:13]
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[8:9], v[10:11], v[8:9]
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v10, v8 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v11, v9 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[10:11]
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -7463,9 +7449,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v9, 1
-; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1]
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v10, 0, v0, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v11, v9, v1, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v13, v9
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -7585,9 +7571,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v9, 1
-; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v11, 0x80000000, v1, s0
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v10, 0, v0, s0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v11, v9, v1, s0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v13, v9
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -7691,12 +7677,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v9, 1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1]
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v10, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v11, v9, v1, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -7813,42 +7799,42 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v9, 1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v11, 0x80000000, v1, s0
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v10, 0, v0, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v11, v9, v1, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[10:11], v[10:11], v[12:13]
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[10:11], v[10:11], v[12:13]
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[10:11], v[10:11], v[12:13]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[8:9], v[10:11], v[8:9]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v11, v9, 0, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v8
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v42, v9
 ; GFX1132-DPP-NEXT:    s_mov_b32 s44, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -8907,44 +8893,45 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v0, s[0:1]
-; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v6, v4, v1, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v6, 1
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v4, v3, v1, s[0:1]
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[7:8]
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[5:6]
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -9008,9 +8995,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v6, 0x80000000, v1, s[0:1]
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v0, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v6, v4, v1, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -9096,9 +9083,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v0, s0
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v6, 0x80000000, v1, s0
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v0, s0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v6, v4, v1, s0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -9168,12 +9155,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v4, v2, v0, s[0:1]
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v5, 0x80000000, v1, s[0:1]
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v1, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -9259,42 +9246,42 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v4, v2, v0, s0
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v5, 0x80000000, v1, s0
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v1, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v4, v2, 0, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v8, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v8
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB12_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -10330,44 +10317,45 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v0, s[0:1]
-; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v6, v4, v1, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v6, 1
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v4, v3, v1, s[0:1]
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[7:8]
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[5:6]
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -10431,9 +10419,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v6, 0x80000000, v1, s[0:1]
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v0, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v6, v4, v1, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -10519,9 +10507,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v0, s0
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v6, 0x80000000, v1, s0
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v0, s0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v6, v4, v1, s0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -10591,12 +10579,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v4, v2, v0, s[0:1]
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v5, 0x80000000, v1, s[0:1]
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v1, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -10682,42 +10670,42 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v4, v2, v0, s0
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v5, 0x80000000, v1, s0
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v1, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v4, v2, 0, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v8, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v8
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB14_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -11235,44 +11223,45 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v0, s[0:1]
-; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v6, v4, v1, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v6, 1
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v4, v3, v1, s[0:1]
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[7:8]
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[5:6]
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -11336,9 +11325,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v6, 0x80000000, v1, s[0:1]
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v0, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v6, v4, v1, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -11424,9 +11413,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v0, s0
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v6, 0x80000000, v1, s0
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v0, s0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v6, v4, v1, s0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -11496,12 +11485,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v4, v2, v0, s[0:1]
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v5, 0x80000000, v1, s[0:1]
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v1, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -11587,42 +11576,42 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v4, v2, v0, s0
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v5, 0x80000000, v1, s0
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v1, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v4, v2, 0, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v8, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v8
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB15_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -13305,44 +13294,45 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v9, 1
-; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v10, v8, v0, s[0:1]
-; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v11, v9, v1, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v10, 0
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v8, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v11, 1
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v9, v8, v1, s[0:1]
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v8, 0, v0, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v9 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v8 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v9 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v8 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v9 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v8 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v9 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[10:11], v[10:11], v[12:13]
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v8 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v9 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[12:13]
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[8:9], v[10:11], v[8:9]
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v10, v8 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v11, v9 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[10:11]
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -13438,9 +13428,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v9, 1
-; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1]
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v10, 0, v0, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v11, v9, v1, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v13, v9
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -13560,9 +13550,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v9, 1
-; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v11, 0x80000000, v1, s0
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v10, 0, v0, s0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v11, v9, v1, s0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v13, v9
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -13666,12 +13656,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v9, 1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1]
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v10, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v11, v9, v1, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -13788,42 +13778,42 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v9, 1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v11, 0x80000000, v1, s0
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v10, 0, v0, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v11, v9, v1, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[10:11], v[10:11], v[12:13]
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[10:11], v[10:11], v[12:13]
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[10:11], v[10:11], v[12:13]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[8:9], v[10:11], v[8:9]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v11, v9, 0, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v8
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v42, v9
 ; GFX1132-DPP-NEXT:    s_mov_b32 s44, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB17_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:

diff  --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 9dc82b17bd3f4d..e3144ae24ae8d6 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -718,10 +718,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT:    s_mov_b64 exec, -1
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v4, v3, v0, s[0:1]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
 ; GFX9-DPP-NEXT:    s_nop 1
 ; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -1756,10 +1753,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT:    s_mov_b64 exec, -1
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v4, v3, v0, s[0:1]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
 ; GFX9-DPP-NEXT:    s_nop 1
 ; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -2794,10 +2788,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT:    s_mov_b64 exec, -1
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v4, v3, v0, s[0:1]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
 ; GFX9-DPP-NEXT:    s_nop 1
 ; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -4742,51 +4733,52 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v9, 0x7ff80000
-; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v10, v8, v0, s[0:1]
-; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v11, v9, v1, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v10, 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v11, 0x7ff80000
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v9, v8, v1, s[0:1]
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v8, 0, v0, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v9 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
 ; GFX9-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v8 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v9 row_shr:2 row_mask:0xf bank_mask:0xf
 ; GFX9-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v8 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v9 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX9-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v8 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v9 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX9-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v8 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v9 row_bcast:15 row_mask:0xa bank_mask:0xf
 ; GFX9-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
+; GFX9-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[12:13]
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX9-DPP-NEXT:    v_max_f64 v[8:9], v[10:11], v[8:9]
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v10, v8 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v11, v9 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX9-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[10:11]
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -4884,9 +4876,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v9, 0x7ff80000
-; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1]
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v10, 0, v0, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v11, v9, v1, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v13, v9
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -5016,9 +5008,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v9, 0x7ff80000
-; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v10, 0, v0, s0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v11, v9, v1, s0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v13, v9
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -5130,12 +5122,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v9, 0x7ff80000
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1]
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v10, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v11, v9, v1, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
@@ -5264,51 +5256,50 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v9, 0x7ff80000
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v10, 0, v0, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v11, v9, v1, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[8:9], v[10:11], v[8:9]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v11, v9, 0, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, v8
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v9
 ; GFX1132-DPP-NEXT:    s_mov_b32 s44, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -6188,51 +6179,52 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
-; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v0, s[0:1]
-; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v6, v4, v1, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v4, v3, v1, s[0:1]
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[3:4]
 ; GFX9-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
-; GFX9-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
 ; GFX9-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
-; GFX9-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX9-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
-; GFX9-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX9-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
-; GFX9-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
 ; GFX9-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
-; GFX9-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[7:8]
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[3:4]
-; GFX9-DPP-NEXT:    v_max_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX9-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[5:6]
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -6298,9 +6290,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
-; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v6, 0x7ff80000, v1, s[0:1]
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v0, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v6, v4, v1, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -6381,9 +6373,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
-; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v0, s0
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v6, 0x7ff80000, v1, s0
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v0, s0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v6, v4, v1, s0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -6446,12 +6438,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v4, v2, v0, s[0:1]
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v5, 0x7ff80000, v1, s[0:1]
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v1, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
@@ -6550,51 +6542,50 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v4, v2, v0, s0
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v5, 0x7ff80000, v1, s0
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v1, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[2:3], v[4:5], v[2:3]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v4, v2, 0, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v8
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -8238,51 +8229,52 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v9, 0x7ff80000
-; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v10, v8, v0, s[0:1]
-; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v11, v9, v1, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v10, 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v11, 0x7ff80000
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v9, v8, v1, s[0:1]
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v8, 0, v0, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v9 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
 ; GFX9-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v8 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v9 row_shr:2 row_mask:0xf bank_mask:0xf
 ; GFX9-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v8 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v9 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX9-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v8 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v9 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX9-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v8 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v9 row_bcast:15 row_mask:0xa bank_mask:0xf
 ; GFX9-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
+; GFX9-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[12:13]
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX9-DPP-NEXT:    v_max_f64 v[8:9], v[10:11], v[8:9]
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v10, v8 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v11, v9 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX9-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[10:11]
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -8380,9 +8372,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v9, 0x7ff80000
-; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1]
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v10, 0, v0, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v11, v9, v1, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v13, v9
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -8512,9 +8504,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v9, 0x7ff80000
-; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v10, 0, v0, s0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v11, v9, v1, s0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v13, v9
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -8626,12 +8618,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v9, 0x7ff80000
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1]
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v10, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v11, v9, v1, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
@@ -8760,51 +8752,50 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v9, 0x7ff80000
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v10, 0, v0, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v11, v9, v1, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[8:9], v[10:11], v[8:9]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v11, v9, 0, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, v8
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v9
 ; GFX1132-DPP-NEXT:    s_mov_b32 s44, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:

diff  --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index 945583c88ce267..ddc103184cdf35 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -718,10 +718,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT:    s_mov_b64 exec, -1
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v4, v3, v0, s[0:1]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
 ; GFX9-DPP-NEXT:    s_nop 1
 ; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -1756,10 +1753,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT:    s_mov_b64 exec, -1
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v4, v3, v0, s[0:1]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
 ; GFX9-DPP-NEXT:    s_nop 1
 ; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -2794,10 +2788,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT:    s_mov_b64 exec, -1
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v4, v3, v0, s[0:1]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
 ; GFX9-DPP-NEXT:    s_nop 1
 ; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -4742,51 +4733,52 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v9, 0x7ff80000
-; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v10, v8, v0, s[0:1]
-; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v11, v9, v1, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v10, 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v11, 0x7ff80000
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v9, v8, v1, s[0:1]
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v8, 0, v0, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v9 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
 ; GFX9-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v8 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v9 row_shr:2 row_mask:0xf bank_mask:0xf
 ; GFX9-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v8 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v9 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX9-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v8 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v9 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX9-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v8 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v9 row_bcast:15 row_mask:0xa bank_mask:0xf
 ; GFX9-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
+; GFX9-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[12:13]
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX9-DPP-NEXT:    v_min_f64 v[8:9], v[10:11], v[8:9]
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v10, v8 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v11, v9 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX9-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[10:11]
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -4884,9 +4876,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v9, 0x7ff80000
-; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1]
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v10, 0, v0, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v11, v9, v1, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v13, v9
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -5016,9 +5008,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v9, 0x7ff80000
-; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v10, 0, v0, s0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v11, v9, v1, s0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v13, v9
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -5130,12 +5122,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v9, 0x7ff80000
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1]
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v10, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v11, v9, v1, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
@@ -5264,51 +5256,50 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v9, 0x7ff80000
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v10, 0, v0, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v11, v9, v1, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_min_f64 v[8:9], v[10:11], v[8:9]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v11, v9, 0, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, v8
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v9
 ; GFX1132-DPP-NEXT:    s_mov_b32 s44, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -6188,51 +6179,52 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
-; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v0, s[0:1]
-; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v6, v4, v1, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v4, v3, v1, s[0:1]
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[3:4]
 ; GFX9-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
-; GFX9-DPP-NEXT:    v_min_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
 ; GFX9-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
-; GFX9-DPP-NEXT:    v_min_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX9-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
-; GFX9-DPP-NEXT:    v_min_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX9-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
-; GFX9-DPP-NEXT:    v_min_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
 ; GFX9-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
-; GFX9-DPP-NEXT:    v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], v[7:8]
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[3:4]
-; GFX9-DPP-NEXT:    v_min_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX9-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], v[5:6]
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -6298,9 +6290,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
-; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v6, 0x7ff80000, v1, s[0:1]
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v0, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v6, v4, v1, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -6381,9 +6373,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
-; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v0, s0
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v6, 0x7ff80000, v1, s0
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v0, s0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v6, v4, v1, s0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -6446,12 +6438,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v4, v2, v0, s[0:1]
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v5, 0x7ff80000, v1, s[0:1]
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v1, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
@@ -6550,51 +6542,50 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v4, v2, v0, s0
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v5, 0x7ff80000, v1, s0
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v1, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_min_f64 v[2:3], v[4:5], v[2:3]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v4, v2, 0, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v8
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -8238,51 +8229,52 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v9, 0x7ff80000
-; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v10, v8, v0, s[0:1]
-; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v11, v9, v1, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v10, 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v11, 0x7ff80000
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v9, v8, v1, s[0:1]
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v8, 0, v0, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v9 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
 ; GFX9-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v8 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v9 row_shr:2 row_mask:0xf bank_mask:0xf
 ; GFX9-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v8 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v9 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX9-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v8 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v9 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX9-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v8 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v9 row_bcast:15 row_mask:0xa bank_mask:0xf
 ; GFX9-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
+; GFX9-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[12:13]
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX9-DPP-NEXT:    v_min_f64 v[8:9], v[10:11], v[8:9]
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v10, v8 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v11, v9 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX9-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[10:11]
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -8380,9 +8372,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v9, 0x7ff80000
-; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1]
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v10, 0, v0, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v11, v9, v1, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v13, v9
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -8512,9 +8504,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v9, 0x7ff80000
-; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v10, 0, v0, s0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v11, v9, v1, s0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v13, v9
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -8626,12 +8618,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v9, 0x7ff80000
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1]
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v10, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v11, v9, v1, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
@@ -8760,51 +8752,50 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v9, 0x7ff80000
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v10, 0, v0, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v11, v9, v1, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_min_f64 v[8:9], v[10:11], v[8:9]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v11, v9, 0, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, v8
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v9
 ; GFX1132-DPP-NEXT:    s_mov_b32 s44, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:

diff  --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 3bc0f2546794d1..f353edff1b477a 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -894,10 +894,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT:    s_mov_b64 exec, -1
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v4, v3, v0, s[0:1]
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
 ; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -2217,10 +2214,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT:    s_mov_b64 exec, -1
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v4, v3, v0, s[0:1]
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
 ; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -3540,10 +3534,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT:    s_mov_b64 exec, -1
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v4, v3, v0, s[0:1]
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
 ; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -4359,10 +4350,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT:    s_mov_b64 exec, -1
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v4, v3, v0, s[0:1]
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
 ; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -5681,10 +5669,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT:    s_mov_b64 exec, -1
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v4, v3, v0, s[0:1]
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
 ; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -7650,44 +7635,45 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v9, 1
-; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v10, v8, v0, s[0:1]
-; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v11, v9, v1, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v10, 0
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v8, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v11, 1
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v9, v8, v1, s[0:1]
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v8, 0, v0, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v9 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v8 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v9 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v8 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v9 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v8 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v9 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[10:11], v[10:11], v[12:13]
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v8 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v9 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[12:13]
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[8:9], v[10:11], v[8:9]
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v10, v8 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v11, v9 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[10:11]
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -7783,9 +7769,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v9, 1
-; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1]
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v10, 0, v0, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v11, v9, v1, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v13, v9
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -7905,9 +7891,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v9, 1
-; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v11, 0x80000000, v1, s0
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v10, 0, v0, s0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v11, v9, v1, s0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v13, v9
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -8011,12 +7997,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v9, 1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1]
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v10, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v11, v9, v1, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -8133,42 +8119,42 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v9, 1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v11, 0x80000000, v1, s0
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v10, 0, v0, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v11, v9, v1, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[10:11], v[10:11], v[12:13]
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[10:11], v[10:11], v[12:13]
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[10:11], v[10:11], v[12:13]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[8:9], v[10:11], v[8:9]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v11, v9, 0, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v8
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v42, v9
 ; GFX1132-DPP-NEXT:    s_mov_b32 s44, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -9226,44 +9212,45 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v0, s[0:1]
-; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v6, v4, v1, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v6, 1
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v4, v3, v1, s[0:1]
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[7:8]
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[5:6]
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -9327,9 +9314,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v6, 0x80000000, v1, s[0:1]
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v0, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v6, v4, v1, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -9415,9 +9402,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v0, s0
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v6, 0x80000000, v1, s0
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v0, s0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v6, v4, v1, s0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -9487,12 +9474,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v4, v2, v0, s[0:1]
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v5, 0x80000000, v1, s[0:1]
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v1, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -9578,42 +9565,42 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v4, v2, v0, s0
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v5, 0x80000000, v1, s0
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v1, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v4, v2, 0, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v8, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v8
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB12_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -10649,44 +10636,45 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v0, s[0:1]
-; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v6, v4, v1, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v6, 1
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v4, v3, v1, s[0:1]
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[7:8]
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[5:6]
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -10750,9 +10738,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v6, 0x80000000, v1, s[0:1]
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v0, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v6, v4, v1, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -10838,9 +10826,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v0, s0
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v6, 0x80000000, v1, s0
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v0, s0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v6, v4, v1, s0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -10910,12 +10898,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v4, v2, v0, s[0:1]
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v5, 0x80000000, v1, s[0:1]
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v1, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -11001,42 +10989,42 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v4, v2, v0, s0
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v5, 0x80000000, v1, s0
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v1, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v4, v2, 0, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v8, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v8
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB14_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -11554,44 +11542,45 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v0, s[0:1]
-; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v6, v4, v1, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v6, 1
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v4, v3, v1, s[0:1]
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[7:8]
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[5:6]
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -11655,9 +11644,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v6, 0x80000000, v1, s[0:1]
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v0, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v6, v4, v1, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -11743,9 +11732,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v0, s0
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v6, 0x80000000, v1, s0
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v5, 0, v0, s0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v6, v4, v1, s0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -11815,12 +11804,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v4, v2, v0, s[0:1]
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v5, 0x80000000, v1, s[0:1]
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v1, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -11906,42 +11895,42 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v4, v2, v0, s0
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v5, 0x80000000, v1, s0
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v1, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v4, v2, 0, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v8, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v8
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB15_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -13623,44 +13612,45 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v9, 1
-; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v10, v8, v0, s[0:1]
-; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v11, v9, v1, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v10, 0
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v8, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v11, 1
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v9, v8, v1, s[0:1]
+; GFX9-DPP-NEXT:    v_cndmask_b32_e64 v8, 0, v0, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v9 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v8 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v9 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v8 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v9 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v8 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v9 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[10:11], v[10:11], v[12:13]
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v8 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v9 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[12:13]
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f64 v[8:9], v[10:11], v[8:9]
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v10, v8 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v11, v9 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[10:11]
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -13756,9 +13746,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v9, 1
-; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1]
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v10, 0, v0, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v11, v9, v1, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v13, v9
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -13878,9 +13868,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v9, 1
-; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v11, 0x80000000, v1, s0
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v10, 0, v0, s0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v11, v9, v1, s0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v13, v9
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -13984,12 +13974,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v9, 1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1]
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v10, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v11, v9, v1, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -14106,42 +14096,42 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v9, 1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v11, 0x80000000, v1, s0
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v10, 0, v0, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v11, v9, v1, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[10:11], v[10:11], v[12:13]
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[10:11], v[10:11], v[12:13]
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[10:11], v[10:11], v[12:13]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[8:9], v[10:11], v[8:9]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v11, v9, 0, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v8
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v42, v9
 ; GFX1132-DPP-NEXT:    s_mov_b32 s44, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB17_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
index c1b58f1795aaec..fbf8c203dcb390 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
@@ -93,18 +93,18 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6
 ; DAGISEL11:       ; %bb.0:
 ; DAGISEL11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL11-NEXT:    s_or_saveexec_b32 s0, -1
-; DAGISEL11-NEXT:    v_dual_mov_b32 v1, v11 :: v_dual_mov_b32 v0, v10
+; DAGISEL11-NEXT:    v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v10
 ; DAGISEL11-NEXT:    s_mov_b32 exec_lo, s0
 ; DAGISEL11-NEXT:    s_or_saveexec_b32 s0, -1
 ; DAGISEL11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; DAGISEL11-NEXT:    v_cndmask_b32_e64 v0, v0, v12, s0
-; DAGISEL11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; DAGISEL11-NEXT:    v_cndmask_b32_e64 v1, v1, v13, s0
+; DAGISEL11-NEXT:    v_cndmask_b32_e64 v2, v0, v13, s0
+; DAGISEL11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; DAGISEL11-NEXT:    v_cndmask_b32_e64 v1, v1, v12, s0
 ; DAGISEL11-NEXT:    s_mov_b32 exec_lo, s0
-; DAGISEL11-NEXT:    v_mov_b32_e32 v2, v0
-; DAGISEL11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; DAGISEL11-NEXT:    v_mov_b32_e32 v3, v1
-; DAGISEL11-NEXT:    global_store_b64 v[8:9], v[2:3], off
+; DAGISEL11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; DAGISEL11-NEXT:    v_mov_b32_e32 v4, v2
+; DAGISEL11-NEXT:    global_store_b64 v[8:9], v[3:4], off
 ; DAGISEL11-NEXT:    s_endpgm
 ;
 ; GISEL10-LABEL: set_inactive_chain_arg_64:
@@ -127,16 +127,16 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6
 ; DAGISEL10:       ; %bb.0:
 ; DAGISEL10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL10-NEXT:    s_or_saveexec_b32 s0, -1
-; DAGISEL10-NEXT:    v_mov_b32_e32 v1, v11
-; DAGISEL10-NEXT:    v_mov_b32_e32 v0, v10
+; DAGISEL10-NEXT:    v_mov_b32_e32 v0, v11
+; DAGISEL10-NEXT:    v_mov_b32_e32 v1, v10
 ; DAGISEL10-NEXT:    s_mov_b32 exec_lo, s0
 ; DAGISEL10-NEXT:    s_or_saveexec_b32 s0, -1
-; DAGISEL10-NEXT:    v_cndmask_b32_e64 v0, v0, v12, s0
-; DAGISEL10-NEXT:    v_cndmask_b32_e64 v1, v1, v13, s0
+; DAGISEL10-NEXT:    v_cndmask_b32_e64 v2, v0, v13, s0
+; DAGISEL10-NEXT:    v_cndmask_b32_e64 v1, v1, v12, s0
 ; DAGISEL10-NEXT:    s_mov_b32 exec_lo, s0
-; DAGISEL10-NEXT:    v_mov_b32_e32 v2, v0
 ; DAGISEL10-NEXT:    v_mov_b32_e32 v3, v1
-; DAGISEL10-NEXT:    global_store_dwordx2 v[8:9], v[2:3], off
+; DAGISEL10-NEXT:    v_mov_b32_e32 v4, v2
+; DAGISEL10-NEXT:    global_store_dwordx2 v[8:9], v[3:4], off
 ; DAGISEL10-NEXT:    s_endpgm
 ;
 ; GISEL11_W64-LABEL: set_inactive_chain_arg_64:
@@ -162,18 +162,19 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6
 ; DAGISEL11_W64:       ; %bb.0:
 ; DAGISEL11_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL11_W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v1, v11
-; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v0, v10
+; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v0, v11
+; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v1, v10
 ; DAGISEL11_W64-NEXT:    s_mov_b64 exec, s[0:1]
 ; DAGISEL11_W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; DAGISEL11_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; DAGISEL11_W64-NEXT:    v_cndmask_b32_e64 v0, v0, v12, s[0:1]
-; DAGISEL11_W64-NEXT:    v_cndmask_b32_e64 v1, v1, v13, s[0:1]
+; DAGISEL11_W64-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; DAGISEL11_W64-NEXT:    v_cndmask_b32_e64 v2, v0, v13, s[0:1]
+; DAGISEL11_W64-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; DAGISEL11_W64-NEXT:    v_cndmask_b32_e64 v1, v1, v12, s[0:1]
 ; DAGISEL11_W64-NEXT:    s_mov_b64 exec, s[0:1]
-; DAGISEL11_W64-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v2, v0
 ; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v3, v1
-; DAGISEL11_W64-NEXT:    global_store_b64 v[8:9], v[2:3], off
+; DAGISEL11_W64-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v4, v2
+; DAGISEL11_W64-NEXT:    global_store_b64 v[8:9], v[3:4], off
 ; DAGISEL11_W64-NEXT:    s_endpgm
 ;
 ; GISEL10_W64-LABEL: set_inactive_chain_arg_64:
@@ -196,16 +197,16 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6
 ; DAGISEL10_W64:       ; %bb.0:
 ; DAGISEL10_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL10_W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v1, v11
-; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v0, v10
+; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v0, v11
+; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v1, v10
 ; DAGISEL10_W64-NEXT:    s_mov_b64 exec, s[0:1]
 ; DAGISEL10_W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; DAGISEL10_W64-NEXT:    v_cndmask_b32_e64 v0, v0, v12, s[0:1]
-; DAGISEL10_W64-NEXT:    v_cndmask_b32_e64 v1, v1, v13, s[0:1]
+; DAGISEL10_W64-NEXT:    v_cndmask_b32_e64 v2, v0, v13, s[0:1]
+; DAGISEL10_W64-NEXT:    v_cndmask_b32_e64 v1, v1, v12, s[0:1]
 ; DAGISEL10_W64-NEXT:    s_mov_b64 exec, s[0:1]
-; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v2, v0
 ; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v3, v1
-; DAGISEL10_W64-NEXT:    global_store_dwordx2 v[8:9], v[2:3], off
+; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v4, v2
+; DAGISEL10_W64-NEXT:    global_store_dwordx2 v[8:9], v[3:4], off
 ; DAGISEL10_W64-NEXT:    s_endpgm
   %tmp = call i64 @llvm.amdgcn.set.inactive.chain.arg.i64(i64 %active, i64 %inactive) #0
   %wwm = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp)

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
index 6dc4a2ce0504b8..6fb5a9ce47a843 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
@@ -5,16 +5,14 @@
 define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) {
 ; GCN-LABEL: set_inactive:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    v_mov_b32_e32 v0, 42
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    s_mov_b64 exec, -1
+; GCN-NEXT:    v_mov_b32_e32 v1, s4
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 42, v1, s[4:5]
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
@@ -46,19 +44,19 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) {
 ; GCN-LABEL: set_inactive_64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NEXT:    s_mov_b64 exec, s[8:9]
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    s_mov_b64 exec, -1
-; GCN-NEXT:    s_mov_b64 exec, s[8:9]
+; GCN-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v1, 0, v2, s[0:1]
+; GCN-NEXT:    s_mov_b64 exec, s[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v2, s[0:1]
+; GCN-NEXT:    s_mov_b64 exec, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v2, v0
 ; GCN-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
@@ -96,11 +94,9 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_buffer_load_dword s4, s[4:7], 0x0
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT:    v_mov_b32_e32 v1, s8
 ; GCN-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GCN-NEXT:    v_mov_b32_e32 v0, 42
-; GCN-NEXT:    s_mov_b64 exec, s[2:3]
-; GCN-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-NEXT:    s_mov_b64 exec, -1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 42, v1, s[2:3]
 ; GCN-NEXT:    s_mov_b64 exec, s[2:3]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 56
@@ -145,17 +141,15 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
 define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
 ; GCN-LABEL: set_inactive_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    s_mov_b32 s7, 0x40400000
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    v_mov_b32_e32 v0, s7
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    s_mov_b64 exec, -1
+; GCN-NEXT:    v_mov_b32_e32 v1, s4
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x40400000
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
@@ -170,21 +164,21 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
 ; GCN-LABEL: set_inactive_f64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    s_mov_b32 s0, 0xcccccccd
-; GCN-NEXT:    s_mov_b32 s1, 0x4010cccc
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s6, -1
-; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:    s_mov_b64 exec, s[8:9]
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    s_mov_b64 exec, -1
-; GCN-NEXT:    s_mov_b64 exec, s[8:9]
+; GCN-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x4010cccc
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v2, s[0:1]
+; GCN-NEXT:    s_mov_b64 exec, s[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT:    v_mov_b32_e32 v0, 0xcccccccd
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GCN-NEXT:    s_mov_b64 exec, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v2, v0
 ; GCN-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
@@ -198,17 +192,15 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
 define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) {
 ; GCN-LABEL: set_inactive_v2i16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    s_mov_b32 s7, 0x10001
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    v_mov_b32_e32 v0, s7
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    s_mov_b64 exec, -1
+; GCN-NEXT:    v_mov_b32_e32 v1, s4
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x10001
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
@@ -222,17 +214,15 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %
 define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
 ; GCN-LABEL: set_inactive_v2f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    s_mov_b32 s7, 0x3c003c00
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    v_mov_b32_e32 v0, s7
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    s_mov_b64 exec, -1
+; GCN-NEXT:    v_mov_b32_e32 v1, s4
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x3c003c00
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
@@ -247,21 +237,19 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %
 ; GCN-LABEL: set_inactive_v2i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; GCN-NEXT:    s_mov_b32 s10, 1
-; GCN-NEXT:    s_mov_b32 s11, s10
-; GCN-NEXT:    v_mov_b32_e32 v0, s10
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    v_mov_b32_e32 v1, s11
-; GCN-NEXT:    s_mov_b64 exec, s[8:9]
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    s_mov_b64 exec, -1
-; GCN-NEXT:    s_mov_b64 exec, s[8:9]
+; GCN-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v1, 1, v2, s[0:1]
+; GCN-NEXT:    s_mov_b64 exec, s[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 1, v2, s[0:1]
+; GCN-NEXT:    s_mov_b64 exec, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v2, v0
 ; GCN-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
@@ -276,21 +264,19 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
 ; GCN-LABEL: set_inactive_v2f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; GCN-NEXT:    s_mov_b32 s10, 1.0
-; GCN-NEXT:    s_mov_b32 s11, s10
-; GCN-NEXT:    v_mov_b32_e32 v0, s10
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    v_mov_b32_e32 v1, s11
-; GCN-NEXT:    s_mov_b64 exec, s[8:9]
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    s_mov_b64 exec, -1
-; GCN-NEXT:    s_mov_b64 exec, s[8:9]
+; GCN-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v1, 1.0, v2, s[0:1]
+; GCN-NEXT:    s_mov_b64 exec, s[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 1.0, v2, s[0:1]
+; GCN-NEXT:    s_mov_b64 exec, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v2, v0
 ; GCN-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
@@ -304,17 +290,15 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
 define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) {
 ; GCN-LABEL: set_inactive_v2bf16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    s_mov_b32 s7, 0x3f803f80
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    v_mov_b32_e32 v0, s7
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    s_mov_b64 exec, -1
+; GCN-NEXT:    v_mov_b32_e32 v1, s4
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x3f803f80
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
@@ -329,21 +313,20 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %
 ; GCN-LABEL: set_inactive_v4i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; GCN-NEXT:    s_mov_b32 s10, 0x10001
-; GCN-NEXT:    s_mov_b32 s11, s10
-; GCN-NEXT:    v_mov_b32_e32 v0, s10
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    v_mov_b32_e32 v1, s11
-; GCN-NEXT:    s_mov_b64 exec, s[8:9]
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    s_mov_b64 exec, -1
-; GCN-NEXT:    s_mov_b64 exec, s[8:9]
+; GCN-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x10001
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v2, s[0:1]
+; GCN-NEXT:    s_mov_b64 exec, s[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GCN-NEXT:    s_mov_b64 exec, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v2, v0
 ; GCN-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
@@ -358,21 +341,20 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half>
 ; GCN-LABEL: set_inactive_v4f16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; GCN-NEXT:    s_mov_b32 s10, 0x3c003c00
-; GCN-NEXT:    s_mov_b32 s11, s10
-; GCN-NEXT:    v_mov_b32_e32 v0, s10
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    v_mov_b32_e32 v1, s11
-; GCN-NEXT:    s_mov_b64 exec, s[8:9]
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    s_mov_b64 exec, -1
-; GCN-NEXT:    s_mov_b64 exec, s[8:9]
+; GCN-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x3c003c00
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v2, s[0:1]
+; GCN-NEXT:    s_mov_b64 exec, s[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GCN-NEXT:    s_mov_b64 exec, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v2, v0
 ; GCN-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
@@ -387,21 +369,20 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa
 ; GCN-LABEL: set_inactive_v4bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; GCN-NEXT:    s_mov_b32 s10, 0x3f803f80
-; GCN-NEXT:    s_mov_b32 s11, s10
-; GCN-NEXT:    v_mov_b32_e32 v0, s10
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    v_mov_b32_e32 v1, s11
-; GCN-NEXT:    s_mov_b64 exec, s[8:9]
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    s_mov_b64 exec, -1
-; GCN-NEXT:    s_mov_b64 exec, s[8:9]
+; GCN-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x3f803f80
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v2, s[0:1]
+; GCN-NEXT:    s_mov_b64 exec, s[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GCN-NEXT:    s_mov_b64 exec, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v2, v0
 ; GCN-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
@@ -416,19 +397,19 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
 ; GCN-LABEL: set_inactive_p0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NEXT:    s_mov_b64 exec, s[8:9]
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    s_mov_b64 exec, -1
-; GCN-NEXT:    s_mov_b64 exec, s[8:9]
+; GCN-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v1, 0, v2, s[0:1]
+; GCN-NEXT:    s_mov_b64 exec, s[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v2, s[0:1]
+; GCN-NEXT:    s_mov_b64 exec, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v2, v0
 ; GCN-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
@@ -442,16 +423,14 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
 define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) {
 ; GCN-LABEL: set_inactive_p2:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    s_mov_b64 exec, -1
+; GCN-NEXT:    v_mov_b32_e32 v1, s4
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v1, s[4:5]
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
@@ -465,16 +444,14 @@ define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(
 define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) {
 ; GCN-LABEL: set_inactive_p3:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    s_mov_b64 exec, -1
+; GCN-NEXT:    v_mov_b32_e32 v1, s4
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v1, s[4:5]
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
@@ -488,16 +465,14 @@ define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(
 define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) {
 ; GCN-LABEL: set_inactive_p5:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    s_mov_b64 exec, -1
+; GCN-NEXT:    v_mov_b32_e32 v1, s4
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v1, s[4:5]
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
@@ -511,16 +486,14 @@ define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(
 define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) {
 ; GCN-LABEL: set_inactive_p6:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    s_mov_b64 exec, -1
+; GCN-NEXT:    v_mov_b32_e32 v1, s4
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v1, s[4:5]
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0

diff  --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir
index 8a5f75332557e6..b0fb24e60bead0 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir
@@ -111,7 +111,7 @@ body:             |
   ; GCN-NEXT:   $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr103, 3, $vgpr5
   ; GCN-NEXT:   $sgpr22 = IMPLICIT_DEF
   ; GCN-NEXT:   renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr22, 0, killed $vgpr2
-  ; GCN-NEXT:   dead $vgpr1 = V_SET_INACTIVE_B32 $vgpr0, 0, implicit $exec, implicit-def $scc
+  ; GCN-NEXT:   dead $vgpr1 = V_SET_INACTIVE_B32 0, $vgpr0, 0, 0, $sgpr_null, implicit $exec, implicit-def $scc
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: bb.1:
   ; GCN-NEXT:   successors: %bb.2(0x80000000)
@@ -212,7 +212,7 @@ body:             |
 
     $sgpr22 = IMPLICIT_DEF
     SI_SPILL_S32_SAVE $sgpr22, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.0, addrspace 5)
-    %0:vgpr_32 = V_SET_INACTIVE_B32 $vgpr0, 0, implicit $exec, implicit-def $scc
+    %0:vgpr_32 = V_SET_INACTIVE_B32 0, $vgpr0, 0, 0, $sgpr_null, implicit $exec, implicit-def $scc
 
   bb.1:
     KILL implicit-def $vcc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63, implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99_sgpr100_sgpr101_sgpr102_sgpr103

diff  --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index ff692acda3c255..92117e0688f65c 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -1708,8 +1708,8 @@ define amdgpu_kernel void @test_set_inactive_64(ptr addrspace(1) %out, i64 %in)
 ; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1032-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 0, s6, s0
 ; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, s7, s0
+; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 0, s6, s0
 ; GFX1032-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
@@ -1722,8 +1722,8 @@ define amdgpu_kernel void @test_set_inactive_64(ptr addrspace(1) %out, i64 %in)
 ; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 0, s6, s[0:1]
 ; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, s7, s[0:1]
+; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 0, s6, s[0:1]
 ; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1064-NEXT:    v_mov_b32_e32 v4, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/wqm.mir b/llvm/test/CodeGen/AMDGPU/wqm.mir
index 64a7c4457395c0..3013aabbd3bd42 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.mir
+++ b/llvm/test/CodeGen/AMDGPU/wqm.mir
@@ -40,9 +40,6 @@
   define amdgpu_vs void @no_wqm_in_vs() {
     ret void
   }
-  define amdgpu_ps void @preloaded_set_inactive() {
-    ret void
-  }
 ...
 ---
 
@@ -155,7 +152,7 @@ registers:
   - { id: 9, class: sreg_32, preferred-register: '' }
   - { id: 10, class: vgpr_32, preferred-register: '' }
   - { id: 11, class: vgpr_32, preferred-register: '' }
-  - { id: 12, class: sreg_32, preferred-register: '' }
+  - { id: 12, class: vgpr_32, preferred-register: '' }
   - { id: 13, class: vgpr_32, preferred-register: '' }
   - { id: 14, class: vgpr_32, preferred-register: '' }
   - { id: 15, class: vgpr_32, preferred-register: '' }
@@ -179,7 +176,8 @@ body:             |
     %8:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %6, %7, 0, 0, 0, implicit $exec
     %16:vgpr_32 = COPY %8.sub1
     %11:vgpr_32 = COPY %16
-    %10:vgpr_32 = V_SET_INACTIVE_B32 %11, undef %12:sreg_32, implicit $exec, implicit-def $scc
+    %17:sreg_64_xexec = IMPLICIT_DEF
+    %10:vgpr_32 = V_SET_INACTIVE_B32 0, %11, 0, undef %12, undef %17, implicit $exec, implicit-def $scc
     %14:vgpr_32 = COPY %7
     %13:vgpr_32 = V_MOV_B32_dpp %14, killed %10, 323, 12, 15, 0, implicit $exec
     early-clobber %15:vgpr_32 = STRICT_WWM killed %13, implicit $exec
@@ -298,8 +296,9 @@ body:             |
     %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
     %1:vgpr_32 = COPY $vgpr0
     %2:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN %1:vgpr_32, %0:sgpr_128, 0, 0, 0, 0, implicit $exec
-    %2.sub0:vreg_64 = V_SET_INACTIVE_B32 %2.sub0:vreg_64, 0, implicit $exec, implicit-def $scc
-    %2.sub1:vreg_64 = V_SET_INACTIVE_B32 %2.sub1:vreg_64, 0, implicit $exec, implicit-def $scc
+    %4:sreg_64_xexec = IMPLICIT_DEF
+    %2.sub0:vreg_64 = V_SET_INACTIVE_B32 0, %2.sub0:vreg_64, 0, 0, undef %4, implicit $exec, implicit-def $scc
+    %2.sub1:vreg_64 = V_SET_INACTIVE_B32 0, %2.sub1:vreg_64, 0, 0, undef %4, implicit $exec, implicit-def $scc
     %3:vreg_64 = nnan nsz arcp contract reassoc nofpexcept V_MAX_F64_e64 0, %2:vreg_64, 0, %2:vreg_64, 0, 0, implicit $mode, implicit $exec
     $vgpr0 = STRICT_WWM %3.sub0:vreg_64, implicit $exec
     $vgpr1 = STRICT_WWM %3.sub1:vreg_64, implicit $exec
@@ -446,19 +445,3 @@ body:             |
 
     %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
 ...
-
----
-# Preserve V_SET_INACTIVE with exec mask already specified
-#CHECK-LABEL: name: preloaded_set_inactive
-#CHECK: V_SET_INACTIVE_B32
-name:            preloaded_set_inactive
-tracksRegLiveness: true
-body:             |
-  bb.0:
-    liveins: $vgpr1, $vgpr2
-
-    %0:vgpr_32 = COPY $vgpr1
-    %1:vgpr_32 = COPY $vgpr2
-    %mask:sreg_64 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-    %value:vgpr_32 = V_SET_INACTIVE_B32 %0:vgpr_32, %1:vgpr_32, implicit $exec, implicit-def $scc, implicit %mask:sreg_64
-...

diff  --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index 47e1897f6b420a..b35ef6497f14ac 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -30,15 +30,15 @@ define amdgpu_gfx void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) {
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v6
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr40_sgpr41
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[40:41], -1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s34
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[40:41]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[40:41]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
-; GFX9-O0-NEXT:    s_mov_b64 exec, -1
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr40_sgpr41
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[40:41], -1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s34
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[40:41]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT:    s_mov_b64 exec, -1
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[40:41]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s34
 ; GFX9-O0-NEXT:    s_nop 1
 ; GFX9-O0-NEXT:    v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -170,11 +170,10 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr36_sgpr37
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr36_sgpr37
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s34
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v4
-; GFX9-O0-NEXT:    s_mov_b64 exec, -1
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[36:37]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s34
 ; GFX9-O0-NEXT:    s_nop 1
 ; GFX9-O0-NEXT:    v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -201,6 +200,9 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr34_sgpr35
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[34:35]
 ; GFX9-O0-NEXT:    s_nop 1
 ; GFX9-O0-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -352,32 +354,32 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
 ; GFX9-O0-NEXT:    s_mov_b32 s33, s32
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    s_add_i32 s32, s32, 0x400
 ; GFX9-O0-NEXT:    v_writelane_b32 v3, s30, 0
 ; GFX9-O0-NEXT:    v_writelane_b32 v3, s31, 1
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[40:41], -1
-; GFX9-O0-NEXT:    s_mov_b32 s42, s6
+; GFX9-O0-NEXT:    s_mov_b32 s40, s6
 ; GFX9-O0-NEXT:    s_mov_b32 s34, s4
-; GFX9-O0-NEXT:    ; kill: def $sgpr42 killed $sgpr42 def $sgpr42_sgpr43
-; GFX9-O0-NEXT:    s_mov_b32 s43, s7
-; GFX9-O0-NEXT:    s_mov_b32 s44, s43
-; GFX9-O0-NEXT:    s_mov_b32 s45, s42
+; GFX9-O0-NEXT:    ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41
+; GFX9-O0-NEXT:    s_mov_b32 s41, s7
+; GFX9-O0-NEXT:    s_mov_b32 s42, s41
+; GFX9-O0-NEXT:    s_mov_b32 s43, s40
 ; GFX9-O0-NEXT:    ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35
 ; GFX9-O0-NEXT:    s_mov_b32 s35, s5
-; GFX9-O0-NEXT:    s_mov_b32 s46, s35
+; GFX9-O0-NEXT:    s_mov_b32 s44, s35
 ; GFX9-O0-NEXT:    s_mov_b32 s36, s34
 ; GFX9-O0-NEXT:    ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39
-; GFX9-O0-NEXT:    s_mov_b32 s37, s46
-; GFX9-O0-NEXT:    s_mov_b32 s38, s45
-; GFX9-O0-NEXT:    s_mov_b32 s39, s44
+; GFX9-O0-NEXT:    s_mov_b32 s37, s44
+; GFX9-O0-NEXT:    s_mov_b32 s38, s43
+; GFX9-O0-NEXT:    s_mov_b32 s39, s42
 ; GFX9-O0-NEXT:    s_mov_b32 s34, 0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s34
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[40:41]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s8
-; GFX9-O0-NEXT:    s_mov_b64 exec, -1
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr40_sgpr41
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[40:41], -1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s34
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[40:41]
 ; GFX9-O0-NEXT:    s_getpc_b64 s[42:43]
 ; GFX9-O0-NEXT:    s_add_u32 s42, s42, strict_wwm_called at rel32@lo+4
 ; GFX9-O0-NEXT:    s_addc_u32 s43, s43, strict_wwm_called at rel32@hi+12
@@ -396,8 +398,8 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
 ; GFX9-O0-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    s_add_i32 s32, s32, 0xfffffc00
 ; GFX9-O0-NEXT:    s_mov_b32 s33, s48
@@ -417,11 +419,9 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
 ; GFX9-O3-NEXT:    v_writelane_b32 v3, s30, 0
 ; GFX9-O3-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-O3-NEXT:    v_writelane_b32 v3, s31, 1
+; GFX9-O3-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v2, s8
-; GFX9-O3-NEXT:    s_mov_b64 exec, -1
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[34:35]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-O3-NEXT:    s_getpc_b64 s[36:37]
 ; GFX9-O3-NEXT:    s_add_u32 s36, s36, strict_wwm_called at rel32@lo+4
@@ -539,20 +539,20 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O0-LABEL: strict_wwm_call_i64:
 ; GFX9-O0:       ; %bb.0:
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-O0-NEXT:    s_mov_b32 s50, s33
+; GFX9-O0-NEXT:    s_mov_b32 s48, s33
 ; GFX9-O0-NEXT:    s_mov_b32 s33, s32
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
@@ -560,48 +560,59 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    s_add_i32 s32, s32, 0x1000
-; GFX9-O0-NEXT:    ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
+; GFX9-O0-NEXT:    ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
 ; GFX9-O0-NEXT:    v_writelane_b32 v10, s30, 0
 ; GFX9-O0-NEXT:    v_writelane_b32 v10, s31, 1
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[38:39], -1
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s38, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s39, 1
 ; GFX9-O0-NEXT:    s_mov_b32 s34, s8
-; GFX9-O0-NEXT:    s_mov_b32 s40, s6
+; GFX9-O0-NEXT:    s_mov_b32 s38, s6
 ; GFX9-O0-NEXT:    s_mov_b32 s36, s4
-; GFX9-O0-NEXT:    ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41
-; GFX9-O0-NEXT:    s_mov_b32 s41, s7
-; GFX9-O0-NEXT:    s_mov_b32 s35, s41
-; GFX9-O0-NEXT:    s_mov_b32 s42, s40
+; GFX9-O0-NEXT:    ; kill: def $sgpr38 killed $sgpr38 def $sgpr38_sgpr39
+; GFX9-O0-NEXT:    s_mov_b32 s39, s7
+; GFX9-O0-NEXT:    s_mov_b32 s35, s39
+; GFX9-O0-NEXT:    s_mov_b32 s44, s38
 ; GFX9-O0-NEXT:    ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37
 ; GFX9-O0-NEXT:    s_mov_b32 s37, s5
-; GFX9-O0-NEXT:    s_mov_b32 s43, s37
-; GFX9-O0-NEXT:    s_mov_b32 s44, s36
-; GFX9-O0-NEXT:    ; kill: def $sgpr44 killed $sgpr44 def $sgpr44_sgpr45_sgpr46_sgpr47
-; GFX9-O0-NEXT:    s_mov_b32 s45, s43
-; GFX9-O0-NEXT:    s_mov_b32 s46, s42
-; GFX9-O0-NEXT:    s_mov_b32 s47, s35
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s44, 2
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s45, 3
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s46, 4
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s47, 5
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[48:49], -1
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[48:49]
+; GFX9-O0-NEXT:    s_mov_b32 s45, s37
+; GFX9-O0-NEXT:    s_mov_b32 s40, s36
+; GFX9-O0-NEXT:    ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41_sgpr42_sgpr43
+; GFX9-O0-NEXT:    s_mov_b32 s41, s45
+; GFX9-O0-NEXT:    s_mov_b32 s42, s44
+; GFX9-O0-NEXT:    s_mov_b32 s43, s35
+; GFX9-O0-NEXT:    v_writelane_b32 v1, s40, 0
+; GFX9-O0-NEXT:    v_writelane_b32 v1, s41, 1
+; GFX9-O0-NEXT:    v_writelane_b32 v1, s42, 2
+; GFX9-O0-NEXT:    v_writelane_b32 v1, s43, 3
 ; GFX9-O0-NEXT:    ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35
 ; GFX9-O0-NEXT:    s_mov_b32 s35, s9
 ; GFX9-O0-NEXT:    ; kill: def $sgpr36_sgpr37 killed $sgpr34_sgpr35
+; GFX9-O0-NEXT:    s_mov_b32 s38, s35
 ; GFX9-O0-NEXT:    s_mov_b64 s[36:37], 0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, s36
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s37
+; GFX9-O0-NEXT:    s_mov_b32 s40, s37
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr42_sgpr43
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s38
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[38:39], -1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s40
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s[38:39]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[38:39]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, s34
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s35
-; GFX9-O0-NEXT:    s_mov_b64 exec, -1
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v8
+; GFX9-O0-NEXT:    ; kill: def $sgpr34 killed $sgpr34 killed $sgpr34_sgpr35
+; GFX9-O0-NEXT:    ; kill: def $sgpr36 killed $sgpr36 killed $sgpr36_sgpr37
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr38_sgpr39
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s34
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; GFX9-O0-NEXT:    v_writelane_b32 v1, s34, 4
+; GFX9-O0-NEXT:    v_writelane_b32 v1, s35, 5
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[46:47], -1
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[46:47]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, s36
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v3, v3, v0, s[34:35]
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v2
 ; GFX9-O0-NEXT:    s_mov_b32 s34, 32
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr36_sgpr37
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[3:4], s34, v[8:9]
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[4:5], s34, v[8:9]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX9-O0-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-O0-NEXT:    s_add_u32 s34, s34, strict_wwm_called_i64 at gotpcrel32@lo+4
 ; GFX9-O0-NEXT:    s_addc_u32 s35, s35, strict_wwm_called_i64 at gotpcrel32@hi+12
@@ -610,24 +621,24 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O0-NEXT:    s_mov_b64 s[36:37], s[0:1]
 ; GFX9-O0-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-O0-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX9-O0-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[48:49], -1
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[46:47], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[48:49]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[46:47]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s34, v6, 0
-; GFX9-O0-NEXT:    v_readlane_b32 s35, v6, 1
-; GFX9-O0-NEXT:    v_readlane_b32 s36, v6, 2
-; GFX9-O0-NEXT:    v_readlane_b32 s37, v6, 3
-; GFX9-O0-NEXT:    v_readlane_b32 s38, v6, 4
-; GFX9-O0-NEXT:    v_readlane_b32 s39, v6, 5
+; GFX9-O0-NEXT:    v_readlane_b32 s34, v6, 4
+; GFX9-O0-NEXT:    v_readlane_b32 s35, v6, 5
+; GFX9-O0-NEXT:    v_readlane_b32 s36, v6, 0
+; GFX9-O0-NEXT:    v_readlane_b32 s37, v6, 1
+; GFX9-O0-NEXT:    v_readlane_b32 s38, v6, 2
+; GFX9-O0-NEXT:    v_readlane_b32 s39, v6, 3
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v0
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[48:49], -1
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[46:47], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[48:49]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[46:47]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr40
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr40
@@ -645,21 +656,21 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O0-NEXT:    ; kill: killed $vgpr0
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    s_add_i32 s32, s32, 0xfffff000
-; GFX9-O0-NEXT:    s_mov_b32 s33, s50
+; GFX9-O0-NEXT:    s_mov_b32 s33, s48
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -671,7 +682,6 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_store_dword v8, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O3-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
@@ -682,25 +692,25 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O3-NEXT:    v_writelane_b32 v8, s30, 0
 ; GFX9-O3-NEXT:    s_addk_i32 s32, 0x800
 ; GFX9-O3-NEXT:    v_writelane_b32 v8, s31, 1
+; GFX9-O3-NEXT:    v_mov_b32_e32 v0, s9
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v6, 0, v0, s[34:35]
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-O3-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-O3-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-O3-NEXT:    s_add_u32 s34, s34, strict_wwm_called_i64 at gotpcrel32@lo+4
 ; GFX9-O3-NEXT:    s_addc_u32 s35, s35, strict_wwm_called_i64 at gotpcrel32@hi+12
 ; GFX9-O3-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
-; GFX9-O3-NEXT:    v_mov_b32_e32 v6, 0
-; GFX9-O3-NEXT:    v_mov_b32_e32 v7, 0
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v6, s8
-; GFX9-O3-NEXT:    v_mov_b32_e32 v7, s9
-; GFX9-O3-NEXT:    s_mov_b64 exec, -1
-; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v6
-; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v7
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v7, 0, v0, s[36:37]
+; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v7
+; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX9-O3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v1
-; GFX9-O3-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v6
-; GFX9-O3-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v7, vcc
+; GFX9-O3-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v7
+; GFX9-O3-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v6, vcc
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v3
@@ -735,8 +745,10 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    s_mov_b32 s36, s4
 ; GFX9-O0-NEXT:    ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39
@@ -747,73 +759,82 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
 ; GFX9-O0-NEXT:    s_mov_b32 s34, 5
 ; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v0, s34, v0
 ; GFX9-O0-NEXT:    s_mov_b32 s34, 0
-; GFX9-O0-NEXT:    buffer_load_dwordx4 v[10:13], v0, s[36:39], s34 offen
-; GFX9-O0-NEXT:    buffer_load_dwordx2 v[3:4], v0, s[36:39], s34 offen offset:16
+; GFX9-O0-NEXT:    buffer_load_dwordx4 v[11:14], v0, s[36:39], s34 offen
+; GFX9-O0-NEXT:    buffer_load_dwordx2 v[4:5], v0, s[36:39], s34 offen offset:16
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v11
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v10
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr35
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr35
-; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v11
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v12
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[40:41], -1
 ; GFX9-O0-NEXT:    s_mov_b32 s35, 0x7fffffff
-; GFX9-O0-NEXT:    s_mov_b32 s42, -1
-; GFX9-O0-NEXT:    ; kill: def $sgpr42 killed $sgpr42 def $sgpr42_sgpr43
-; GFX9-O0-NEXT:    s_mov_b32 s43, s35
+; GFX9-O0-NEXT:    s_mov_b32 s44, -1
+; GFX9-O0-NEXT:    ; kill: def $sgpr44 killed $sgpr44 def $sgpr44_sgpr45
+; GFX9-O0-NEXT:    s_mov_b32 s45, s35
+; GFX9-O0-NEXT:    s_mov_b32 s42, s45
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr46_sgpr47
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s42
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s43
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[40:41]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v6
-; GFX9-O0-NEXT:    s_mov_b64 exec, -1
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[40:41]
+; GFX9-O0-NEXT:    s_mov_b32 s35, s44
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr44_sgpr45
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s35
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[40:41]
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr43
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr43
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[40:41]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v13
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v12
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr35
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr35
-; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v13
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v14
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[40:41], -1
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr44_sgpr45
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s42
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s43
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[40:41]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v6
-; GFX9-O0-NEXT:    s_mov_b64 exec, -1
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[40:41]
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr44_sgpr45
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s35
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[40:41]
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr43
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr43
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[40:41]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v3
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v5
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[40:41], -1
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr44_sgpr45
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s42
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s43
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[40:41]
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v4
-; GFX9-O0-NEXT:    s_mov_b64 exec, -1
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[40:41]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[40:41]
+; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[40:41], -1
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr42_sgpr43
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s35
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[40:41]
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr35
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr35
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[40:41]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v7
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr35
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr35
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr35
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr35
-; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v11
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v10
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v9
-; GFX9-O0-NEXT:    buffer_store_dwordx4 v[5:8], v0, s[36:39], s34 offen
-; GFX9-O0-NEXT:    buffer_store_dwordx2 v[3:4], v0, s[36:39], s34 offen offset:16
+; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7_vgpr8_vgpr9 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v12
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v11
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v10
+; GFX9-O0-NEXT:    buffer_store_dwordx4 v[6:9], v0, s[36:39], s34 offen
+; GFX9-O0-NEXT:    buffer_store_dwordx2 v[4:5], v0, s[36:39], s34 offen offset:16
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
@@ -823,50 +844,38 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O3-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-O3-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O3-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-O3-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O3-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O3-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
-; GFX9-O3-NEXT:    buffer_load_dwordx4 v[7:10], v0, s[4:7], 0 offen
-; GFX9-O3-NEXT:    buffer_load_dwordx2 v[11:12], v0, s[4:7], 0 offen offset:16
+; GFX9-O3-NEXT:    buffer_load_dwordx4 v[8:11], v0, s[4:7], 0 offen
+; GFX9-O3-NEXT:    buffer_load_dwordx2 v[12:13], v0, s[4:7], 0 offen offset:16
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT:    s_mov_b32 s36, -1
-; GFX9-O3-NEXT:    s_brev_b32 s37, -2
-; GFX9-O3-NEXT:    v_mov_b32_e32 v1, s36
-; GFX9-O3-NEXT:    v_mov_b32_e32 v2, s37
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-O3-NEXT:    v_bfrev_b32_e32 v1, -2
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v7
-; GFX9-O3-NEXT:    v_mov_b32_e32 v2, v8
-; GFX9-O3-NEXT:    s_mov_b64 exec, -1
-; GFX9-O3-NEXT:    v_mov_b32_e32 v3, s36
-; GFX9-O3-NEXT:    v_mov_b32_e32 v4, s37
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v9
-; GFX9-O3-NEXT:    v_mov_b32_e32 v4, v10
-; GFX9-O3-NEXT:    s_mov_b64 exec, -1
-; GFX9-O3-NEXT:    v_mov_b32_e32 v5, s36
-; GFX9-O3-NEXT:    v_mov_b32_e32 v6, s37
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v3, v1, v9, s[34:35]
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v2, -1, v8, s[34:35]
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v5, v1, v11, s[34:35]
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v4, -1, v10, s[34:35]
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT:    v_mov_b32_e32 v5, v11
-; GFX9-O3-NEXT:    v_mov_b32_e32 v6, v12
-; GFX9-O3-NEXT:    s_mov_b64 exec, -1
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v7, v1, v13, s[34:35]
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v6, -1, v12, s[34:35]
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v7, v1
-; GFX9-O3-NEXT:    v_mov_b32_e32 v9, v3
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v8, v2
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v10, v4
+; GFX9-O3-NEXT:    v_mov_b32_e32 v9, v3
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v11, v5
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v12, v6
-; GFX9-O3-NEXT:    buffer_store_dwordx4 v[7:10], v0, s[4:7], 0 offen
-; GFX9-O3-NEXT:    buffer_store_dwordx2 v[11:12], v0, s[4:7], 0 offen offset:16
+; GFX9-O3-NEXT:    v_mov_b32_e32 v13, v7
+; GFX9-O3-NEXT:    buffer_store_dwordx4 v[8:11], v0, s[4:7], 0 offen
+; GFX9-O3-NEXT:    buffer_store_dwordx2 v[12:13], v0, s[4:7], 0 offen offset:16
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -874,6 +883,7 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
 ; GFX9-O3-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O3-NEXT:    s_setpc_b64 s[30:31]
@@ -908,9 +918,11 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
 ; GFX9-O0:       ; %bb.0:
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, -1
 ; GFX9-O0-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
@@ -961,110 +973,113 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v34, s5
+; GFX9-O0-NEXT:    v_mov_b32_e32 v35, s5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s8
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s10
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s11
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s12
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s13
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s14
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s15
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s16
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s17
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v39, s17
-; GFX9-O0-NEXT:    v_mov_b32_e32 v38, s18
-; GFX9-O0-NEXT:    v_mov_b32_e32 v37, s19
-; GFX9-O0-NEXT:    v_mov_b32_e32 v36, s20
-; GFX9-O0-NEXT:    v_mov_b32_e32 v35, s21
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s22
+; GFX9-O0-NEXT:    v_mov_b32_e32 v40, s18
+; GFX9-O0-NEXT:    v_mov_b32_e32 v39, s19
+; GFX9-O0-NEXT:    v_mov_b32_e32 v38, s20
+; GFX9-O0-NEXT:    v_mov_b32_e32 v37, s21
+; GFX9-O0-NEXT:    v_mov_b32_e32 v36, s22
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s23
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v46, s23
-; GFX9-O0-NEXT:    v_mov_b32_e32 v45, s24
-; GFX9-O0-NEXT:    v_mov_b32_e32 v44, s25
-; GFX9-O0-NEXT:    v_mov_b32_e32 v43, s26
-; GFX9-O0-NEXT:    v_mov_b32_e32 v42, s27
-; GFX9-O0-NEXT:    v_mov_b32_e32 v41, s28
-; GFX9-O0-NEXT:    v_mov_b32_e32 v40, s29
+; GFX9-O0-NEXT:    v_mov_b32_e32 v46, s24
+; GFX9-O0-NEXT:    v_mov_b32_e32 v45, s25
+; GFX9-O0-NEXT:    v_mov_b32_e32 v44, s26
+; GFX9-O0-NEXT:    v_mov_b32_e32 v43, s27
+; GFX9-O0-NEXT:    v_mov_b32_e32 v42, s28
+; GFX9-O0-NEXT:    v_mov_b32_e32 v41, s29
 ; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v34
-; GFX9-O0-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v35
+; GFX9-O0-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v34
-; GFX9-O0-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v35
+; GFX9-O0-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v34
-; GFX9-O0-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v35
+; GFX9-O0-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v34
-; GFX9-O0-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v35
+; GFX9-O0-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v34
-; GFX9-O0-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v35
+; GFX9-O0-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v34
-; GFX9-O0-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v35
+; GFX9-O0-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v34
-; GFX9-O0-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v35
+; GFX9-O0-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v34
-; GFX9-O0-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v35
+; GFX9-O0-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v34
-; GFX9-O0-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v35
+; GFX9-O0-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v34
-; GFX9-O0-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v35
+; GFX9-O0-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v34
-; GFX9-O0-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v35
+; GFX9-O0-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v34
-; GFX9-O0-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v39
-; GFX9-O0-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v38
-; GFX9-O0-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v37
-; GFX9-O0-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v36
-; GFX9-O0-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v35
-; GFX9-O0-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v35
+; GFX9-O0-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v35
+; GFX9-O0-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v40
+; GFX9-O0-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v39
+; GFX9-O0-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v38
+; GFX9-O0-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v37
+; GFX9-O0-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v18, v36
+; GFX9-O0-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(5)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v18, v34
-; GFX9-O0-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    v_mov_b32_e32 v19, v46
-; GFX9-O0-NEXT:    v_mov_b32_e32 v20, v45
-; GFX9-O0-NEXT:    v_mov_b32_e32 v21, v44
-; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v43
-; GFX9-O0-NEXT:    v_mov_b32_e32 v23, v42
-; GFX9-O0-NEXT:    v_mov_b32_e32 v24, v41
-; GFX9-O0-NEXT:    v_mov_b32_e32 v25, v40
+; GFX9-O0-NEXT:    v_mov_b32_e32 v19, v35
+; GFX9-O0-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v20, v46
+; GFX9-O0-NEXT:    v_mov_b32_e32 v21, v45
+; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v44
+; GFX9-O0-NEXT:    v_mov_b32_e32 v23, v43
+; GFX9-O0-NEXT:    v_mov_b32_e32 v24, v42
+; GFX9-O0-NEXT:    v_mov_b32_e32 v25, v41
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(5)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v26, v39
+; GFX9-O0-NEXT:    v_mov_b32_e32 v26, v40
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v27, v38
+; GFX9-O0-NEXT:    v_mov_b32_e32 v27, v39
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(3)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v28, v37
+; GFX9-O0-NEXT:    v_mov_b32_e32 v28, v38
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v29, v36
+; GFX9-O0-NEXT:    v_mov_b32_e32 v29, v37
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v30, v35
-; GFX9-O0-NEXT:    ; kill: def $vgpr31 killed $vgpr34 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v30, v36
+; GFX9-O0-NEXT:    ; kill: def $vgpr31 killed $vgpr35 killed $exec
 ; GFX9-O0-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
@@ -1104,63 +1119,100 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
 ; GFX9-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v0
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr36_sgpr37
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr36_sgpr37
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr36_sgpr37
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr36_sgpr37
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr36_sgpr37
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr34_sgpr35
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr34_sgpr35
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr34_sgpr35
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr34_sgpr35
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr34_sgpr35
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67
-; GFX9-O0-NEXT:    s_mov_b64 s[36:37], 0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v32, s36
-; GFX9-O0-NEXT:    v_mov_b32_e32 v33, s37
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v11
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; GFX9-O0-NEXT:    s_mov_b64 s[38:39], 0
+; GFX9-O0-NEXT:    s_mov_b32 s37, s39
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr40_sgpr41
+; GFX9-O0-NEXT:    v_mov_b32_e32 v32, s37
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v32, v32, v0, s[34:35]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v32, v10
-; GFX9-O0-NEXT:    v_mov_b32_e32 v33, v11
-; GFX9-O0-NEXT:    s_mov_b64 exec, -1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v10
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; GFX9-O0-NEXT:    s_mov_b32 s36, s38
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr38_sgpr39
+; GFX9-O0-NEXT:    v_mov_b32_e32 v33, s36
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v33, v33, v0, s[34:35]
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr38
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr38
+; GFX9-O0-NEXT:    v_mov_b32_e32 v34, v32
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v32
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v33
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v33
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v34
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v9
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT:    v_mov_b32_e32 v32, s36
-; GFX9-O0-NEXT:    v_mov_b32_e32 v33, s37
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr38_sgpr39
+; GFX9-O0-NEXT:    v_mov_b32_e32 v32, s37
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v32, v32, v0, s[34:35]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v32, v8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v33, v9
-; GFX9-O0-NEXT:    s_mov_b64 exec, -1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v8
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr38_sgpr39
+; GFX9-O0-NEXT:    v_mov_b32_e32 v33, s36
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v33, v33, v0, s[34:35]
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr38
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr38
+; GFX9-O0-NEXT:    v_mov_b32_e32 v34, v32
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v32
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v33
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v33
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v34
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v7
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT:    v_mov_b32_e32 v32, s36
-; GFX9-O0-NEXT:    v_mov_b32_e32 v33, s37
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr38_sgpr39
+; GFX9-O0-NEXT:    v_mov_b32_e32 v32, s37
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v32, v32, v0, s[34:35]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v32, v6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v33, v7
-; GFX9-O0-NEXT:    s_mov_b64 exec, -1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v6
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr38_sgpr39
+; GFX9-O0-NEXT:    v_mov_b32_e32 v33, s36
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v33, v33, v0, s[34:35]
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr38
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr38
+; GFX9-O0-NEXT:    v_mov_b32_e32 v34, v32
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v32
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v33
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v33
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v34
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT:    v_mov_b32_e32 v32, s36
-; GFX9-O0-NEXT:    v_mov_b32_e32 v33, s37
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr38_sgpr39
+; GFX9-O0-NEXT:    v_mov_b32_e32 v32, s37
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v32, v32, v0, s[34:35]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v32, v4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v33, v5
-; GFX9-O0-NEXT:    s_mov_b64 exec, -1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr38_sgpr39
+; GFX9-O0-NEXT:    v_mov_b32_e32 v33, s36
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v33, v33, v0, s[34:35]
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr38
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr38
+; GFX9-O0-NEXT:    v_mov_b32_e32 v34, v32
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v32
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v33
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v33
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v34
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT:    v_mov_b32_e32 v32, s36
-; GFX9-O0-NEXT:    v_mov_b32_e32 v33, s37
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr38_sgpr39
+; GFX9-O0-NEXT:    v_mov_b32_e32 v32, s37
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v32, v32, v0, s[34:35]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v32, v2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v33, v3
-; GFX9-O0-NEXT:    s_mov_b64 exec, -1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr38_sgpr39
+; GFX9-O0-NEXT:    v_mov_b32_e32 v33, s36
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v33, v33, v0, s[34:35]
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr36
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr36
+; GFX9-O0-NEXT:    v_mov_b32_e32 v34, v32
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v32
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v33
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v33
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v34
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v11
 ; GFX9-O0-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:4
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v10
@@ -1219,8 +1271,10 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
 ; GFX9-O0-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, -1
 ; GFX9-O0-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
@@ -1242,32 +1296,32 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
 ; GFX9-O3-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:16
 ; GFX9-O3-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:20
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT:    v_cndmask_b32_e64 v32, 0, v1, s[34:35]
 ; GFX9-O3-NEXT:    v_cndmask_b32_e64 v33, 0, v2, s[34:35]
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v32, 0, v1, s[34:35]
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v32
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v2, v33
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT:    v_cndmask_b32_e64 v32, 0, v3, s[34:35]
 ; GFX9-O3-NEXT:    v_cndmask_b32_e64 v33, 0, v4, s[34:35]
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v32, 0, v3, s[34:35]
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v32
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v4, v33
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT:    v_cndmask_b32_e64 v32, 0, v5, s[34:35]
 ; GFX9-O3-NEXT:    v_cndmask_b32_e64 v33, 0, v6, s[34:35]
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v32, 0, v5, s[34:35]
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v5, v32
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v6, v33
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT:    v_cndmask_b32_e64 v32, 0, v7, s[34:35]
 ; GFX9-O3-NEXT:    v_cndmask_b32_e64 v33, 0, v8, s[34:35]
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v32, 0, v7, s[34:35]
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v7, v32
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v8, v33
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT:    v_cndmask_b32_e64 v32, 0, v9, s[34:35]
 ; GFX9-O3-NEXT:    v_cndmask_b32_e64 v33, 0, v10, s[34:35]
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v32, 0, v9, s[34:35]
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v9, v32
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v10, v33

diff  --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
index 7f0db3e362b308..7fecab068396e0 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
@@ -26,15 +26,15 @@ define amdgpu_cs void @no_cfg(ptr addrspace(8) inreg %tmp14) {
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v6
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[6:7]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
-; GFX9-O0-NEXT:    s_mov_b64 exec, -1
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT:    s_mov_b64 exec, -1
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[6:7]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-O0-NEXT:    s_nop 1
 ; GFX9-O0-NEXT:    v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -147,11 +147,10 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr2_sgpr3
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr2_sgpr3
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[2:3], -1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v4
-; GFX9-O0-NEXT:    s_mov_b64 exec, -1
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[2:3]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-O0-NEXT:    s_nop 1
 ; GFX9-O0-NEXT:    v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -178,6 +177,9 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr0_sgpr1
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[0:1]
 ; GFX9-O0-NEXT:    s_nop 1
 ; GFX9-O0-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -336,40 +338,40 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
 ; GFX9-O0-NEXT:    v_readlane_b32 s1, v7, 3
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v1
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v0
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v0
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX9-O0-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x24
-; GFX9-O0-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x2c
+; GFX9-O0-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
+; GFX9-O0-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x2c
 ; GFX9-O0-NEXT:    s_load_dword s2, s[0:1], 0x34
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s6, 4
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s7, 5
 ; GFX9-O0-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-O0-NEXT:    s_mov_b32 s3, s9
-; GFX9-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
-; GFX9-O0-NEXT:    s_mov_b32 s9, s17
-; GFX9-O0-NEXT:    ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17
+; GFX9-O0-NEXT:    s_mov_b32 s3, s7
+; GFX9-O0-NEXT:    ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7
+; GFX9-O0-NEXT:    s_mov_b32 s7, s9
+; GFX9-O0-NEXT:    s_mov_b32 s16, s8
 ; GFX9-O0-NEXT:    ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19
-; GFX9-O0-NEXT:    s_mov_b32 s17, s9
-; GFX9-O0-NEXT:    s_mov_b32 s18, s8
+; GFX9-O0-NEXT:    s_mov_b32 s17, s7
+; GFX9-O0-NEXT:    s_mov_b32 s18, s6
 ; GFX9-O0-NEXT:    s_mov_b32 s19, s3
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s16, 6
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s17, 7
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s18, 8
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s19, 9
-; GFX9-O0-NEXT:    s_mov_b32 s3, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s3, 10
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    v_writelane_b32 v1, s16, 4
+; GFX9-O0-NEXT:    v_writelane_b32 v1, s17, 5
+; GFX9-O0-NEXT:    v_writelane_b32 v1, s18, 6
+; GFX9-O0-NEXT:    v_writelane_b32 v1, s19, 7
+; GFX9-O0-NEXT:    s_mov_b32 s6, 0
+; GFX9-O0-NEXT:    v_writelane_b32 v1, s6, 8
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr8_sgpr9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9-O0-NEXT:    v_writelane_b32 v1, s2, 9
+; GFX9-O0-NEXT:    v_writelane_b32 v1, s3, 10
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[24:27], 0 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, s3
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, s2
-; GFX9-O0-NEXT:    s_mov_b64 exec, -1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, s6
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v0, s[2:3]
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 56
 ; GFX9-O0-NEXT:    s_mov_b32 s2, s0
 ; GFX9-O0-NEXT:    s_mov_b32 s0, s1
@@ -398,13 +400,13 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s0, v1, 6
-; GFX9-O0-NEXT:    v_readlane_b32 s1, v1, 7
-; GFX9-O0-NEXT:    v_readlane_b32 s2, v1, 8
-; GFX9-O0-NEXT:    v_readlane_b32 s3, v1, 9
-; GFX9-O0-NEXT:    v_readlane_b32 s6, v1, 4
-; GFX9-O0-NEXT:    v_readlane_b32 s7, v1, 5
-; GFX9-O0-NEXT:    v_readlane_b32 s4, v1, 10
+; GFX9-O0-NEXT:    v_readlane_b32 s0, v1, 4
+; GFX9-O0-NEXT:    v_readlane_b32 s1, v1, 5
+; GFX9-O0-NEXT:    v_readlane_b32 s2, v1, 6
+; GFX9-O0-NEXT:    v_readlane_b32 s3, v1, 7
+; GFX9-O0-NEXT:    v_readlane_b32 s6, v1, 9
+; GFX9-O0-NEXT:    v_readlane_b32 s7, v1, 10
+; GFX9-O0-NEXT:    v_readlane_b32 s4, v1, 8
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
@@ -434,15 +436,13 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX9-O3-NEXT:    s_load_dword s4, s[2:3], 0x34
 ; GFX9-O3-NEXT:    s_load_dwordx4 s[16:19], s[2:3], 0x24
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[20:21], -1
-; GFX9-O3-NEXT:    v_mov_b32_e32 v6, 0
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[20:21]
 ; GFX9-O3-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-O3-NEXT:    v_mov_b32_e32 v6, s4
-; GFX9-O3-NEXT:    s_mov_b64 exec, -1
+; GFX9-O3-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[20:21], -1
 ; GFX9-O3-NEXT:    s_add_u32 s8, s2, 56
 ; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v3, 20, v3
 ; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v4, 10, v4
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v6, 0, v0, s[20:21]
 ; GFX9-O3-NEXT:    s_addc_u32 s9, s3, 0
 ; GFX9-O3-NEXT:    v_or3_b32 v3, v5, v4, v3
 ; GFX9-O3-NEXT:    s_mov_b64 s[4:5], s[0:1]
@@ -587,41 +587,56 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
 ; GFX9-O0-NEXT:    v_readlane_b32 s1, v12, 3
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v1
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v0
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v0
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX9-O0-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x24
-; GFX9-O0-NEXT:    s_load_dwordx2 s[18:19], s[0:1], 0x2c
+; GFX9-O0-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x2c
 ; GFX9-O0-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s8, 4
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s9, 5
 ; GFX9-O0-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-O0-NEXT:    s_mov_b32 s6, s19
-; GFX9-O0-NEXT:    s_mov_b32 s7, s18
-; GFX9-O0-NEXT:    s_mov_b32 s15, s17
+; GFX9-O0-NEXT:    s_mov_b32 s6, s9
+; GFX9-O0-NEXT:    s_mov_b32 s7, s8
+; GFX9-O0-NEXT:    s_mov_b32 s8, s17
 ; GFX9-O0-NEXT:    ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17
 ; GFX9-O0-NEXT:    ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19
-; GFX9-O0-NEXT:    s_mov_b32 s17, s15
+; GFX9-O0-NEXT:    s_mov_b32 s17, s8
 ; GFX9-O0-NEXT:    s_mov_b32 s18, s7
 ; GFX9-O0-NEXT:    s_mov_b32 s19, s6
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s16, 6
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s17, 7
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s18, 8
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s19, 9
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    v_writelane_b32 v1, s16, 4
+; GFX9-O0-NEXT:    v_writelane_b32 v1, s17, 5
+; GFX9-O0-NEXT:    v_writelane_b32 v1, s18, 6
+; GFX9-O0-NEXT:    v_writelane_b32 v1, s19, 7
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, s6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s7
+; GFX9-O0-NEXT:    s_mov_b32 s15, s7
+; GFX9-O0-NEXT:    s_mov_b32 s8, s3
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr16_sgpr17
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, s15
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v0, s[8:9]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[8:9]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, s2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s3
-; GFX9-O0-NEXT:    s_mov_b64 exec, -1
+; GFX9-O0-NEXT:    ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7
+; GFX9-O0-NEXT:    ; kill: def $sgpr2 killed $sgpr2 killed $sgpr2_sgpr3
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr8_sgpr9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9-O0-NEXT:    v_writelane_b32 v1, s2, 8
+; GFX9-O0-NEXT:    v_writelane_b32 v1, s3, 9
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[24:27], 0 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, s6
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v7, v7, v0, s[2:3]
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v6
+; GFX9-O0-NEXT:    s_mov_b32 s2, 32
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[10:11], s2, v[8:9]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v10
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 60
 ; GFX9-O0-NEXT:    s_mov_b32 s2, s0
 ; GFX9-O0-NEXT:    s_mov_b32 s0, s1
@@ -631,11 +646,6 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
 ; GFX9-O0-NEXT:    s_addc_u32 s0, s0, s1
 ; GFX9-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
 ; GFX9-O0-NEXT:    s_mov_b32 s9, s0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v8
-; GFX9-O0-NEXT:    s_mov_b32 s0, 32
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr2_sgpr3
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[10:11], s0, v[8:9]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v10
 ; GFX9-O0-NEXT:    s_getpc_b64 s[0:1]
 ; GFX9-O0-NEXT:    s_add_u32 s0, s0, called_i64 at gotpcrel32@lo+4
 ; GFX9-O0-NEXT:    s_addc_u32 s1, s1, called_i64 at gotpcrel32@hi+12
@@ -650,20 +660,20 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr15
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v31, v3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX9-O0-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[24:27], 0 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s0, v2, 6
-; GFX9-O0-NEXT:    v_readlane_b32 s1, v2, 7
-; GFX9-O0-NEXT:    v_readlane_b32 s2, v2, 8
-; GFX9-O0-NEXT:    v_readlane_b32 s3, v2, 9
-; GFX9-O0-NEXT:    v_readlane_b32 s4, v2, 4
-; GFX9-O0-NEXT:    v_readlane_b32 s5, v2, 5
+; GFX9-O0-NEXT:    v_readlane_b32 s0, v2, 4
+; GFX9-O0-NEXT:    v_readlane_b32 s1, v2, 5
+; GFX9-O0-NEXT:    v_readlane_b32 s2, v2, 6
+; GFX9-O0-NEXT:    v_readlane_b32 s3, v2, 7
+; GFX9-O0-NEXT:    v_readlane_b32 s4, v2, 8
+; GFX9-O0-NEXT:    v_readlane_b32 s5, v2, 9
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
@@ -701,14 +711,13 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX9-O3-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x34
 ; GFX9-O3-NEXT:    s_load_dwordx4 s[16:19], s[2:3], 0x24
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[20:21], -1
-; GFX9-O3-NEXT:    v_mov_b32_e32 v6, 0
-; GFX9-O3-NEXT:    v_mov_b32_e32 v7, 0
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[20:21]
 ; GFX9-O3-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-O3-NEXT:    v_mov_b32_e32 v6, s4
-; GFX9-O3-NEXT:    v_mov_b32_e32 v7, s5
-; GFX9-O3-NEXT:    s_mov_b64 exec, -1
+; GFX9-O3-NEXT:    v_mov_b32_e32 v0, s5
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v6, 0, v0, s[8:9]
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[8:9]
+; GFX9-O3-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[20:21], -1
 ; GFX9-O3-NEXT:    s_add_u32 s8, s2, 60
 ; GFX9-O3-NEXT:    s_addc_u32 s9, s3, 0
 ; GFX9-O3-NEXT:    s_getpc_b64 s[2:3]
@@ -717,6 +726,7 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
 ; GFX9-O3-NEXT:    s_load_dwordx2 s[22:23], s[2:3], 0x0
 ; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v3, 20, v3
 ; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v4, 10, v4
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v7, 0, v0, s[20:21]
 ; GFX9-O3-NEXT:    v_or3_b32 v3, v5, v4, v3
 ; GFX9-O3-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX9-O3-NEXT:    s_mov_b64 s[0:1], s[24:25]
@@ -724,14 +734,14 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
 ; GFX9-O3-NEXT:    s_mov_b32 s13, s7
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v31, v3
 ; GFX9-O3-NEXT:    s_mov_b64 s[2:3], s[26:27]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v6
-; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v7
+; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v7
+; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX9-O3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[22:23]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v4, v1
-; GFX9-O3-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
-; GFX9-O3-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v7, vcc
+; GFX9-O3-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
+; GFX9-O3-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[20:21]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v4
@@ -763,109 +773,103 @@ define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) {
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 5
 ; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v0, s4, v0
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 0
-; GFX9-O0-NEXT:    buffer_load_dwordx4 v[10:13], v0, s[0:3], s4 offen
-; GFX9-O0-NEXT:    buffer_load_dwordx2 v[3:4], v0, s[0:3], s4 offen offset:16
+; GFX9-O0-NEXT:    buffer_load_dwordx4 v[11:14], v0, s[0:3], s4 offen
+; GFX9-O0-NEXT:    buffer_load_dwordx2 v[4:5], v0, s[0:3], s4 offen offset:16
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v11
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v10
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
-; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v11
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v12
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX9-O0-NEXT:    s_mov_b32 s5, 0x7fffffff
-; GFX9-O0-NEXT:    s_mov_b32 s8, -1
-; GFX9-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
-; GFX9-O0-NEXT:    s_mov_b32 s9, s5
+; GFX9-O0-NEXT:    s_mov_b32 s10, -1
+; GFX9-O0-NEXT:    ; kill: def $sgpr10 killed $sgpr10 def $sgpr10_sgpr11
+; GFX9-O0-NEXT:    s_mov_b32 s11, s5
+; GFX9-O0-NEXT:    s_mov_b32 s8, s11
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr12_sgpr13
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s9
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v6
-; GFX9-O0-NEXT:    s_mov_b64 exec, -1
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[6:7]
+; GFX9-O0-NEXT:    s_mov_b32 s5, s10
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr10_sgpr11
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[6:7]
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr9
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v13
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v12
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
-; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v13
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v14
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr10_sgpr11
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s9
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v6
-; GFX9-O0-NEXT:    s_mov_b64 exec, -1
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[6:7]
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr10_sgpr11
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[6:7]
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr9
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v3
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v5
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr10_sgpr11
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s9
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v4
-; GFX9-O0-NEXT:    s_mov_b64 exec, -1
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[6:7]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr8_sgpr9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[6:7]
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v7
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
-; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v11
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v10
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v9
-; GFX9-O0-NEXT:    buffer_store_dwordx4 v[5:8], v0, s[0:3], s4 offen
-; GFX9-O0-NEXT:    buffer_store_dwordx2 v[3:4], v0, s[0:3], s4 offen offset:16
+; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7_vgpr8_vgpr9 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v12
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v11
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v10
+; GFX9-O0-NEXT:    buffer_store_dwordx4 v[6:9], v0, s[0:3], s4 offen
+; GFX9-O0-NEXT:    buffer_store_dwordx2 v[4:5], v0, s[0:3], s4 offen offset:16
 ; GFX9-O0-NEXT:    s_endpgm
 ;
 ; GFX9-O3-LABEL: _amdgpu_cs_main:
 ; GFX9-O3:       ; %bb.0:
 ; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
-; GFX9-O3-NEXT:    buffer_load_dwordx4 v[7:10], v0, s[0:3], 0 offen
-; GFX9-O3-NEXT:    buffer_load_dwordx2 v[11:12], v0, s[0:3], 0 offen offset:16
+; GFX9-O3-NEXT:    buffer_load_dwordx4 v[8:11], v0, s[0:3], 0 offen
+; GFX9-O3-NEXT:    buffer_load_dwordx2 v[12:13], v0, s[0:3], 0 offen offset:16
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX9-O3-NEXT:    s_mov_b32 s6, -1
-; GFX9-O3-NEXT:    s_brev_b32 s7, -2
-; GFX9-O3-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-O3-NEXT:    v_mov_b32_e32 v2, s7
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-O3-NEXT:    v_bfrev_b32_e32 v1, -2
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v7
-; GFX9-O3-NEXT:    v_mov_b32_e32 v2, v8
-; GFX9-O3-NEXT:    s_mov_b64 exec, -1
-; GFX9-O3-NEXT:    v_mov_b32_e32 v3, s6
-; GFX9-O3-NEXT:    v_mov_b32_e32 v4, s7
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v9
-; GFX9-O3-NEXT:    v_mov_b32_e32 v4, v10
-; GFX9-O3-NEXT:    s_mov_b64 exec, -1
-; GFX9-O3-NEXT:    v_mov_b32_e32 v5, s6
-; GFX9-O3-NEXT:    v_mov_b32_e32 v6, s7
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v3, v1, v9, s[4:5]
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v2, -1, v8, s[4:5]
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v5, v1, v11, s[4:5]
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v4, -1, v10, s[4:5]
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT:    v_mov_b32_e32 v5, v11
-; GFX9-O3-NEXT:    v_mov_b32_e32 v6, v12
-; GFX9-O3-NEXT:    s_mov_b64 exec, -1
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v7, v1, v13, s[4:5]
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v6, -1, v12, s[4:5]
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v7, v1
-; GFX9-O3-NEXT:    v_mov_b32_e32 v9, v3
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v8, v2
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v10, v4
+; GFX9-O3-NEXT:    v_mov_b32_e32 v9, v3
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v11, v5
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v12, v6
-; GFX9-O3-NEXT:    buffer_store_dwordx4 v[7:10], v0, s[0:3], 0 offen
-; GFX9-O3-NEXT:    buffer_store_dwordx2 v[11:12], v0, s[0:3], 0 offen offset:16
+; GFX9-O3-NEXT:    v_mov_b32_e32 v13, v7
+; GFX9-O3-NEXT:    buffer_store_dwordx4 v[8:11], v0, s[0:3], 0 offen
+; GFX9-O3-NEXT:    buffer_store_dwordx2 v[12:13], v0, s[0:3], 0 offen offset:16
 ; GFX9-O3-NEXT:    s_endpgm
   %tmp17 = shl i32 %index, 5
   %tmp18 = tail call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %tmp17, i32 0)
@@ -915,15 +919,15 @@ define amdgpu_cs void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) {
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v6
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[6:7]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
-; GFX9-O0-NEXT:    s_mov_b64 exec, -1
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT:    s_mov_b64 exec, -1
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[6:7]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-O0-NEXT:    s_nop 1
 ; GFX9-O0-NEXT:    v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -1036,11 +1040,10 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr2_sgpr3
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr2_sgpr3
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[2:3], -1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v4
-; GFX9-O0-NEXT:    s_mov_b64 exec, -1
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[2:3]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-O0-NEXT:    s_nop 1
 ; GFX9-O0-NEXT:    v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -1067,6 +1070,9 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr0_sgpr1
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[0:1]
 ; GFX9-O0-NEXT:    s_nop 1
 ; GFX9-O0-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -1225,40 +1231,40 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in
 ; GFX9-O0-NEXT:    v_readlane_b32 s1, v7, 3
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v1
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v0
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v0
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX9-O0-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x24
-; GFX9-O0-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x2c
+; GFX9-O0-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
+; GFX9-O0-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x2c
 ; GFX9-O0-NEXT:    s_load_dword s2, s[0:1], 0x34
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s6, 4
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s7, 5
 ; GFX9-O0-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-O0-NEXT:    s_mov_b32 s3, s9
-; GFX9-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
-; GFX9-O0-NEXT:    s_mov_b32 s9, s17
-; GFX9-O0-NEXT:    ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17
+; GFX9-O0-NEXT:    s_mov_b32 s3, s7
+; GFX9-O0-NEXT:    ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7
+; GFX9-O0-NEXT:    s_mov_b32 s7, s9
+; GFX9-O0-NEXT:    s_mov_b32 s16, s8
 ; GFX9-O0-NEXT:    ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19
-; GFX9-O0-NEXT:    s_mov_b32 s17, s9
-; GFX9-O0-NEXT:    s_mov_b32 s18, s8
+; GFX9-O0-NEXT:    s_mov_b32 s17, s7
+; GFX9-O0-NEXT:    s_mov_b32 s18, s6
 ; GFX9-O0-NEXT:    s_mov_b32 s19, s3
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s16, 6
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s17, 7
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s18, 8
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s19, 9
-; GFX9-O0-NEXT:    s_mov_b32 s3, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s3, 10
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    v_writelane_b32 v1, s16, 4
+; GFX9-O0-NEXT:    v_writelane_b32 v1, s17, 5
+; GFX9-O0-NEXT:    v_writelane_b32 v1, s18, 6
+; GFX9-O0-NEXT:    v_writelane_b32 v1, s19, 7
+; GFX9-O0-NEXT:    s_mov_b32 s6, 0
+; GFX9-O0-NEXT:    v_writelane_b32 v1, s6, 8
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr8_sgpr9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9-O0-NEXT:    v_writelane_b32 v1, s2, 9
+; GFX9-O0-NEXT:    v_writelane_b32 v1, s3, 10
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[24:27], 0 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, s3
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, s2
-; GFX9-O0-NEXT:    s_mov_b64 exec, -1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, s6
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v0, s[2:3]
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 56
 ; GFX9-O0-NEXT:    s_mov_b32 s2, s0
 ; GFX9-O0-NEXT:    s_mov_b32 s0, s1
@@ -1287,13 +1293,13 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s0, v1, 6
-; GFX9-O0-NEXT:    v_readlane_b32 s1, v1, 7
-; GFX9-O0-NEXT:    v_readlane_b32 s2, v1, 8
-; GFX9-O0-NEXT:    v_readlane_b32 s3, v1, 9
-; GFX9-O0-NEXT:    v_readlane_b32 s6, v1, 4
-; GFX9-O0-NEXT:    v_readlane_b32 s7, v1, 5
-; GFX9-O0-NEXT:    v_readlane_b32 s4, v1, 10
+; GFX9-O0-NEXT:    v_readlane_b32 s0, v1, 4
+; GFX9-O0-NEXT:    v_readlane_b32 s1, v1, 5
+; GFX9-O0-NEXT:    v_readlane_b32 s2, v1, 6
+; GFX9-O0-NEXT:    v_readlane_b32 s3, v1, 7
+; GFX9-O0-NEXT:    v_readlane_b32 s6, v1, 9
+; GFX9-O0-NEXT:    v_readlane_b32 s7, v1, 10
+; GFX9-O0-NEXT:    v_readlane_b32 s4, v1, 8
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
@@ -1323,15 +1329,13 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX9-O3-NEXT:    s_load_dword s4, s[2:3], 0x34
 ; GFX9-O3-NEXT:    s_load_dwordx4 s[16:19], s[2:3], 0x24
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[20:21], -1
-; GFX9-O3-NEXT:    v_mov_b32_e32 v6, 0
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[20:21]
 ; GFX9-O3-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-O3-NEXT:    v_mov_b32_e32 v6, s4
-; GFX9-O3-NEXT:    s_mov_b64 exec, -1
+; GFX9-O3-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[20:21], -1
 ; GFX9-O3-NEXT:    s_add_u32 s8, s2, 56
 ; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v3, 20, v3
 ; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v4, 10, v4
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v6, 0, v0, s[20:21]
 ; GFX9-O3-NEXT:    s_addc_u32 s9, s3, 0
 ; GFX9-O3-NEXT:    v_or3_b32 v3, v5, v4, v3
 ; GFX9-O3-NEXT:    s_mov_b64 s[4:5], s[0:1]
@@ -1476,41 +1480,56 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
 ; GFX9-O0-NEXT:    v_readlane_b32 s1, v12, 3
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v1
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v0
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v0
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX9-O0-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x24
-; GFX9-O0-NEXT:    s_load_dwordx2 s[18:19], s[0:1], 0x2c
+; GFX9-O0-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x2c
 ; GFX9-O0-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s8, 4
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s9, 5
 ; GFX9-O0-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-O0-NEXT:    s_mov_b32 s6, s19
-; GFX9-O0-NEXT:    s_mov_b32 s7, s18
-; GFX9-O0-NEXT:    s_mov_b32 s15, s17
+; GFX9-O0-NEXT:    s_mov_b32 s6, s9
+; GFX9-O0-NEXT:    s_mov_b32 s7, s8
+; GFX9-O0-NEXT:    s_mov_b32 s8, s17
 ; GFX9-O0-NEXT:    ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17
 ; GFX9-O0-NEXT:    ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19
-; GFX9-O0-NEXT:    s_mov_b32 s17, s15
+; GFX9-O0-NEXT:    s_mov_b32 s17, s8
 ; GFX9-O0-NEXT:    s_mov_b32 s18, s7
 ; GFX9-O0-NEXT:    s_mov_b32 s19, s6
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s16, 6
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s17, 7
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s18, 8
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s19, 9
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    v_writelane_b32 v1, s16, 4
+; GFX9-O0-NEXT:    v_writelane_b32 v1, s17, 5
+; GFX9-O0-NEXT:    v_writelane_b32 v1, s18, 6
+; GFX9-O0-NEXT:    v_writelane_b32 v1, s19, 7
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, s6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s7
+; GFX9-O0-NEXT:    s_mov_b32 s15, s7
+; GFX9-O0-NEXT:    s_mov_b32 s8, s3
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr16_sgpr17
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, s15
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v0, s[8:9]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[8:9]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, s2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s3
-; GFX9-O0-NEXT:    s_mov_b64 exec, -1
+; GFX9-O0-NEXT:    ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7
+; GFX9-O0-NEXT:    ; kill: def $sgpr2 killed $sgpr2 killed $sgpr2_sgpr3
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr8_sgpr9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9-O0-NEXT:    v_writelane_b32 v1, s2, 8
+; GFX9-O0-NEXT:    v_writelane_b32 v1, s3, 9
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[24:27], 0 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, s6
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v7, v7, v0, s[2:3]
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v6
+; GFX9-O0-NEXT:    s_mov_b32 s2, 32
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[10:11], s2, v[8:9]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v10
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 60
 ; GFX9-O0-NEXT:    s_mov_b32 s2, s0
 ; GFX9-O0-NEXT:    s_mov_b32 s0, s1
@@ -1520,11 +1539,6 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
 ; GFX9-O0-NEXT:    s_addc_u32 s0, s0, s1
 ; GFX9-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
 ; GFX9-O0-NEXT:    s_mov_b32 s9, s0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v8
-; GFX9-O0-NEXT:    s_mov_b32 s0, 32
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr2_sgpr3
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[10:11], s0, v[8:9]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v10
 ; GFX9-O0-NEXT:    s_getpc_b64 s[0:1]
 ; GFX9-O0-NEXT:    s_add_u32 s0, s0, strict_wwm_called_i64 at gotpcrel32@lo+4
 ; GFX9-O0-NEXT:    s_addc_u32 s1, s1, strict_wwm_called_i64 at gotpcrel32@hi+12
@@ -1539,20 +1553,20 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr15
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v31, v3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX9-O0-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[24:27], 0 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s0, v2, 6
-; GFX9-O0-NEXT:    v_readlane_b32 s1, v2, 7
-; GFX9-O0-NEXT:    v_readlane_b32 s2, v2, 8
-; GFX9-O0-NEXT:    v_readlane_b32 s3, v2, 9
-; GFX9-O0-NEXT:    v_readlane_b32 s4, v2, 4
-; GFX9-O0-NEXT:    v_readlane_b32 s5, v2, 5
+; GFX9-O0-NEXT:    v_readlane_b32 s0, v2, 4
+; GFX9-O0-NEXT:    v_readlane_b32 s1, v2, 5
+; GFX9-O0-NEXT:    v_readlane_b32 s2, v2, 6
+; GFX9-O0-NEXT:    v_readlane_b32 s3, v2, 7
+; GFX9-O0-NEXT:    v_readlane_b32 s4, v2, 8
+; GFX9-O0-NEXT:    v_readlane_b32 s5, v2, 9
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
@@ -1590,14 +1604,13 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX9-O3-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x34
 ; GFX9-O3-NEXT:    s_load_dwordx4 s[16:19], s[2:3], 0x24
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[20:21], -1
-; GFX9-O3-NEXT:    v_mov_b32_e32 v6, 0
-; GFX9-O3-NEXT:    v_mov_b32_e32 v7, 0
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[20:21]
 ; GFX9-O3-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-O3-NEXT:    v_mov_b32_e32 v6, s4
-; GFX9-O3-NEXT:    v_mov_b32_e32 v7, s5
-; GFX9-O3-NEXT:    s_mov_b64 exec, -1
+; GFX9-O3-NEXT:    v_mov_b32_e32 v0, s5
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v6, 0, v0, s[8:9]
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[8:9]
+; GFX9-O3-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[20:21], -1
 ; GFX9-O3-NEXT:    s_add_u32 s8, s2, 60
 ; GFX9-O3-NEXT:    s_addc_u32 s9, s3, 0
 ; GFX9-O3-NEXT:    s_getpc_b64 s[2:3]
@@ -1606,6 +1619,7 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
 ; GFX9-O3-NEXT:    s_load_dwordx2 s[22:23], s[2:3], 0x0
 ; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v3, 20, v3
 ; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v4, 10, v4
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v7, 0, v0, s[20:21]
 ; GFX9-O3-NEXT:    v_or3_b32 v3, v5, v4, v3
 ; GFX9-O3-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX9-O3-NEXT:    s_mov_b64 s[0:1], s[24:25]
@@ -1613,14 +1627,14 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
 ; GFX9-O3-NEXT:    s_mov_b32 s13, s7
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v31, v3
 ; GFX9-O3-NEXT:    s_mov_b64 s[2:3], s[26:27]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v6
-; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v7
+; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v7
+; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX9-O3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[22:23]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v4, v1
-; GFX9-O3-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
-; GFX9-O3-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v7, vcc
+; GFX9-O3-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
+; GFX9-O3-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[20:21]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v4
@@ -1652,109 +1666,103 @@ define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %ind
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 5
 ; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v0, s4, v0
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 0
-; GFX9-O0-NEXT:    buffer_load_dwordx4 v[10:13], v0, s[0:3], s4 offen
-; GFX9-O0-NEXT:    buffer_load_dwordx2 v[3:4], v0, s[0:3], s4 offen offset:16
+; GFX9-O0-NEXT:    buffer_load_dwordx4 v[11:14], v0, s[0:3], s4 offen
+; GFX9-O0-NEXT:    buffer_load_dwordx2 v[4:5], v0, s[0:3], s4 offen offset:16
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v11
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v10
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
-; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v11
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v12
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX9-O0-NEXT:    s_mov_b32 s5, 0x7fffffff
-; GFX9-O0-NEXT:    s_mov_b32 s8, -1
-; GFX9-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
-; GFX9-O0-NEXT:    s_mov_b32 s9, s5
+; GFX9-O0-NEXT:    s_mov_b32 s10, -1
+; GFX9-O0-NEXT:    ; kill: def $sgpr10 killed $sgpr10 def $sgpr10_sgpr11
+; GFX9-O0-NEXT:    s_mov_b32 s11, s5
+; GFX9-O0-NEXT:    s_mov_b32 s8, s11
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr12_sgpr13
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s9
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v6
-; GFX9-O0-NEXT:    s_mov_b64 exec, -1
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[6:7]
+; GFX9-O0-NEXT:    s_mov_b32 s5, s10
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr10_sgpr11
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[6:7]
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr9
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v13
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v12
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
-; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v13
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v14
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr10_sgpr11
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s9
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v6
-; GFX9-O0-NEXT:    s_mov_b64 exec, -1
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[6:7]
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr10_sgpr11
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[6:7]
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr9
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v3
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v5
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr10_sgpr11
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s9
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v4
-; GFX9-O0-NEXT:    s_mov_b64 exec, -1
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[6:7]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr8_sgpr9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[6:7]
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v7
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
-; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v11
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v10
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v9
-; GFX9-O0-NEXT:    buffer_store_dwordx4 v[5:8], v0, s[0:3], s4 offen
-; GFX9-O0-NEXT:    buffer_store_dwordx2 v[3:4], v0, s[0:3], s4 offen offset:16
+; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7_vgpr8_vgpr9 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v12
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v11
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v10
+; GFX9-O0-NEXT:    buffer_store_dwordx4 v[6:9], v0, s[0:3], s4 offen
+; GFX9-O0-NEXT:    buffer_store_dwordx2 v[4:5], v0, s[0:3], s4 offen offset:16
 ; GFX9-O0-NEXT:    s_endpgm
 ;
 ; GFX9-O3-LABEL: strict_wwm_amdgpu_cs_main:
 ; GFX9-O3:       ; %bb.0:
 ; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
-; GFX9-O3-NEXT:    buffer_load_dwordx4 v[7:10], v0, s[0:3], 0 offen
-; GFX9-O3-NEXT:    buffer_load_dwordx2 v[11:12], v0, s[0:3], 0 offen offset:16
+; GFX9-O3-NEXT:    buffer_load_dwordx4 v[8:11], v0, s[0:3], 0 offen
+; GFX9-O3-NEXT:    buffer_load_dwordx2 v[12:13], v0, s[0:3], 0 offen offset:16
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX9-O3-NEXT:    s_mov_b32 s6, -1
-; GFX9-O3-NEXT:    s_brev_b32 s7, -2
-; GFX9-O3-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-O3-NEXT:    v_mov_b32_e32 v2, s7
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-O3-NEXT:    v_bfrev_b32_e32 v1, -2
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v7
-; GFX9-O3-NEXT:    v_mov_b32_e32 v2, v8
-; GFX9-O3-NEXT:    s_mov_b64 exec, -1
-; GFX9-O3-NEXT:    v_mov_b32_e32 v3, s6
-; GFX9-O3-NEXT:    v_mov_b32_e32 v4, s7
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v9
-; GFX9-O3-NEXT:    v_mov_b32_e32 v4, v10
-; GFX9-O3-NEXT:    s_mov_b64 exec, -1
-; GFX9-O3-NEXT:    v_mov_b32_e32 v5, s6
-; GFX9-O3-NEXT:    v_mov_b32_e32 v6, s7
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v3, v1, v9, s[4:5]
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v2, -1, v8, s[4:5]
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v5, v1, v11, s[4:5]
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v4, -1, v10, s[4:5]
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT:    v_mov_b32_e32 v5, v11
-; GFX9-O3-NEXT:    v_mov_b32_e32 v6, v12
-; GFX9-O3-NEXT:    s_mov_b64 exec, -1
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v7, v1, v13, s[4:5]
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v6, -1, v12, s[4:5]
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v7, v1
-; GFX9-O3-NEXT:    v_mov_b32_e32 v9, v3
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v8, v2
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v10, v4
+; GFX9-O3-NEXT:    v_mov_b32_e32 v9, v3
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v11, v5
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v12, v6
-; GFX9-O3-NEXT:    buffer_store_dwordx4 v[7:10], v0, s[0:3], 0 offen
-; GFX9-O3-NEXT:    buffer_store_dwordx2 v[11:12], v0, s[0:3], 0 offen offset:16
+; GFX9-O3-NEXT:    v_mov_b32_e32 v13, v7
+; GFX9-O3-NEXT:    buffer_store_dwordx4 v[8:11], v0, s[0:3], 0 offen
+; GFX9-O3-NEXT:    buffer_store_dwordx2 v[12:13], v0, s[0:3], 0 offen offset:16
 ; GFX9-O3-NEXT:    s_endpgm
   %tmp17 = shl i32 %index, 5
   %tmp18 = tail call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %tmp17, i32 0)


        


More information about the llvm-commits mailing list