[llvm] [AMDGPU] Improve codegen for GFX10+ DPP reductions and scans (PR #107108)

Jay Foad via llvm-commits llvm-commits at lists.llvm.org
Tue Sep 3 06:34:24 PDT 2024


https://github.com/jayfoad created https://github.com/llvm/llvm-project/pull/107108

Use poison for an unused input to the permlanex16 intrinsic, to improve
register allocation and avoid an unnecessary v_mov instruction.


>From 0185b940793a82e66503a38b40e779c5030d8e71 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Tue, 3 Sep 2024 13:16:49 +0100
Subject: [PATCH] [AMDGPU] Improve codegen for GFX10+ DPP reductions and scans

Use poison for an unused input to the permlanex16 intrinsic, to improve
register allocation and avoid an unnecessary v_mov instruction.
---
 .../Target/AMDGPU/AMDGPUAtomicOptimizer.cpp   |  18 +-
 .../global-atomic-fadd.f32-no-rtn.ll          |   9 +-
 .../GlobalISel/global-atomic-fadd.f32-rtn.ll  |  27 +-
 .../atomic_optimizations_global_pointer.ll    | 370 ++++----
 .../atomic_optimizations_local_pointer.ll     | 873 ++++++++----------
 .../atomic_optimizations_pixelshader.ll       |  24 +-
 .../AMDGPU/global-atomic-fadd.f32-no-rtn.ll   |   5 +-
 .../AMDGPU/global-atomic-fadd.f32-rtn.ll      |  22 +-
 .../AMDGPU/global_atomics_scan_fadd.ll        | 250 ++---
 .../AMDGPU/global_atomics_scan_fmax.ll        | 156 ++--
 .../AMDGPU/global_atomics_scan_fmin.ll        | 156 ++--
 .../AMDGPU/global_atomics_scan_fsub.ll        | 250 ++---
 12 files changed, 877 insertions(+), 1283 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index 95afc3fcc8d7d1..f408a013d7a379 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -421,9 +421,10 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
 
   // Reduce within each pair of rows (i.e. 32 lanes).
   assert(ST->hasPermLaneX16());
-  Value *Permlanex16Call = B.CreateIntrinsic(
-      V->getType(), Intrinsic::amdgcn_permlanex16,
-      {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
+  Value *Permlanex16Call =
+      B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlanex16,
+                        {PoisonValue::get(AtomicTy), V, B.getInt32(0),
+                         B.getInt32(0), B.getFalse(), B.getFalse()});
   V = buildNonAtomicBinOp(B, Op, V, Permlanex16Call);
   if (ST->isWave32()) {
     return V;
@@ -432,7 +433,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
   if (ST->hasPermLane64()) {
     // Reduce across the upper and lower 32 lanes.
     Value *Permlane64Call =
-        B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_permlane64, V);
+        B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlane64, V);
     return buildNonAtomicBinOp(B, Op, V, Permlane64Call);
   }
 
@@ -481,9 +482,10 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
     // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
     // 48..63).
     assert(ST->hasPermLaneX16());
-    Value *PermX = B.CreateIntrinsic(
-        V->getType(), Intrinsic::amdgcn_permlanex16,
-        {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
+    Value *PermX =
+        B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlanex16,
+                          {PoisonValue::get(AtomicTy), V, B.getInt32(-1),
+                           B.getInt32(-1), B.getFalse(), B.getFalse()});
 
     Value *UpdateDPPCall = B.CreateCall(
         UpdateDPP, {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
@@ -493,7 +495,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
     if (!ST->isWave32()) {
       // Combine lane 31 into lanes 32..63.
       Value *const Lane31 = B.CreateIntrinsic(
-          V->getType(), Intrinsic::amdgcn_readlane, {V, B.getInt32(31)});
+          AtomicTy, Intrinsic::amdgcn_readlane, {V, B.getInt32(31)});
 
       Value *UpdateDPPCall = B.CreateCall(
           UpdateDPP, {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
index fcd8c6fb0fe7c6..9c634aba348d23 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
@@ -285,6 +285,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
   ; GFX11-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
   ; GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
   ; GFX11-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX11-NEXT:   [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
   ; GFX11-NEXT:   [[SI_PS_LIVE:%[0-9]+]]:sreg_32_xm0_xexec = SI_PS_LIVE
   ; GFX11-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[SI_PS_LIVE]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX11-NEXT:   S_BRANCH %bb.2
@@ -312,12 +313,12 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
   ; GFX11-NEXT:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
   ; GFX11-NEXT:   [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY10]], [[V_ADD_F32_e64_2]], 360, 15, 15, 0, implicit $exec
   ; GFX11-NEXT:   [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec
-  ; GFX11-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-  ; GFX11-NEXT:   [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_2]], 0, [[S_MOV_B32_2]], [[V_ADD_F32_e64_3]], 0, implicit $exec
+  ; GFX11-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
+  ; GFX11-NEXT:   [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_]], 0, [[S_MOV_B32_]], [[COPY11]], 0, implicit $exec
   ; GFX11-NEXT:   [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, [[V_PERMLANEX16_B32_e64_]], 0, 0, implicit $mode, implicit $exec
   ; GFX11-NEXT:   [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[V_ADD_F32_e64_4]], implicit $exec
-  ; GFX11-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GFX11-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_LO_U32_B32_e64_]], [[COPY11]], implicit $exec
+  ; GFX11-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; GFX11-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_LO_U32_B32_e64_]], [[COPY12]], implicit $exec
   ; GFX11-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX11-NEXT:   S_BRANCH %bb.3
   ; GFX11-NEXT: {{  $}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
index a8f9ed2e6fba93..fdce9d9258c887 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
@@ -269,22 +269,23 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
   ; GFX11-NEXT:   [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY10]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec
   ; GFX11-NEXT:   [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec
   ; GFX11-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-  ; GFX11-NEXT:   [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_2]], 0, [[S_MOV_B32_2]], [[V_ADD_F32_e64_3]], 0, implicit $exec
-  ; GFX11-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; GFX11-NEXT:   [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY11]], [[V_PERMLANEX16_B32_e64_]], 228, 10, 15, 0, implicit $exec
-  ; GFX11-NEXT:   [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec
+  ; GFX11-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
+  ; GFX11-NEXT:   [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_2]], 0, [[S_MOV_B32_2]], [[COPY11]], 0, implicit $exec
   ; GFX11-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; GFX11-NEXT:   [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_ADD_F32_e64_4]], 273, 15, 15, 0, implicit $exec
+  ; GFX11-NEXT:   [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_PERMLANEX16_B32_e64_]], 228, 10, 15, 0, implicit $exec
+  ; GFX11-NEXT:   [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec
+  ; GFX11-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; GFX11-NEXT:   [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY13]], [[V_ADD_F32_e64_4]], 273, 15, 15, 0, implicit $exec
   ; GFX11-NEXT:   [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 15
   ; GFX11-NEXT:   [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_4]], [[S_MOV_B32_3]]
   ; GFX11-NEXT:   [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 16
   ; GFX11-NEXT:   [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READLANE_B32_]], [[S_MOV_B32_4]], [[V_MOV_B32_dpp5]]
   ; GFX11-NEXT:   [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 31
   ; GFX11-NEXT:   [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_4]], [[S_MOV_B32_5]]
-  ; GFX11-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_1]]
-  ; GFX11-NEXT:   [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY13]], implicit $exec
-  ; GFX11-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GFX11-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_LO_U32_B32_e64_]], [[COPY14]], implicit $exec
+  ; GFX11-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_1]]
+  ; GFX11-NEXT:   [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY14]], implicit $exec
+  ; GFX11-NEXT:   [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; GFX11-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_LO_U32_B32_e64_]], [[COPY15]], implicit $exec
   ; GFX11-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX11-NEXT:   S_BRANCH %bb.3
   ; GFX11-NEXT: {{  $}}
@@ -298,7 +299,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
   ; GFX11-NEXT: bb.4.Flow:
   ; GFX11-NEXT:   successors: %bb.6(0x80000000)
   ; GFX11-NEXT: {{  $}}
-  ; GFX11-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI %41, %bb.5, [[DEF]], %bb.1
+  ; GFX11-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI %42, %bb.5, [[DEF]], %bb.1
   ; GFX11-NEXT:   SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX11-NEXT:   S_BRANCH %bb.6
   ; GFX11-NEXT: {{  $}}
@@ -309,10 +310,10 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
   ; GFX11-NEXT:   SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX11-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec
   ; GFX11-NEXT:   [[STRICT_WWM1:%[0-9]+]]:vgpr_32 = STRICT_WWM [[V_WRITELANE_B32_]], implicit $exec
-  ; GFX11-NEXT:   [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]]
-  ; GFX11-NEXT:   [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY15]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec
   ; GFX11-NEXT:   [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]]
-  ; GFX11-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_ADD_F32_e64_5]], 0, [[COPY16]], [[V_CMP_EQ_U32_e64_]], implicit $exec
+  ; GFX11-NEXT:   [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY16]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec
+  ; GFX11-NEXT:   [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFX11-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_ADD_F32_e64_5]], 0, [[COPY17]], [[V_CMP_EQ_U32_e64_]], implicit $exec
   ; GFX11-NEXT:   S_BRANCH %bb.4
   ; GFX11-NEXT: {{  $}}
   ; GFX11-NEXT: bb.6 (%ir-block.38):
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 16f3ff4be6b501..1916bfa1f3fc96 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -1243,8 +1243,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1064_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v1, 31
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, s4
@@ -1306,8 +1305,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032_DPP-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
@@ -1365,15 +1363,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v1, 15
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
@@ -1439,25 +1436,23 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132_DPP-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v1, 31
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s5, v1, 15
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v3, s5, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132_DPP-NEXT:    s_mov_b32 s4, s6
 ; GFX1132_DPP-NEXT:    s_mov_b32 s6, -1
@@ -1504,15 +1499,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1264_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1264_DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1264_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1264_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1264_DPP-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1264_DPP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1264_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1264_DPP-NEXT:    v_readlane_b32 s6, v1, 15
 ; GFX1264_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1264_DPP-NEXT:    s_mov_b64 exec, s[0:1]
@@ -1578,25 +1572,23 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1232_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1232_DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1232_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1232_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1232_DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1232_DPP-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
 ; GFX1232_DPP-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1232_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1232_DPP-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1232_DPP-NEXT:    v_readlane_b32 s6, v1, 31
+; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX1232_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1232_DPP-NEXT:    v_readlane_b32 s5, v1, 15
 ; GFX1232_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1232_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1232_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1232_DPP-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1232_DPP-NEXT:    v_writelane_b32 v3, s5, 16
 ; GFX1232_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1232_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1232_DPP-NEXT:    s_mov_b32 s4, s6
 ; GFX1232_DPP-NEXT:    s_mov_b32 s6, -1
@@ -3119,23 +3111,21 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v5
 ; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v7
 ; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, v4
-; GFX1064_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1064_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1064_DPP-NEXT:    v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT:    v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v7
-; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v2
+; GFX1064_DPP-NEXT:    v_permlanex16_b32 v6, v3, -1, -1
+; GFX1064_DPP-NEXT:    v_permlanex16_b32 v8, v4, -1, -1
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v5
+; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v3, 31
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v2
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s5, v4, 31
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, s4
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, s5
@@ -3234,23 +3224,21 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v5
 ; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v7
 ; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v1
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, v4
-; GFX1032_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1032_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1032_DPP-NEXT:    v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032_DPP-NEXT:    v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v2
+; GFX1032_DPP-NEXT:    v_permlanex16_b32 v6, v3, -1, -1
+; GFX1032_DPP-NEXT:    v_permlanex16_b32 v8, v4, -1, -1
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032_DPP-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1032_DPP-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v7
-; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
+; GFX1032_DPP-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v5
+; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v7, vcc_lo
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s0, v3, 31
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s8, v4, 15
@@ -3317,7 +3305,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, v2
 ; GFX1164_DPP-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc
 ; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -3332,27 +3319,25 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
 ; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v4
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
+; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT:    v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v2
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v6, v3, -1, -1
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s5, v4, 31
-; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, s5
-; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v3, 31
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v3, 31
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, s4
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
@@ -3434,13 +3419,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX1132_DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v7, v2
 ; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -3451,31 +3435,30 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
 ; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
-; GFX1132_DPP-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT:    v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v6, v3, -1, -1
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132_DPP-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s6, -1
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
+; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s4, v4, 31
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s7, v4, 15
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s8, v3, 15
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s5, v3, 31
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s6
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s6, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v1, s7, 16
@@ -3532,13 +3515,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264_DPP-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX1264_DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1264_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX1264_DPP-NEXT:    v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1264_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1264_DPP-NEXT:    v_mov_b32_e32 v7, v2
 ; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc
-; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1264_DPP-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX1264_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -3549,29 +3531,27 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1264_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
-; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX1264_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
 ; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
-; GFX1264_DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1264_DPP-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1264_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
+; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1264_DPP-NEXT:    v_permlanex16_b32 v6, v3, -1, -1
 ; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1264_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1264_DPP-NEXT:    v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1264_DPP-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX1264_DPP-NEXT:    v_readlane_b32 s5, v4, 31
-; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc
-; GFX1264_DPP-NEXT:    v_mov_b32_e32 v7, s5
+; GFX1264_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1264_DPP-NEXT:    v_readlane_b32 s4, v3, 31
+; GFX1264_DPP-NEXT:    v_mov_b32_e32 v7, s5
+; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
+; GFX1264_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT:    v_readlane_b32 s4, v3, 31
+; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1264_DPP-NEXT:    v_mov_b32_e32 v6, s4
 ; GFX1264_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
 ; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -3656,7 +3636,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX1232_DPP-NEXT:    v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1232_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1232_DPP-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v2
+; GFX1232_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX1232_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
 ; GFX1232_DPP-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -3669,23 +3649,22 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1232_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1232_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
-; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1232_DPP-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX1232_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1232_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
-; GFX1232_DPP-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
-; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1232_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1232_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
+; GFX1232_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
 ; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1232_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1232_DPP-NEXT:    v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1232_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
+; GFX1232_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232_DPP-NEXT:    v_permlanex16_b32 v6, v3, -1, -1
+; GFX1232_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1232_DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1232_DPP-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
 ; GFX1232_DPP-NEXT:    s_or_saveexec_b32 s6, -1
 ; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX1232_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
+; GFX1232_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
 ; GFX1232_DPP-NEXT:    v_readlane_b32 s4, v4, 31
 ; GFX1232_DPP-NEXT:    v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1232_DPP-NEXT:    v_readlane_b32 s7, v4, 15
@@ -4974,8 +4953,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1064_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v1, 31
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, s4
@@ -5037,8 +5015,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032_DPP-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
@@ -5096,15 +5073,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v1, 15
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
@@ -5170,25 +5146,23 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132_DPP-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v1, 31
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s5, v1, 15
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v3, s5, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132_DPP-NEXT:    s_mov_b32 s4, s6
 ; GFX1132_DPP-NEXT:    s_mov_b32 s6, -1
@@ -5235,15 +5209,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1264_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1264_DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1264_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1264_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1264_DPP-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1264_DPP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1264_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1264_DPP-NEXT:    v_readlane_b32 s6, v1, 15
 ; GFX1264_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1264_DPP-NEXT:    s_mov_b64 exec, s[0:1]
@@ -5309,25 +5282,23 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1232_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1232_DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1232_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1232_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1232_DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1232_DPP-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
 ; GFX1232_DPP-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1232_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1232_DPP-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1232_DPP-NEXT:    v_readlane_b32 s6, v1, 31
+; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX1232_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1232_DPP-NEXT:    v_readlane_b32 s5, v1, 15
 ; GFX1232_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1232_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1232_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1232_DPP-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1232_DPP-NEXT:    v_writelane_b32 v3, s5, 16
 ; GFX1232_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1232_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1232_DPP-NEXT:    s_mov_b32 s4, s6
 ; GFX1232_DPP-NEXT:    s_mov_b32 s6, -1
@@ -6892,23 +6863,21 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v5
 ; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v7
 ; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, v4
-; GFX1064_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1064_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1064_DPP-NEXT:    v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT:    v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v7
-; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v2
+; GFX1064_DPP-NEXT:    v_permlanex16_b32 v6, v3, -1, -1
+; GFX1064_DPP-NEXT:    v_permlanex16_b32 v8, v4, -1, -1
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v5
+; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v3, 31
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v2
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s5, v4, 31
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, s4
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, s5
@@ -7007,23 +6976,21 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v5
 ; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v7
 ; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v1
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, v4
-; GFX1032_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1032_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1032_DPP-NEXT:    v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032_DPP-NEXT:    v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v2
+; GFX1032_DPP-NEXT:    v_permlanex16_b32 v6, v3, -1, -1
+; GFX1032_DPP-NEXT:    v_permlanex16_b32 v8, v4, -1, -1
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032_DPP-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1032_DPP-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v7
-; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
+; GFX1032_DPP-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v5
+; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v7, vcc_lo
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s0, v3, 31
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s8, v4, 15
@@ -7090,7 +7057,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, v2
 ; GFX1164_DPP-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc
 ; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -7105,27 +7071,25 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
 ; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v4
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
+; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT:    v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v2
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v6, v3, -1, -1
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s5, v4, 31
-; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, s5
-; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v3, 31
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v3, 31
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, s4
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
@@ -7207,13 +7171,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX1132_DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v7, v2
 ; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -7224,31 +7187,30 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
 ; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
-; GFX1132_DPP-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT:    v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v6, v3, -1, -1
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132_DPP-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s6, -1
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
+; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s4, v4, 31
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s7, v4, 15
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s8, v3, 15
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s5, v3, 31
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s6
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s6, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v1, s7, 16
@@ -7305,13 +7267,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264_DPP-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX1264_DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1264_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX1264_DPP-NEXT:    v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1264_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1264_DPP-NEXT:    v_mov_b32_e32 v7, v2
 ; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc
-; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1264_DPP-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX1264_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -7322,29 +7283,27 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1264_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
-; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX1264_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
 ; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
-; GFX1264_DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1264_DPP-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1264_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
+; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1264_DPP-NEXT:    v_permlanex16_b32 v6, v3, -1, -1
 ; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1264_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1264_DPP-NEXT:    v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1264_DPP-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX1264_DPP-NEXT:    v_readlane_b32 s5, v4, 31
-; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc
-; GFX1264_DPP-NEXT:    v_mov_b32_e32 v7, s5
+; GFX1264_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1264_DPP-NEXT:    v_readlane_b32 s4, v3, 31
+; GFX1264_DPP-NEXT:    v_mov_b32_e32 v7, s5
+; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
+; GFX1264_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT:    v_readlane_b32 s4, v3, 31
+; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1264_DPP-NEXT:    v_mov_b32_e32 v6, s4
 ; GFX1264_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
 ; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -7429,7 +7388,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX1232_DPP-NEXT:    v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1232_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1232_DPP-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v2
+; GFX1232_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX1232_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
 ; GFX1232_DPP-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -7442,23 +7401,22 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1232_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1232_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
-; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1232_DPP-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX1232_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1232_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
-; GFX1232_DPP-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
-; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1232_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1232_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
+; GFX1232_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
 ; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1232_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1232_DPP-NEXT:    v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1232_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
+; GFX1232_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232_DPP-NEXT:    v_permlanex16_b32 v6, v3, -1, -1
+; GFX1232_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1232_DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1232_DPP-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
 ; GFX1232_DPP-NEXT:    s_or_saveexec_b32 s6, -1
 ; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX1232_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
+; GFX1232_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
 ; GFX1232_DPP-NEXT:    v_readlane_b32 s4, v4, 31
 ; GFX1232_DPP-NEXT:    v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1232_DPP-NEXT:    v_readlane_b32 s7, v4, 15
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index c7296185422cef..6d0e0cc7869b3f 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -880,8 +880,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1064_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v1, 31
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, s4
@@ -936,8 +935,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v1, 15
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s4, v1, 31
@@ -986,21 +984,20 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v1, 15
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s4, 16
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v1, 47
@@ -1053,15 +1050,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v1, 15
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s4, v1, 31
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
@@ -1394,8 +1390,7 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
 ; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1064_DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
 ; GFX1064_DPP-NEXT:    v_add_nc_u32_e32 v1, v1, v2
 ; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1428,8 +1423,7 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
 ; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
 ; GFX1032_DPP-NEXT:    v_add_nc_u32_e32 v1, v1, v2
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
@@ -1460,16 +1454,14 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
 ; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1164_DPP-NEXT:    v_add_nc_u32_e32 v1, v1, v2
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164_DPP-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164_DPP-NEXT:    v_add_nc_u32_e32 v1, v1, v2
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
@@ -1502,16 +1494,14 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
 ; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_DPP-NEXT:    v_add_nc_u32_e32 v1, v1, v2
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1
 ; GFX1132_DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
@@ -2623,23 +2613,21 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v5
 ; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v7
 ; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, v4
-; GFX1064_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1064_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1064_DPP-NEXT:    v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT:    v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v7
-; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v2
+; GFX1064_DPP-NEXT:    v_permlanex16_b32 v6, v3, -1, -1
+; GFX1064_DPP-NEXT:    v_permlanex16_b32 v8, v4, -1, -1
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v5
+; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v3, 31
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v2
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s5, v4, 31
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, s4
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, s5
@@ -2732,20 +2720,18 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v5
 ; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v7
 ; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v1
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, v4
-; GFX1032_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1032_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1032_DPP-NEXT:    v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032_DPP-NEXT:    v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032_DPP-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v7
-; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v2
+; GFX1032_DPP-NEXT:    v_permlanex16_b32 v6, v3, -1, -1
+; GFX1032_DPP-NEXT:    v_permlanex16_b32 v8, v4, -1, -1
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v5
+; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v7, vcc_lo
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s0, v3, 31
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s6, v4, 15
@@ -2807,7 +2793,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, v2
 ; GFX1164_DPP-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc
 ; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -2822,27 +2807,25 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
 ; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v4
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
+; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT:    v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v2
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v6, v3, -1, -1
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s5, v4, 31
-; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, s5
-; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v3, 31
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v3, 31
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, s4
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
@@ -2919,13 +2902,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX1132_DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v7, v2
 ; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -2936,28 +2918,28 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
 ; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
-; GFX1132_DPP-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT:    v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v6, v3, -1, -1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s0, v4, 31
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s5, v4, 15
-; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v3, 15
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v3, 31
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v1, s5, 16
@@ -3434,10 +3416,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_add_co_u32 v1, vcc, v3, v1
 ; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc, v4, v2, vcc
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, v2
-; GFX1064_DPP-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
-; GFX1064_DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064_DPP-NEXT:    v_permlanex16_b32 v3, v1, 0, 0
+; GFX1064_DPP-NEXT:    v_permlanex16_b32 v4, v2, 0, 0
 ; GFX1064_DPP-NEXT:    v_add_co_u32 v1, vcc, v1, v3
 ; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc, v2, v4, vcc
 ; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
@@ -3500,10 +3480,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_add_co_u32 v1, vcc_lo, v3, v1
 ; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v4, v2, vcc_lo
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v4, v2
-; GFX1032_DPP-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
-; GFX1032_DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032_DPP-NEXT:    v_permlanex16_b32 v3, v1, 0, 0
+; GFX1032_DPP-NEXT:    v_permlanex16_b32 v4, v2, 0, 0
 ; GFX1032_DPP-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v3
 ; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
@@ -3560,11 +3538,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, v3
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, v1
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v3, 0, 0
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v4, v1, 0, 0
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164_DPP-NEXT:    v_add_co_u32 v2, vcc, v3, v2
 ; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc
@@ -3630,12 +3605,9 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v2, v3
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, v1
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v3, 0, 0
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v4, v1, 0, 0
 ; GFX1132_DPP-NEXT:    v_add_co_u32 v2, vcc_lo, v3, v2
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v1, v4, vcc_lo
@@ -4528,8 +4500,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1064_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v1, 31
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, s4
@@ -4584,8 +4555,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v1, 15
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s4, v1, 31
@@ -4634,21 +4604,20 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v1, 15
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s4, 16
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v1, 47
@@ -4701,15 +4670,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v1, 15
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s4, v1, 31
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
@@ -5042,8 +5010,7 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
 ; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1064_DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
 ; GFX1064_DPP-NEXT:    v_add_nc_u32_e32 v1, v1, v2
 ; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5076,8 +5043,7 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
 ; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
 ; GFX1032_DPP-NEXT:    v_add_nc_u32_e32 v1, v1, v2
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
@@ -5108,16 +5074,14 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
 ; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1164_DPP-NEXT:    v_add_nc_u32_e32 v1, v1, v2
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164_DPP-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164_DPP-NEXT:    v_add_nc_u32_e32 v1, v1, v2
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
@@ -5150,16 +5114,14 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
 ; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_DPP-NEXT:    v_add_nc_u32_e32 v1, v1, v2
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1
 ; GFX1132_DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB11_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
@@ -6297,23 +6259,21 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v5
 ; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v7
 ; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, v4
-; GFX1064_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1064_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1064_DPP-NEXT:    v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT:    v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v7
-; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v2
+; GFX1064_DPP-NEXT:    v_permlanex16_b32 v6, v3, -1, -1
+; GFX1064_DPP-NEXT:    v_permlanex16_b32 v8, v4, -1, -1
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v5
+; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v3, 31
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v2
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s5, v4, 31
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, s4
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, s5
@@ -6406,20 +6366,18 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v5
 ; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v7
 ; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v1
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, v4
-; GFX1032_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1032_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1032_DPP-NEXT:    v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032_DPP-NEXT:    v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032_DPP-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v7
-; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v2
+; GFX1032_DPP-NEXT:    v_permlanex16_b32 v6, v3, -1, -1
+; GFX1032_DPP-NEXT:    v_permlanex16_b32 v8, v4, -1, -1
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v5
+; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v7, vcc_lo
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s0, v3, 31
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s6, v4, 15
@@ -6481,7 +6439,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, v2
 ; GFX1164_DPP-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc
 ; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -6496,27 +6453,25 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
 ; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v4
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
+; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT:    v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v2
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v6, v3, -1, -1
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s5, v4, 31
-; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, s5
-; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v3, 31
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v3, 31
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, s4
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
@@ -6593,13 +6548,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX1132_DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v7, v2
 ; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -6610,28 +6564,28 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
 ; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
-; GFX1132_DPP-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT:    v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v4, vcc_lo, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v6, v3, -1, -1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s0, v4, 31
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s5, v4, 15
-; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v3, 15
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v3, 31
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v1, s5, 16
@@ -7093,8 +7047,7 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
 ; GFX1064_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1064_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1064_DPP-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v1, 31
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, s4
@@ -7149,8 +7102,7 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1032_DPP-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v1, 15
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s4, v1, 31
@@ -7199,21 +7151,20 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1164_DPP-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1164_DPP-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v1, 15
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s4, 16
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v1, 47
@@ -7266,20 +7217,18 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1132_DPP-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v1, 15
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s4, v1, 31
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v3, s1, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132_DPP-NEXT:    s_mov_b32 s6, -1
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr0
@@ -7811,10 +7760,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, v2
-; GFX1064_DPP-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
-; GFX1064_DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064_DPP-NEXT:    v_permlanex16_b32 v3, v1, -1, -1
+; GFX1064_DPP-NEXT:    v_permlanex16_b32 v4, v2, -1, -1
 ; GFX1064_DPP-NEXT:    v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v1, 31
@@ -7894,10 +7841,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v4, v2
-; GFX1032_DPP-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
-; GFX1032_DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032_DPP-NEXT:    v_permlanex16_b32 v3, v1, -1, -1
+; GFX1032_DPP-NEXT:    v_permlanex16_b32 v4, v2, -1, -1
 ; GFX1032_DPP-NEXT:    v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s0, v1, 31
@@ -7965,11 +7910,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, v2
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v3, v1, -1, -1
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v4, v2, -1, -1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164_DPP-NEXT:    v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
@@ -8061,16 +8003,14 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
-; GFX1132_DPP-NEXT:    v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v3, v1, -1, -1
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v4, v2, -1, -1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s0, v1, 31
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v2, 15
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v2, 31
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -8535,8 +8475,7 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ; GFX1064_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1064_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1064_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1064_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1064_DPP-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v1, 31
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, s4
@@ -8591,8 +8530,7 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1032_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1032_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1032_DPP-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v1, 15
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s4, v1, 31
@@ -8641,21 +8579,20 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1164_DPP-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1164_DPP-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v1, 15
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s4, 16
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v1, 47
@@ -8708,15 +8645,14 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1132_DPP-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v1, 15
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s4, v1, 31
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
@@ -9252,10 +9188,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1064_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1064_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, v2
-; GFX1064_DPP-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
-; GFX1064_DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064_DPP-NEXT:    v_permlanex16_b32 v3, v1, -1, -1
+; GFX1064_DPP-NEXT:    v_permlanex16_b32 v4, v2, -1, -1
 ; GFX1064_DPP-NEXT:    v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v1, 31
@@ -9335,10 +9269,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1032_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1032_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v4, v2
-; GFX1032_DPP-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
-; GFX1032_DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032_DPP-NEXT:    v_permlanex16_b32 v3, v1, -1, -1
+; GFX1032_DPP-NEXT:    v_permlanex16_b32 v4, v2, -1, -1
 ; GFX1032_DPP-NEXT:    v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s0, v1, 31
@@ -9406,11 +9338,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, v2
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v3, v1, -1, -1
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v4, v2, -1, -1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164_DPP-NEXT:    v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
@@ -9502,16 +9431,14 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
-; GFX1132_DPP-NEXT:    v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v3, v1, -1, -1
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v4, v2, -1, -1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s0, v1, 31
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v2, 15
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v2, 31
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -9976,8 +9903,7 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
 ; GFX1064_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1064_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1064_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1064_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1064_DPP-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v1, 31
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, s4
@@ -10032,8 +9958,7 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1032_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1032_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1032_DPP-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v1, 15
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s4, v1, 31
@@ -10082,21 +10007,20 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1164_DPP-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1164_DPP-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v1, 15
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s4, 16
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v1, 47
@@ -10149,15 +10073,14 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1132_DPP-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v1, 15
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s4, v1, 31
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
@@ -10693,10 +10616,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1064_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1064_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, v2
-; GFX1064_DPP-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
-; GFX1064_DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064_DPP-NEXT:    v_permlanex16_b32 v3, v1, -1, -1
+; GFX1064_DPP-NEXT:    v_permlanex16_b32 v4, v2, -1, -1
 ; GFX1064_DPP-NEXT:    v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v1, 31
@@ -10776,10 +10697,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1032_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1032_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v4, v2
-; GFX1032_DPP-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
-; GFX1032_DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032_DPP-NEXT:    v_permlanex16_b32 v3, v1, -1, -1
+; GFX1032_DPP-NEXT:    v_permlanex16_b32 v4, v2, -1, -1
 ; GFX1032_DPP-NEXT:    v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s0, v1, 31
@@ -10847,11 +10766,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, v2
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v3, v1, -1, -1
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v4, v2, -1, -1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164_DPP-NEXT:    v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
@@ -10943,16 +10859,14 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
-; GFX1132_DPP-NEXT:    v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v3, v1, -1, -1
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v4, v2, -1, -1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s0, v1, 31
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v2, 15
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v2, 31
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -11421,8 +11335,7 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
 ; GFX1064_DPP-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1064_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1064_DPP-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v1, 31
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, s4
@@ -11477,8 +11390,7 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1032_DPP-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v1, 15
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s4, v1, 31
@@ -11527,21 +11439,20 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1164_DPP-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1164_DPP-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v1, 15
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s4, 16
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v1, 47
@@ -11594,20 +11505,18 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1132_DPP-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v1, 15
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s4, v1, 31
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v3, s1, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132_DPP-NEXT:    s_mov_b32 s6, -1
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr0
@@ -12521,24 +12430,22 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
 ; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8]
 ; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
 ; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v4
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, v3
-; GFX1064_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1064_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1064_DPP-NEXT:    v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1064_DPP-NEXT:    v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
+; GFX1064_DPP-NEXT:    v_permlanex16_b32 v8, v3, -1, -1
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
+; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1064_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8]
-; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
-; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v4, 31
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s5, v3, 31
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, s4
@@ -12639,22 +12546,20 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6]
 ; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
 ; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc_lo
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8]
 ; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc_lo
 ; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc_lo
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v1
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, v4
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, v3
-; GFX1032_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1032_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1032_DPP-NEXT:    v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032_DPP-NEXT:    v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032_DPP-NEXT:    v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8]
-; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc_lo
-; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX1032_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
+; GFX1032_DPP-NEXT:    v_permlanex16_b32 v8, v3, -1, -1
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6]
+; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc_lo
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v4, 31
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s6, v3, 15
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s0, v3, 31
@@ -12738,35 +12643,32 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
 ; GFX1164_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8]
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, v1
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v4
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v8, v3, -1, -1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT:    v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8]
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v4, 31
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s5, v3, 31
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, s4
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -12871,25 +12773,23 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6]
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8]
 ; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4
-; GFX1132_DPP-NEXT:    v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v6, v3
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v8, v3, -1, -1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v5, v4
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1132_DPP-NEXT:    v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v8, v2
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT:    v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT:    v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8]
-; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT:    v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6]
+; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v3, v5, v3 :: v_dual_cndmask_b32 v4, v6, v4
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v3, 15
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v4, 31
@@ -13361,8 +13261,7 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
 ; GFX1064_DPP-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1064_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1064_DPP-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v1, 31
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, s4
@@ -13417,8 +13316,7 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1032_DPP-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v1, 15
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s4, v1, 31
@@ -13467,21 +13365,20 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1164_DPP-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1164_DPP-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v1, 15
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s4, 16
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v1, 47
@@ -13534,20 +13431,18 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1132_DPP-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v1, 15
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s4, v1, 31
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v3, s1, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132_DPP-NEXT:    s_mov_b32 s6, -1
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr0
@@ -14457,24 +14352,22 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6]
 ; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8]
 ; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
 ; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v4
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, v3
-; GFX1064_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1064_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1064_DPP-NEXT:    v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1064_DPP-NEXT:    v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
+; GFX1064_DPP-NEXT:    v_permlanex16_b32 v8, v3, -1, -1
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6]
+; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1064_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8]
-; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
-; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v4, 31
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s5, v3, 31
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, s4
@@ -14574,22 +14467,20 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6]
 ; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
 ; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc_lo
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8]
 ; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc_lo
 ; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc_lo
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v1
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, v4
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, v3
-; GFX1032_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1032_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1032_DPP-NEXT:    v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032_DPP-NEXT:    v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032_DPP-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8]
-; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc_lo
-; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX1032_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
+; GFX1032_DPP-NEXT:    v_permlanex16_b32 v8, v3, -1, -1
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6]
+; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc_lo
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v4, 31
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s0, v3, 31
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -14672,35 +14563,32 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
 ; GFX1164_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6]
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8]
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, v1
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v4
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v8, v3, -1, -1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT:    v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6]
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8]
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v4, 31
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s5, v3, 31
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, s4
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -14805,25 +14693,23 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6]
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8]
 ; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4
-; GFX1132_DPP-NEXT:    v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v6, v3
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v5, v4
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v8, v3, -1, -1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1132_DPP-NEXT:    v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v8, v2
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT:    v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8]
-; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6]
+; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v3, v5, v3 :: v_dual_cndmask_b32 v4, v6, v4
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s0, v3, 31
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v4, 31
@@ -15290,8 +15176,7 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
 ; GFX1064_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1064_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1064_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1064_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1064_DPP-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v1, 31
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, s4
@@ -15346,8 +15231,7 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1032_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1032_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1032_DPP-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v1, 15
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s4, v1, 31
@@ -15396,21 +15280,20 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1164_DPP-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1164_DPP-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v1, 15
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s4, 16
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v1, 47
@@ -15463,15 +15346,14 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1132_DPP-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v1, 15
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s4, v1, 31
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
@@ -16375,24 +16257,22 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
 ; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8]
 ; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
 ; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v4
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, v3
-; GFX1064_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1064_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1064_DPP-NEXT:    v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1064_DPP-NEXT:    v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
+; GFX1064_DPP-NEXT:    v_permlanex16_b32 v8, v3, -1, -1
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1064_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8]
-; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
-; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v4, 31
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s5, v3, 31
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, s4
@@ -16491,22 +16371,20 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6]
 ; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
 ; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc_lo
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8]
 ; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc_lo
 ; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc_lo
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v1
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, v4
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, v3
-; GFX1032_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1032_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1032_DPP-NEXT:    v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032_DPP-NEXT:    v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032_DPP-NEXT:    v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8]
-; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc_lo
-; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX1032_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
+; GFX1032_DPP-NEXT:    v_permlanex16_b32 v8, v3, -1, -1
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6]
+; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc_lo
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v4, 31
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s6, v3, 15
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s0, v3, 31
@@ -16588,35 +16466,32 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
 ; GFX1164_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8]
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, v1
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v4
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v8, v3, -1, -1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT:    v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8]
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v4, 31
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s5, v3, 31
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, s4
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -16719,25 +16594,23 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6]
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8]
 ; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4
-; GFX1132_DPP-NEXT:    v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v6, v3
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v8, v3, -1, -1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v5, v4
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1132_DPP-NEXT:    v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v8, v2
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT:    v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT:    v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8]
-; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT:    v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6]
+; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v3, v5, v3 :: v_dual_cndmask_b32 v4, v6, v4
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v3, 15
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v4, 31
@@ -17209,8 +17082,7 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
 ; GFX1064_DPP-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1064_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1064_DPP-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v1, 31
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, s4
@@ -17265,8 +17137,7 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1032_DPP-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v1, 15
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s4, v1, 31
@@ -17315,21 +17186,20 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1164_DPP-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1164_DPP-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v1, 15
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s4, 16
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v1, 47
@@ -17382,20 +17252,18 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1132_DPP-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v1, 15
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s4, v1, 31
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v3, s1, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132_DPP-NEXT:    s_mov_b32 s6, -1
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr0
@@ -18295,24 +18163,22 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
 ; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8]
 ; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
 ; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v4
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, v3
-; GFX1064_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1064_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1064_DPP-NEXT:    v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1064_DPP-NEXT:    v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
+; GFX1064_DPP-NEXT:    v_permlanex16_b32 v8, v3, -1, -1
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1064_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8]
-; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
-; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v4, 31
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s5, v3, 31
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, s4
@@ -18411,22 +18277,20 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6]
 ; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
 ; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc_lo
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8]
 ; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc_lo
 ; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc_lo
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v1
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, v4
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, v3
-; GFX1032_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1032_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1032_DPP-NEXT:    v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032_DPP-NEXT:    v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032_DPP-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8]
-; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc_lo
-; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX1032_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
+; GFX1032_DPP-NEXT:    v_permlanex16_b32 v8, v3, -1, -1
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6]
+; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc_lo
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v4, 31
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s6, v3, 15
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s0, v3, 31
@@ -18508,35 +18372,32 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
 ; GFX1164_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1164_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8]
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, v1
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v4
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1164_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v8, v3, -1, -1
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT:    v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8]
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v4, 31
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164_DPP-NEXT:    v_readlane_b32 s5, v3, 31
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, s4
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -18639,25 +18500,23 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6]
 ; GFX1132_DPP-NEXT:    v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
 ; GFX1132_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8]
 ; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4
-; GFX1132_DPP-NEXT:    v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v6, v3
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v5, v4
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v8, v3, -1, -1
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1132_DPP-NEXT:    v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v8, v2
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v7, v4, -1, -1
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT:    v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8]
-; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6]
+; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v3, v5, v3 :: v_dual_cndmask_b32 v4, v6, v4
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v3, 15
 ; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v4, 31
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
index f67fcd6e0caf53..22eb8d05b5ff23 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
@@ -396,8 +396,7 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
 ; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1064-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1064-NEXT:    v_readlane_b32 s12, v1, 31
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, s12
@@ -460,8 +459,7 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
 ; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1032-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1032-NEXT:    v_readlane_b32 s11, v1, 31
 ; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -518,21 +516,19 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
 ; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164-NEXT:    v_readlane_b32 s12, v1, 31
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    v_readlane_b32 s12, v1, 31
 ; GFX1164-NEXT:    v_mov_b32_e32 v2, s12
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-NEXT:    v_readlane_b32 s12, v1, 15
 ; GFX1164-NEXT:    v_readlane_b32 s13, v1, 31
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    v_writelane_b32 v3, s12, 16
 ; GFX1164-NEXT:    s_mov_b64 exec, s[10:11]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-NEXT:    s_or_saveexec_b64 s[10:11], -1
 ; GFX1164-NEXT:    v_readlane_b32 s12, v1, 63
@@ -594,20 +590,18 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
 ; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
 ; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
 ; GFX1132-NEXT:    v_readlane_b32 s11, v1, 31
 ; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132-NEXT:    v_readlane_b32 s10, v1, 15
 ; GFX1132-NEXT:    s_mov_b32 exec_lo, s9
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132-NEXT:    s_or_saveexec_b32 s9, -1
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1132-NEXT:    v_writelane_b32 v3, s10, 16
 ; GFX1132-NEXT:    s_mov_b32 exec_lo, s9
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132-NEXT:    ; implicit-def: $vgpr0
 ; GFX1132-NEXT:    s_and_saveexec_b32 s9, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
index ad3f920eadc91f..6bd0b11acc3ea2 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
@@ -186,8 +186,9 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
   ; GFX11_GFX12-NEXT:   [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, killed [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec
   ; GFX11_GFX12-NEXT:   [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_2]], 360, 15, 15, 0, implicit $exec
   ; GFX11_GFX12-NEXT:   [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, killed [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec
-  ; GFX11_GFX12-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-  ; GFX11_GFX12-NEXT:   [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_1]], 0, [[S_MOV_B32_1]], [[V_ADD_F32_e64_3]], 0, implicit $exec
+  ; GFX11_GFX12-NEXT:   [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; GFX11_GFX12-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
+  ; GFX11_GFX12-NEXT:   [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_]], 0, [[S_MOV_B32_]], [[COPY5]], 0, implicit $exec
   ; GFX11_GFX12-NEXT:   [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, killed [[V_PERMLANEX16_B32_e64_]], 0, 0, implicit $mode, implicit $exec
   ; GFX11_GFX12-NEXT:   early-clobber %1:vgpr_32 = STRICT_WWM killed [[V_ADD_F32_e64_4]], implicit $exec
   ; GFX11_GFX12-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_LO_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
index 3951e02d46a8f3..6766c0c1fdaeb0 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
@@ -229,7 +229,9 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
   ; GFX11-NEXT:   [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec
   ; GFX11-NEXT:   [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, killed [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec
   ; GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-  ; GFX11-NEXT:   [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_1]], 0, [[S_MOV_B32_1]], [[V_ADD_F32_e64_3]], 0, implicit $exec
+  ; GFX11-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; GFX11-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF1]]
+  ; GFX11-NEXT:   [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_1]], 0, [[S_MOV_B32_1]], [[COPY5]], 0, implicit $exec
   ; GFX11-NEXT:   [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], killed [[V_PERMLANEX16_B32_e64_]], 228, 10, 15, 0, implicit $exec
   ; GFX11-NEXT:   [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, killed [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec
   ; GFX11-NEXT:   [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_4]], 273, 15, 15, 0, implicit $exec
@@ -241,8 +243,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
   ; GFX11-NEXT:   [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_4]], killed [[S_MOV_B32_4]]
   ; GFX11-NEXT:   early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_1]], implicit $exec
   ; GFX11-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_LO_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
-  ; GFX11-NEXT:   [[COPY5:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]]
-  ; GFX11-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; GFX11-NEXT:   [[COPY6:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]]
+  ; GFX11-NEXT:   [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
   ; GFX11-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GFX11-NEXT:   S_BRANCH %bb.2
   ; GFX11-NEXT: {{  $}}
@@ -250,8 +252,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
   ; GFX11-NEXT:   successors: %bb.4(0x80000000)
   ; GFX11-NEXT: {{  $}}
   ; GFX11-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-  ; GFX11-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY %2
-  ; GFX11-NEXT:   [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY6]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
+  ; GFX11-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY %2
+  ; GFX11-NEXT:   [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY7]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
   ; GFX11-NEXT:   S_BRANCH %bb.4
   ; GFX11-NEXT: {{  $}}
   ; GFX11-NEXT: bb.3.Flow:
@@ -264,13 +266,13 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
   ; GFX11-NEXT: bb.4 (%ir-block.32):
   ; GFX11-NEXT:   successors: %bb.3(0x80000000)
   ; GFX11-NEXT: {{  $}}
-  ; GFX11-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF1]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2
+  ; GFX11-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF2]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2
   ; GFX11-NEXT:   SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GFX11-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec
-  ; GFX11-NEXT:   early-clobber %44:vgpr_32 = STRICT_WWM [[V_WRITELANE_B32_]], implicit $exec
-  ; GFX11-NEXT:   [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %44, 0, 0, implicit $mode, implicit $exec
-  ; GFX11-NEXT:   [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY5]]
-  ; GFX11-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_5]], 0, [[V_READFIRSTLANE_B32_]], [[COPY7]], implicit $exec
+  ; GFX11-NEXT:   early-clobber %46:vgpr_32 = STRICT_WWM [[V_WRITELANE_B32_]], implicit $exec
+  ; GFX11-NEXT:   [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %46, 0, 0, implicit $mode, implicit $exec
+  ; GFX11-NEXT:   [[COPY8:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY6]]
+  ; GFX11-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_5]], 0, [[V_READFIRSTLANE_B32_]], [[COPY8]], implicit $exec
   ; GFX11-NEXT:   S_BRANCH %bb.3
   ; GFX11-NEXT: {{  $}}
   ; GFX11-NEXT: bb.5 (%ir-block.38):
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 44cd2c6e3af675..584b280cefb8a8 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -917,8 +917,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v3, 0, 0
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 32
@@ -999,8 +998,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v3, 0, 0
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1070,16 +1068,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
@@ -1139,15 +1135,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
 ; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -2155,8 +2150,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v3, 0, 0
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 32
@@ -2237,8 +2231,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v3, 0, 0
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2308,16 +2301,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
@@ -2377,15 +2368,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
 ; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -3453,8 +3443,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v3, 0, 0
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 32
@@ -3535,8 +3524,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v3, 0, 0
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3606,16 +3594,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
@@ -3675,15 +3661,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
 ; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -4247,8 +4232,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v3, 0, 0
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 32
@@ -4329,8 +4313,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v3, 0, 0
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4400,16 +4383,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
@@ -4469,15 +4450,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
 ; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -5570,8 +5550,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v3, 0, 0
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 32
@@ -5652,8 +5631,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v3, 0, 0
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5723,16 +5701,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
@@ -5805,16 +5781,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
 ; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB8_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -7630,10 +7605,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_add_f64 v[8:9], v[10:11], v[8:9]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v11, v9, 0, 0
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
 ; GFX1064-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v9, 0
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v8, 0
@@ -7760,10 +7733,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_add_f64 v[8:9], v[10:11], v[8:9]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v11, v9, 0, 0
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
 ; GFX1032-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7881,11 +7852,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_add_f64 v[8:9], v[10:11], v[8:9]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v11, v9, 0, 0
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v11, v9
@@ -8007,20 +7975,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_add_f64 v[8:9], v[10:11], v[8:9]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v11, v9, 0, 0
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v8
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v42, v9
 ; GFX1132-DPP-NEXT:    s_mov_b32 s44, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -9212,10 +9178,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_add_f64 v[3:4], v[5:6], v[3:4]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v6, v4, 0, 0
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
 ; GFX1064-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v4, 0
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
@@ -9308,10 +9272,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_add_f64 v[3:4], v[5:6], v[3:4]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v6, v4, 0, 0
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
 ; GFX1032-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9395,11 +9357,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v4, v2, 0, 0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_add_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v5, v3
@@ -9489,20 +9448,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v4, v2, 0, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v8
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB12_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -10671,10 +10628,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_add_f64 v[3:4], v[5:6], v[3:4]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v6, v4, 0, 0
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
 ; GFX1064-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v4, 0
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
@@ -10767,10 +10722,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_add_f64 v[3:4], v[5:6], v[3:4]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v6, v4, 0, 0
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
 ; GFX1032-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -10854,11 +10807,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v4, v2, 0, 0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_add_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v5, v3
@@ -10948,20 +10898,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v4, v2, 0, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v8
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB14_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -11612,10 +11560,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_add_f64 v[3:4], v[5:6], v[3:4]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v6, v4, 0, 0
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
 ; GFX1064-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v4, 0
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
@@ -11708,10 +11654,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_add_f64 v[3:4], v[5:6], v[3:4]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v6, v4, 0, 0
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
 ; GFX1032-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -11795,11 +11739,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v4, v2, 0, 0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_add_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v5, v3
@@ -11889,20 +11830,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v4, v2, 0, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v8
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB15_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -13750,10 +13689,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_add_f64 v[8:9], v[10:11], v[8:9]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v11, v9, 0, 0
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
 ; GFX1064-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v9, 0
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v8, 0
@@ -13880,10 +13817,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_add_f64 v[8:9], v[10:11], v[8:9]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v11, v9, 0, 0
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
 ; GFX1032-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14001,11 +13936,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_add_f64 v[8:9], v[10:11], v[8:9]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v11, v9, 0, 0
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v11, v9
@@ -14127,20 +14059,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_add_f64 v[8:9], v[10:11], v[8:9]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v11, v9, 0, 0
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v8
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v42, v9
 ; GFX1132-DPP-NEXT:    s_mov_b32 s44, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB17_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index f0196fadc4b3fd..464ec088dc297b 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -833,8 +833,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v3, 0, 0
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 32
@@ -911,8 +910,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v3, 0, 0
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
@@ -978,19 +976,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
@@ -1054,17 +1050,15 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -1898,8 +1892,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v3, 0, 0
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 32
@@ -1976,8 +1969,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v3, 0, 0
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
@@ -2043,19 +2035,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
@@ -2119,17 +2109,15 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -2963,8 +2951,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v3, 0, 0
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 32
@@ -3041,8 +3028,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v3, 0, 0
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
@@ -3108,19 +3094,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
@@ -3184,17 +3168,15 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -5002,10 +4984,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
 ; GFX1064-DPP-NEXT:    v_max_f64 v[8:9], v[10:11], v[8:9]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v11, v9, 0, 0
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
 ; GFX1064-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
 ; GFX1064-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v9, 0
@@ -5142,10 +5122,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
 ; GFX1032-DPP-NEXT:    v_max_f64 v[8:9], v[10:11], v[8:9]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v11, v9, 0, 0
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
 ; GFX1032-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
 ; GFX1032-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
@@ -5273,11 +5251,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
 ; GFX1164-DPP-NEXT:    v_max_f64 v[8:9], v[10:11], v[8:9]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v11, v9, 0, 0
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
 ; GFX1164-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[10:11]
@@ -5411,21 +5386,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
 ; GFX1132-DPP-NEXT:    v_max_f64 v[8:9], v[10:11], v[8:9]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v11, v9, 0, 0
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
 ; GFX1132-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, v8
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v9
 ; GFX1132-DPP-NEXT:    s_mov_b32 s44, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -6452,10 +6426,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[3:4]
 ; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], v[5:6], v[3:4]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v6, v4, 0, 0
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
 ; GFX1064-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[5:6]
 ; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v4, 0
@@ -6543,10 +6515,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[3:4]
 ; GFX1032-DPP-NEXT:    v_max_f64 v[3:4], v[5:6], v[3:4]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v6, v4, 0, 0
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
 ; GFX1032-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[5:6]
 ; GFX1032-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
@@ -6625,11 +6595,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX1164-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX1164-DPP-NEXT:    v_max_f64 v[2:3], v[4:5], v[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v4, v2, 0, 0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX1164-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
@@ -6732,21 +6699,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX1132-DPP-NEXT:    v_max_f64 v[2:3], v[4:5], v[2:3]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v4, v2, 0, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
 ; GFX1132-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v8, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v8
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -8569,10 +8535,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
 ; GFX1064-DPP-NEXT:    v_max_f64 v[8:9], v[10:11], v[8:9]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v11, v9, 0, 0
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
 ; GFX1064-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
 ; GFX1064-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v9, 0
@@ -8709,10 +8673,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
 ; GFX1032-DPP-NEXT:    v_max_f64 v[8:9], v[10:11], v[8:9]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v11, v9, 0, 0
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
 ; GFX1032-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
 ; GFX1032-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
@@ -8840,11 +8802,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1164-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
 ; GFX1164-DPP-NEXT:    v_max_f64 v[8:9], v[10:11], v[8:9]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v11, v9, 0, 0
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
 ; GFX1164-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[10:11]
@@ -8978,21 +8937,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
 ; GFX1132-DPP-NEXT:    v_max_f64 v[8:9], v[10:11], v[8:9]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v11, v9, 0, 0
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
 ; GFX1132-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, v8
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v9
 ; GFX1132-DPP-NEXT:    s_mov_b32 s44, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index f672c9c6afa22b..26a0e34d18bdbd 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -833,8 +833,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
 ; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v3, 0, 0
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
 ; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 32
@@ -911,8 +910,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
 ; GFX1032-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v3, 0, 0
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
 ; GFX1032-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
@@ -978,19 +976,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
@@ -1054,17 +1050,15 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -1898,8 +1892,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
 ; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v3, 0, 0
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
 ; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 32
@@ -1976,8 +1969,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
 ; GFX1032-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v3, 0, 0
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
 ; GFX1032-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
@@ -2043,19 +2035,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
@@ -2119,17 +2109,15 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -2963,8 +2951,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
 ; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v3, 0, 0
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
 ; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 32
@@ -3041,8 +3028,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
 ; GFX1032-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v3, 0, 0
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
 ; GFX1032-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
@@ -3108,19 +3094,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
@@ -3184,17 +3168,15 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -5002,10 +4984,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
 ; GFX1064-DPP-NEXT:    v_min_f64 v[8:9], v[10:11], v[8:9]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v11, v9, 0, 0
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
 ; GFX1064-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
 ; GFX1064-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v9, 0
@@ -5142,10 +5122,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
 ; GFX1032-DPP-NEXT:    v_min_f64 v[8:9], v[10:11], v[8:9]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v11, v9, 0, 0
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
 ; GFX1032-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
 ; GFX1032-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
@@ -5273,11 +5251,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
 ; GFX1164-DPP-NEXT:    v_min_f64 v[8:9], v[10:11], v[8:9]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v11, v9, 0, 0
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
 ; GFX1164-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[10:11]
@@ -5411,21 +5386,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
 ; GFX1132-DPP-NEXT:    v_min_f64 v[8:9], v[10:11], v[8:9]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v11, v9, 0, 0
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
 ; GFX1132-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, v8
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v9
 ; GFX1132-DPP-NEXT:    s_mov_b32 s44, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -6452,10 +6426,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[3:4]
 ; GFX1064-DPP-NEXT:    v_min_f64 v[3:4], v[5:6], v[3:4]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v6, v4, 0, 0
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
 ; GFX1064-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[5:6]
 ; GFX1064-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v4, 0
@@ -6543,10 +6515,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[3:4]
 ; GFX1032-DPP-NEXT:    v_min_f64 v[3:4], v[5:6], v[3:4]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v6, v4, 0, 0
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
 ; GFX1032-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[5:6]
 ; GFX1032-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
@@ -6625,11 +6595,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX1164-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX1164-DPP-NEXT:    v_min_f64 v[2:3], v[4:5], v[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v4, v2, 0, 0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX1164-DPP-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
@@ -6732,21 +6699,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX1132-DPP-NEXT:    v_min_f64 v[2:3], v[4:5], v[2:3]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v4, v2, 0, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
 ; GFX1132-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v8, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v8
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -8569,10 +8535,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
 ; GFX1064-DPP-NEXT:    v_min_f64 v[8:9], v[10:11], v[8:9]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v11, v9, 0, 0
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
 ; GFX1064-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
 ; GFX1064-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v9, 0
@@ -8709,10 +8673,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
 ; GFX1032-DPP-NEXT:    v_min_f64 v[8:9], v[10:11], v[8:9]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v11, v9, 0, 0
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
 ; GFX1032-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
 ; GFX1032-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
@@ -8840,11 +8802,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1164-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
 ; GFX1164-DPP-NEXT:    v_min_f64 v[8:9], v[10:11], v[8:9]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v11, v9, 0, 0
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
 ; GFX1164-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[10:11]
@@ -8978,21 +8937,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
 ; GFX1132-DPP-NEXT:    v_min_f64 v[8:9], v[10:11], v[8:9]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v11, v9, 0, 0
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
 ; GFX1132-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, v8
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v9
 ; GFX1132-DPP-NEXT:    s_mov_b32 s44, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 2165a6ff65e3b5..c158a8007bcc53 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -995,8 +995,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v3, 0, 0
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 32
@@ -1077,8 +1076,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v3, 0, 0
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1148,16 +1146,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
@@ -1230,16 +1226,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
 ; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB1_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -2345,8 +2340,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v3, 0, 0
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 32
@@ -2427,8 +2421,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v3, 0, 0
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2498,16 +2491,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
@@ -2580,16 +2571,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
 ; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB3_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -3695,8 +3685,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v3, 0, 0
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 32
@@ -3777,8 +3766,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v3, 0, 0
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3848,16 +3836,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
@@ -3930,16 +3916,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
 ; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB5_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -4541,8 +4526,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v3, 0, 0
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 32
@@ -4623,8 +4607,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v3, 0, 0
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4694,16 +4677,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
@@ -4776,16 +4757,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
 ; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -5890,8 +5870,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v3, 0, 0
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 32
@@ -5972,8 +5951,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v3, 0, 0
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6043,16 +6021,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
@@ -6125,16 +6101,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
 ; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB8_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -7950,10 +7925,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_add_f64 v[8:9], v[10:11], v[8:9]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v11, v9, 0, 0
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
 ; GFX1064-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v9, 0
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v8, 0
@@ -8080,10 +8053,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_add_f64 v[8:9], v[10:11], v[8:9]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v11, v9, 0, 0
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
 ; GFX1032-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8201,11 +8172,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_add_f64 v[8:9], v[10:11], v[8:9]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v11, v9, 0, 0
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v11, v9
@@ -8327,20 +8295,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_add_f64 v[8:9], v[10:11], v[8:9]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v11, v9, 0, 0
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v8
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v42, v9
 ; GFX1132-DPP-NEXT:    s_mov_b32 s44, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -9531,10 +9497,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_add_f64 v[3:4], v[5:6], v[3:4]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v6, v4, 0, 0
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
 ; GFX1064-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v4, 0
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
@@ -9627,10 +9591,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_add_f64 v[3:4], v[5:6], v[3:4]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v6, v4, 0, 0
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
 ; GFX1032-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9714,11 +9676,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v4, v2, 0, 0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_add_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v5, v3
@@ -9808,20 +9767,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v4, v2, 0, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v8
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB12_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -10990,10 +10947,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_add_f64 v[3:4], v[5:6], v[3:4]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v6, v4, 0, 0
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
 ; GFX1064-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v4, 0
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
@@ -11086,10 +11041,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_add_f64 v[3:4], v[5:6], v[3:4]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v6, v4, 0, 0
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
 ; GFX1032-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -11173,11 +11126,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v4, v2, 0, 0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_add_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v5, v3
@@ -11267,20 +11217,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v4, v2, 0, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v8
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB14_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -11931,10 +11879,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_add_f64 v[3:4], v[5:6], v[3:4]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v6, v4, 0, 0
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
 ; GFX1064-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v4, 0
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
@@ -12027,10 +11973,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_add_f64 v[3:4], v[5:6], v[3:4]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v6, v4, 0, 0
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
 ; GFX1032-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -12114,11 +12058,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v4, v2, 0, 0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_add_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v5, v3
@@ -12208,20 +12149,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v4, v2, 0, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v8
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB15_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -14068,10 +14007,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_add_f64 v[8:9], v[10:11], v[8:9]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v11, v9, 0, 0
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
 ; GFX1064-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v9, 0
 ; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v8, 0
@@ -14198,10 +14135,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_add_f64 v[8:9], v[10:11], v[8:9]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v11, v9, 0, 0
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
 ; GFX1032-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14319,11 +14254,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_add_f64 v[8:9], v[10:11], v[8:9]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v11, v9, 0, 0
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v11, v9
@@ -14445,20 +14377,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_add_f64 v[8:9], v[10:11], v[8:9]
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v11, v9, 0, 0
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v8
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v42, v9
 ; GFX1132-DPP-NEXT:    s_mov_b32 s44, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB17_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:



More information about the llvm-commits mailing list