[llvm] [MachineSink] Consider multiple instructions when sinking into cycle (PR #142641)

via llvm-commits llvm-commits at lists.llvm.org
Tue Jun 3 10:09:37 PDT 2025


https://github.com/joe-rivos created https://github.com/llvm/llvm-project/pull/142641

RFC link: todo

Instructions are grouped into a sink unit called a "chain" and
considered whether it's profitable to sink as a unit by looking at how
many liveins and defs the chain has. If the chain has more defs than
liveins, then sinking it will (almost always) reduce register pressure
by the difference. Multiple chains sharing a livein are considered to
not introduce this livein.

The max register pressure is estimated in the cycle by estimating cycle
liveins and using a reg pressure tracker for each block. Chains are sunk
if they positively affect the critical pressure sets until the set is no
longer above the max.

This approach relies on the fact that values in the preheader will be
live throughout the entire loop. So sinking a chain next to a use will
almost always reduce pressure.

>From 37e23aafd4d64cdddc77cedc13ab3cad763a06ee Mon Sep 17 00:00:00 2001
From: Joseph Faulls <jofa at rivosinc.com>
Date: Tue, 3 Jun 2025 11:13:12 +0000
Subject: [PATCH 1/2] [MachineSink] Add pre-commit test for extended cycle sink
 capability

---
 .../CodeGen/AMDGPU/machine-sink-cycle.mir     | 572 ++++++++++++++++++
 1 file changed, 572 insertions(+)

diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-cycle.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-cycle.mir
index f718749bff3d3..c3e333baf6080 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-cycle.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-cycle.mir
@@ -1324,3 +1324,575 @@ body:             |
     S_ENDPGM 0
 ...
 
+# Sinking will increase pressure in loop
+---
+name:            test_sink_no_increase
+alignment:       1
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  ; GFX9-SUNK-LABEL: name: test_sink_no_increase
+  ; GFX9-SUNK: bb.0:
+  ; GFX9-SUNK-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[DEF:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF1:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF3:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF4:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF5:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF6:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF7:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.1
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.1:
+  ; GFX9-SUNK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF8]], [[DEF9]], 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]], implicit [[DEF5]], implicit [[DEF6]], implicit [[DEF7]]
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]]
+  ; GFX9-SUNK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.2
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.2:
+  ; GFX9-SUNK-NEXT:   S_ENDPGM 0
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.3.entry:
+  ;
+  ; GFX10-SUNK-LABEL: name: test_sink_no_increase
+  ; GFX10-SUNK: bb.0:
+  ; GFX10-SUNK-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   [[DEF:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF1:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF2:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF3:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF4:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF5:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF6:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF7:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.1
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.1:
+  ; GFX10-SUNK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF8]], [[DEF9]], 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]], implicit [[DEF5]], implicit [[DEF6]], implicit [[DEF7]]
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]]
+  ; GFX10-SUNK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.2
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.2:
+  ; GFX10-SUNK-NEXT:   S_ENDPGM 0
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.3.entry:
+  bb.0:
+    %0:vreg_256 = IMPLICIT_DEF
+    %1:vreg_256 = IMPLICIT_DEF
+    %2:vreg_256 = IMPLICIT_DEF
+    %3:vreg_256 = IMPLICIT_DEF
+    %4:vreg_256 = IMPLICIT_DEF
+    %5:vreg_256 = IMPLICIT_DEF
+    %6:vreg_256 = IMPLICIT_DEF
+    %7:vreg_256 = IMPLICIT_DEF
+    %8:vreg_256 = IMPLICIT_DEF
+    %9:vgpr_32 = IMPLICIT_DEF
+    %10:vgpr_32 = IMPLICIT_DEF
+    %11:vgpr_32 = V_ADD_U32_e64 %9, %10, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    INLINEASM &"", 1, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4, implicit %6, implicit %7, implicit %8
+    INLINEASM &"", 1, implicit %11
+    S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+    S_BRANCH %bb.5
+
+  bb.5:
+    S_ENDPGM 0
+...
+
+# Sinking %11 and %12 will not decrease rp
+---
+name:            test_sink_no_rp_decrease
+alignment:       1
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  ; GFX9-SUNK-LABEL: name: test_sink_no_rp_decrease
+  ; GFX9-SUNK: bb.0:
+  ; GFX9-SUNK-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[DEF:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF1:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF3:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF4:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF5:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF6:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF7:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.1
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.1:
+  ; GFX9-SUNK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF8]], [[DEF9]], 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_MUL_U32_U24_e64_:%[0-9]+]]:vgpr_32 = V_MUL_U32_U24_e64 [[V_ADD_U32_e64_]], [[DEF9]], 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]], implicit [[DEF5]], implicit [[DEF6]], implicit [[DEF7]]
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_MUL_U32_U24_e64_]]
+  ; GFX9-SUNK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.2
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.2:
+  ; GFX9-SUNK-NEXT:   S_ENDPGM 0
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.3.entry:
+  ;
+  ; GFX10-SUNK-LABEL: name: test_sink_no_rp_decrease
+  ; GFX10-SUNK: bb.0:
+  ; GFX10-SUNK-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   [[DEF:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF1:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF2:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF3:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF4:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF5:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF6:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF7:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.1
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.1:
+  ; GFX10-SUNK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF8]], [[DEF9]], 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_MUL_U32_U24_e64_:%[0-9]+]]:vgpr_32 = V_MUL_U32_U24_e64 [[V_ADD_U32_e64_]], [[DEF9]], 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]], implicit [[DEF5]], implicit [[DEF6]], implicit [[DEF7]]
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_MUL_U32_U24_e64_]]
+  ; GFX10-SUNK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.2
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.2:
+  ; GFX10-SUNK-NEXT:   S_ENDPGM 0
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.3.entry:
+  bb.0:
+    %0:vreg_256 = IMPLICIT_DEF
+    %1:vreg_256 = IMPLICIT_DEF
+    %2:vreg_256 = IMPLICIT_DEF
+    %3:vreg_256 = IMPLICIT_DEF
+    %4:vreg_256 = IMPLICIT_DEF
+    %5:vreg_256 = IMPLICIT_DEF
+    %6:vreg_256 = IMPLICIT_DEF
+    %7:vreg_256 = IMPLICIT_DEF
+    %8:vreg_256 = IMPLICIT_DEF
+    %9:vgpr_32 = IMPLICIT_DEF
+    %10:vgpr_32 = IMPLICIT_DEF
+    %11:vgpr_32 = V_ADD_U32_e64 %9, %10, 0, implicit $exec
+    %12:vgpr_32 = V_MUL_U32_U24_e64 %11, %10, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    INLINEASM &"", 1, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4, implicit %6, implicit %7, implicit %8
+    INLINEASM &"", 1, implicit %11, implicit %12
+    S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+    S_BRANCH %bb.5
+
+  bb.5:
+    S_ENDPGM 0
+...
+
+
+# Drecreases rp only if %11 and %12 are sunk together
+---
+name:            test_sink_rp_decrease
+alignment:       1
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  ; GFX9-SUNK-LABEL: name: test_sink_rp_decrease
+  ; GFX9-SUNK: bb.0:
+  ; GFX9-SUNK-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[DEF:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF1:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF3:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF4:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF5:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF6:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF7:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.1
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.1:
+  ; GFX9-SUNK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]], implicit [[DEF5]], implicit [[DEF6]], implicit [[DEF7]]
+  ; GFX9-SUNK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.3
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.2:
+  ; GFX9-SUNK-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF8]], [[DEF8]], 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_MUL_U32_U24_e64_:%[0-9]+]]:vgpr_32 = V_MUL_U32_U24_e64 [[V_ADD_U32_e64_]], [[DEF8]], 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_MUL_U32_U24_e64_]]
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.4
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.3:
+  ; GFX9-SUNK-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF8]], [[DEF8]], 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_1]]
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.4
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.4:
+  ; GFX9-SUNK-NEXT:   successors: %bb.1(0x40000000), %bb.5(0x40000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.5
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.5:
+  ; GFX9-SUNK-NEXT:   S_ENDPGM 0
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.6.entry:
+  ;
+  ; GFX10-SUNK-LABEL: name: test_sink_rp_decrease
+  ; GFX10-SUNK: bb.0:
+  ; GFX10-SUNK-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   [[DEF:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF1:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF2:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF3:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF4:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF5:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF6:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF7:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.1
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.1:
+  ; GFX10-SUNK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]], implicit [[DEF5]], implicit [[DEF6]], implicit [[DEF7]]
+  ; GFX10-SUNK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.3
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.2:
+  ; GFX10-SUNK-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF8]], [[DEF8]], 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_MUL_U32_U24_e64_:%[0-9]+]]:vgpr_32 = V_MUL_U32_U24_e64 [[V_ADD_U32_e64_]], [[DEF8]], 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_MUL_U32_U24_e64_]]
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.4
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.3:
+  ; GFX10-SUNK-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF8]], [[DEF8]], 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_1]]
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.4
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.4:
+  ; GFX10-SUNK-NEXT:   successors: %bb.1(0x40000000), %bb.5(0x40000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.5
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.5:
+  ; GFX10-SUNK-NEXT:   S_ENDPGM 0
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.6.entry:
+  bb.0:
+    %0:vreg_256 = IMPLICIT_DEF
+    %1:vreg_256 = IMPLICIT_DEF
+    %2:vreg_256 = IMPLICIT_DEF
+    %3:vreg_256 = IMPLICIT_DEF
+    %4:vreg_256 = IMPLICIT_DEF
+    %5:vreg_256 = IMPLICIT_DEF
+    %6:vreg_256 = IMPLICIT_DEF
+    %7:vreg_256 = IMPLICIT_DEF
+    %8:vreg_256 = IMPLICIT_DEF
+    %9:vgpr_32 = IMPLICIT_DEF
+    %10:vgpr_32 = IMPLICIT_DEF
+    %11:vgpr_32 = V_ADD_U32_e64 %10, %10, 0, implicit $exec
+    %12:vgpr_32 = V_MUL_U32_U24_e64 %11, %10, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    INLINEASM &"", 1, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4, implicit %6, implicit %7, implicit %8
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+    S_BRANCH %bb.3
+
+  bb.2:
+    INLINEASM &"", 1, implicit %11, implicit %12
+    S_BRANCH %bb.4
+
+  bb.3:
+    INLINEASM &"", 1, implicit %11
+    S_BRANCH %bb.4
+
+  bb.4:
+    S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+    S_BRANCH %bb.5
+
+  bb.5:
+    S_ENDPGM 0
+...
+
+# Prioritise sinking %13 as it's used fewer times
+---
+name:            test_sink_fewest_uses
+alignment:       1
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  ; GFX9-SUNK-LABEL: name: test_sink_fewest_uses
+  ; GFX9-SUNK: bb.0:
+  ; GFX9-SUNK-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[DEF:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF1:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF3:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF4:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF5:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF6:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF9]], [[DEF9]], 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF10]], [[DEF10]], 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.1
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.1:
+  ; GFX9-SUNK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]], implicit [[DEF5]], implicit [[DEF6]], implicit [[DEF7]], implicit [[DEF8]], implicit [[DEF9]], implicit [[DEF10]]
+  ; GFX9-SUNK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.3
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.2:
+  ; GFX9-SUNK-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]]
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.4
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.3:
+  ; GFX9-SUNK-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]]
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.4
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.4:
+  ; GFX9-SUNK-NEXT:   successors: %bb.1(0x40000000), %bb.5(0x40000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.5
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.5:
+  ; GFX9-SUNK-NEXT:   S_ENDPGM 0
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.6.entry:
+  ;
+  ; GFX10-SUNK-LABEL: name: test_sink_fewest_uses
+  ; GFX10-SUNK: bb.0:
+  ; GFX10-SUNK-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   [[DEF:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF1:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF2:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF3:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF4:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF5:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF6:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.1
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.1:
+  ; GFX10-SUNK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]], implicit [[DEF5]], implicit [[DEF6]], implicit [[DEF7]], implicit [[DEF8]], implicit [[DEF9]], implicit [[DEF10]]
+  ; GFX10-SUNK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.3
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.2:
+  ; GFX10-SUNK-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF9]], [[DEF9]], 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF10]], [[DEF10]], 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]]
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.4
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.3:
+  ; GFX10-SUNK-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF9]], [[DEF9]], 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_2]]
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.4
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.4:
+  ; GFX10-SUNK-NEXT:   successors: %bb.1(0x40000000), %bb.5(0x40000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.5
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.5:
+  ; GFX10-SUNK-NEXT:   S_ENDPGM 0
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.6.entry:
+  bb.0:
+    %0:vreg_256 = IMPLICIT_DEF
+    %1:vreg_256 = IMPLICIT_DEF
+    %2:vreg_256 = IMPLICIT_DEF
+    %3:vreg_256 = IMPLICIT_DEF
+    %4:vreg_256 = IMPLICIT_DEF
+    %5:vreg_256 = IMPLICIT_DEF
+    %6:vreg_256 = IMPLICIT_DEF
+    %7:vreg_256 = IMPLICIT_DEF
+    %8:vreg_128 = IMPLICIT_DEF
+    %9:vgpr_32 = IMPLICIT_DEF
+    %10:vgpr_32 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vgpr_32 = V_ADD_U32_e64 %10, %10, 0, implicit $exec
+    %13:vgpr_32 = V_ADD_U32_e64 %11, %11, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    INLINEASM &"", 1, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4, implicit %6, implicit %7, implicit %8, implicit %9, implicit %10, implicit %11
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+    S_BRANCH %bb.3
+
+  bb.2:
+    INLINEASM &"", 1, implicit %12, implicit %13
+    S_BRANCH %bb.4
+
+  bb.3:
+    INLINEASM &"", 1, implicit %12
+    S_BRANCH %bb.4
+
+  bb.4:
+    S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+    S_BRANCH %bb.5
+
+  bb.5:
+    S_ENDPGM 0
+...
+
+---
+name:            test_high_rp_inside
+alignment:       1
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  ; GFX9-SUNK-LABEL: name: test_high_rp_inside
+  ; GFX9-SUNK: bb.0:
+  ; GFX9-SUNK-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[DEF:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF1:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF3:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF4]], [[DEF5]], 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_MUL_U32_U24_e64_:%[0-9]+]]:vgpr_32 = V_MUL_U32_U24_e64 [[V_ADD_U32_e64_]], [[DEF5]], 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.1
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.1:
+  ; GFX9-SUNK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]]
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, 12 /* clobber */, def %8
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, 12 /* clobber */, def %9
+  ; GFX9-SUNK-NEXT:   [[COPY:%[0-9]+]]:vreg_256_align2 = COPY %8
+  ; GFX9-SUNK-NEXT:   [[COPY1:%[0-9]+]]:vreg_256_align2 = COPY %9
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_MUL_U32_U24_e64_]], implicit %8, implicit %9, implicit [[COPY]], implicit [[COPY1]], implicit [[DEF4]], implicit [[DEF5]]
+  ; GFX9-SUNK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.2
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.2:
+  ; GFX9-SUNK-NEXT:   S_ENDPGM 0
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.3.entry:
+  ; GFX9-SUNK-NEXT:   successors:
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.4.entry:
+  ;
+  ; GFX10-SUNK-LABEL: name: test_high_rp_inside
+  ; GFX10-SUNK: bb.0:
+  ; GFX10-SUNK-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   [[DEF:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF1:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF2:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF3:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.1
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.1:
+  ; GFX10-SUNK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF4]], [[DEF5]], 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_MUL_U32_U24_e64_:%[0-9]+]]:vgpr_32 = V_MUL_U32_U24_e64 [[V_ADD_U32_e64_]], [[DEF5]], 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]]
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, 12 /* clobber */, def %8
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, 12 /* clobber */, def %9
+  ; GFX10-SUNK-NEXT:   [[COPY:%[0-9]+]]:vreg_256 = COPY %8
+  ; GFX10-SUNK-NEXT:   [[COPY1:%[0-9]+]]:vreg_256 = COPY %9
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_MUL_U32_U24_e64_]], implicit %8, implicit %9, implicit [[COPY]], implicit [[COPY1]], implicit [[DEF4]], implicit [[DEF5]]
+  ; GFX10-SUNK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.2
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.2:
+  ; GFX10-SUNK-NEXT:   S_ENDPGM 0
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.3.entry:
+  ; GFX10-SUNK-NEXT:   successors:
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.4.entry:
+  bb.0:
+    %0:vreg_256 = IMPLICIT_DEF
+    %1:vreg_256 = IMPLICIT_DEF
+    %2:vreg_256 = IMPLICIT_DEF
+    %3:vreg_256 = IMPLICIT_DEF
+    %4:vgpr_32 = IMPLICIT_DEF
+    %5:vgpr_32 = IMPLICIT_DEF
+    %6:vgpr_32 = V_ADD_U32_e64 %4, %5, 0, implicit $exec
+    %7:vgpr_32 = V_MUL_U32_U24_e64 %6, %5, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    INLINEASM &"", 1 /* sideeffect attdialect */, implicit %0, implicit %1, implicit %2, implicit %3
+    INLINEASM &"", 1 /* sideeffect attdialect */, 12 /* clobber */, def %8:vreg_256
+    INLINEASM &"", 1 /* sideeffect attdialect */, 12 /* clobber */, def %9:vreg_256
+    %10:vreg_256 = COPY %8
+    %11:vreg_256 = COPY %9
+    INLINEASM &"", 1 /* sideeffect attdialect */, implicit %6, implicit %7, implicit %8, implicit %9, implicit %10, implicit %11, implicit %4, implicit %5
+    S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+  bb.3.entry:
+...

>From 3c1900dbdc0cf985c52c13b0188f736322aaf410 Mon Sep 17 00:00:00 2001
From: Joseph Faulls <jofa at rivosinc.com>
Date: Thu, 29 May 2025 10:45:20 +0000
Subject: [PATCH 2/2] [MachineSink] Consider multiple instructions when sinking
 into cycle

Instructions are grouped into a sink unit called a "chain" and
considered whether it's profitable to sink as a unit by looking at how
many liveins and defs the chain has. If the chain has more defs than
liveins, then sinking it will (almost always) reduce register pressure
by the difference. Multiple chains sharing a livein are considered to
not introduce this livein.

The max register pressure is estimated in the cycle by estimating cycle
liveins and using a reg pressure tracker for each block. Chains are sunk
if they positively affect the critical pressure sets until the set is no
longer above the max.

This approach relies on the fact that values in the preheader will be
live throughout the entire loop. So sinking a chain next to a use will
almost always reduce pressure.
---
 llvm/lib/CodeGen/MachineSink.cpp              | 721 +++++++++++++----
 .../CodeGen/AMDGPU/machine-sink-cycle.mir     | 752 ++++++++----------
 .../SystemZ/machinelicm-sunk-kill-flags.mir   |  55 +-
 3 files changed, 917 insertions(+), 611 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index e3f6eda8ff065..417e56a908e6b 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -112,6 +112,17 @@ static cl::opt<unsigned> SinkIntoCycleLimit(
         "The maximum number of instructions considered for cycle sinking."),
     cl::init(50), cl::Hidden);
 
+static cl::opt<unsigned>
+    SinkMaxFanOut("machine-sink-max-fanout",
+                  cl::desc("The maximum fan-out count for a def to consider it "
+                           "for sinking into a cycle"),
+                  cl::init(2), cl::Hidden);
+
+static cl::opt<unsigned> CloneUsesThreshold(
+    "machine-sink-clone-uses-threshold",
+    cl::desc("Maximum number of uses for a sink candidate to be cloned."),
+    cl::init(8), cl::Hidden);
+
 STATISTIC(NumSunk, "Number of machine instructions sunk");
 STATISTIC(NumCycleSunk, "Number of machine instructions sunk into a cycle");
 STATISTIC(NumSplit, "Number of critical edges split");
@@ -216,6 +227,16 @@ class MachineSinking {
   }
 
 private:
+  // Represents a use-def chain of instructions that can be sunk together.
+  struct Chain {
+    SmallVector<MachineInstr *> Instrs;
+    SmallSet<Register, 4> LiveIns;
+    SmallSet<Register, 4> Defs;
+    SmallVector<int, 8> PSetDiff;
+    unsigned TotalLatency;
+    unsigned NumUses;
+  };
+
   bool ProcessBlock(MachineBasicBlock &MBB);
   void ProcessDbgInst(MachineInstr &MI);
   bool isLegalToBreakCriticalEdge(MachineInstr &MI, MachineBasicBlock *From,
@@ -260,9 +281,19 @@ class MachineSinking {
   void FindCycleSinkCandidates(MachineCycle *Cycle, MachineBasicBlock *BB,
                                SmallVectorImpl<MachineInstr *> &Candidates);
 
-  bool
-  aggressivelySinkIntoCycle(MachineCycle *Cycle, MachineInstr &I,
-                            DenseMap<SinkItem, MachineInstr *> &SunkInstrs);
+  MachineInstr *getEarliestSinkPoint(MachineInstr *MI1, MachineInstr *MI2);
+  void FilterChains(const MachineCycle *Cycle, std::vector<Chain> &Chains);
+  void SortIntoUseDefChains(SmallVectorImpl<MachineInstr *> &Instrs,
+                            std::vector<Chain> &Chains,
+                            const DenseSet<Register> &CycleLiveIns);
+  bool SinkChainIntoCycle(MachineCycle *Cycle, Chain &C,
+                          std::vector<int> &OverMaxSetPressure);
+  void CalculateMaxSetPressures(const MachineBasicBlock *MBB,
+                                const DenseSet<Register> &CycleLiveIns,
+                                std::vector<unsigned> &MaxSetPressure);
+  void CalculateMaxSetPressures(const MachineCycle *Cycle,
+                                const DenseSet<Register> &CycleLiveIns,
+                                std::vector<unsigned> &MaxSetPressure);
 
   bool isProfitableToSinkTo(Register Reg, MachineInstr &MI,
                             MachineBasicBlock *MBB,
@@ -284,8 +315,6 @@ class MachineSinking {
   bool registerPressureSetExceedsLimit(unsigned NRegs,
                                        const TargetRegisterClass *RC,
                                        const MachineBasicBlock &MBB);
-
-  bool registerPressureExceedsLimit(const MachineBasicBlock &MBB);
 };
 
 class MachineSinkingLegacy : public MachineFunctionPass {
@@ -715,7 +744,14 @@ void MachineSinking::FindCycleSinkCandidates(
     MachineCycle *Cycle, MachineBasicBlock *BB,
     SmallVectorImpl<MachineInstr *> &Candidates) {
   for (auto &MI : *BB) {
+    if (MI.isDebugInstr())
+      continue;
     LLVM_DEBUG(dbgs() << "CycleSink: Analysing candidate: " << MI);
+    // TODO: support instructions with multiple defs
+    if (MI.getNumDefs() > 1) {
+      LLVM_DEBUG(dbgs() << "CycleSink: not sinking multi-def instruction\n");
+      continue;
+    }
     if (MI.isMetaInstruction()) {
       LLVM_DEBUG(dbgs() << "CycleSink: not sinking meta instruction\n");
       continue;
@@ -741,10 +777,10 @@ void MachineSinking::FindCycleSinkCandidates(
     if (MI.isConvergent())
       continue;
 
-    const MachineOperand &MO = MI.getOperand(0);
-    if (!MO.isReg() || !MO.getReg() || !MO.isDef())
+    const MachineOperand &DefMO = MI.getOperand(0);
+    if (!DefMO.isReg() || !DefMO.getReg() || !DefMO.isDef())
       continue;
-    if (!MRI->hasOneDef(MO.getReg()))
+    if (!MRI->hasOneDef(DefMO.getReg()))
       continue;
 
     LLVM_DEBUG(dbgs() << "CycleSink: Instruction added as candidate.\n");
@@ -752,6 +788,282 @@ void MachineSinking::FindCycleSinkCandidates(
   }
 }
 
+void MachineSinking::SortIntoUseDefChains(
+    SmallVectorImpl<MachineInstr *> &Instrs, std::vector<Chain> &UserChains,
+    const DenseSet<Register> &CycleLiveIns) {
+  DenseMap<Register, MachineInstr *> DefToInstr;
+  DenseMap<Register, unsigned> UseCounts;
+  SmallSet<MachineInstr *, 8> SkipMI;
+
+  // Do not consider defs with high fan-out count, I.E a def with many uses in
+  // the chain. Unrestricting this can lead to huge chains. This is bad for two
+  // reasons. Firstly, one node with a high use-count is likely to have a large
+  // liverange, so sinking it is less beneficial. Secondly, if there is one
+  // instruction in a chain that cannot be sunk, then this prevents the entire
+  // chain to be sunk. Splitting it up allows more granularity to both avoid
+  // this and be better for SinkIntoCycleLimit.
+  for (MachineInstr *MI : Instrs) {
+    for (const MachineOperand &MO : MI->operands()) {
+      if (MO.isReg()) {
+        Register Reg = MO.getReg();
+        if (MO.isDef()) {
+          DefToInstr[Reg] = MI;
+        } else if (MO.isUse()) {
+          UseCounts[Reg]++;
+          if (UseCounts[Reg] > SinkMaxFanOut) {
+            LLVM_DEBUG(dbgs() << "CycleSink: Not considering sinking " << *MI
+                              << ". Uses exceeds max fanout.\n");
+            SkipMI.insert(DefToInstr[Reg]);
+          }
+        }
+      }
+    }
+  }
+
+  DenseMap<Register, unsigned> RegToChain;
+  DenseMap<MachineInstr *, unsigned> InstrToChain;
+  SmallVector<Chain, 8> Chains;
+  // Process each instruction, placing it into a chain based on defs/uses.
+  unsigned NextChainID = 0;
+  for (MachineInstr *MI : Instrs) {
+    if (SkipMI.contains(MI))
+      continue;
+    SmallSet<unsigned, 4> RelatedChains;
+    SmallSet<Register, 4> LiveIns;
+
+    // Check uses to see if this instruction interacts with a chain.
+    for (const MachineOperand &MO : MI->operands()) {
+      if (!MO.isReg())
+        continue;
+      Register Reg = MO.getReg();
+
+      // Skip physical regs as they are known to be cycle invariant.
+      if (!Reg.isVirtual())
+        continue;
+
+      auto It = RegToChain.find(Reg);
+      if (It != RegToChain.end())
+        RelatedChains.insert(It->second);
+      else if (MO.isUse() && !CycleLiveIns.contains(Reg))
+        LiveIns.insert(Reg);
+    }
+
+    unsigned ChainID;
+    if (RelatedChains.empty()) {
+      // No related chain — create a new one.
+      ChainID = NextChainID++;
+      Chains.resize(NextChainID);
+    } else if (RelatedChains.size() == 1) {
+      // Single related chain — join it.
+      ChainID = *RelatedChains.begin();
+    } else {
+      // Multiple related chains — merge together.
+      ChainID = *RelatedChains.begin();
+      Chain &MainChain = Chains[ChainID];
+
+      for (auto It = ++RelatedChains.begin(); It != RelatedChains.end(); ++It) {
+        Chain &Other = Chains[*It];
+        MainChain.Instrs.append(Other.Instrs.begin(), Other.Instrs.end());
+        for (Register R : Other.LiveIns)
+          MainChain.LiveIns.insert(R);
+
+        // Update mappings
+        for (MachineInstr *MovedMI : Other.Instrs)
+          InstrToChain[MovedMI] = ChainID;
+
+        for (const MachineInstr *MovedMI : Other.Instrs) {
+          for (const MachineOperand &MO : MovedMI->operands()) {
+            if (!MO.isReg())
+              continue;
+            Register Reg = MO.getReg();
+            if (Reg && RegToChain.contains(Reg))
+              RegToChain[Reg] = ChainID;
+          }
+        }
+        Other.Instrs.clear();
+        Other.LiveIns.clear();
+      }
+    }
+
+    // Place this instruction into the chosen chain.
+    Chain &C = Chains[ChainID];
+    C.Instrs.push_back(MI);
+    for (Register R : LiveIns)
+      C.LiveIns.insert(R);
+    InstrToChain[MI] = ChainID;
+
+    // Update reg -> chain mappings for all defs.
+    for (const MachineOperand &MO : MI->operands()) {
+      if (MO.isReg() && MO.isDef())
+        RegToChain[MO.getReg()] = ChainID;
+    }
+  }
+
+  // Now calculate defs of each chain by traversing our list of instructions
+  // bottom-up, keeping a list of visited so that we don't count them as an
+  // external use.
+  for (Chain &C : Chains) {
+    SmallPtrSet<MachineInstr *, 4> VisitedMI;
+    unsigned TotalLatency = 0;
+    unsigned NumUses = 0;
+    for (MachineInstr *MI : llvm::reverse(C.Instrs)) {
+      TotalLatency += SchedModel.computeInstrLatency(MI, true);
+      VisitedMI.insert(MI);
+      for (MachineOperand &MO : MI->defs()) {
+        for (MachineInstr &UseMI : MRI->use_instructions(MO.getReg())) {
+          if (VisitedMI.count(&UseMI))
+            continue;
+          NumUses++;
+          C.Defs.insert(MO.getReg());
+        }
+      }
+    }
+    C.TotalLatency = TotalLatency;
+    C.NumUses = NumUses;
+    // No def chains can exist due to earlier passes not cleaning up after
+    // themselves. Ignore them as they will be cleaned up later.
+    if (C.Defs.empty())
+      C.Instrs.clear();
+  }
+
+  // Copy all our working chains into the user-supplied list
+  for (Chain &C : Chains)
+    if (!C.Instrs.empty())
+      UserChains.push_back(C);
+}
+
+/// Helper function to calculate cycle liveins.
+static void calculateCycleLiveIns(const MachineCycle *Cycle,
+                                  DenseSet<Register> &CycleLiveIns) {
+  DenseSet<Register> Defs;
+
+  // Collect defs.
+  for (const MachineBasicBlock *MBB : Cycle->blocks()) {
+    for (const MachineInstr &MI : *MBB) {
+      if (MI.isDebugInstr())
+        continue;
+      for (const MachineOperand &MO : MI.operands()) {
+        if (MO.isReg() && MO.isDef() &&
+            Register::isVirtualRegister(MO.getReg()))
+          Defs.insert(MO.getReg());
+      }
+    }
+  }
+
+  // A register that is used but never defined in the cycle is considered a
+  // live-in.
+  for (const MachineBasicBlock *MBB : Cycle->blocks()) {
+    for (const MachineInstr &MI : *MBB) {
+      if (MI.isDebugInstr())
+        continue;
+      for (const MachineOperand &MO : MI.operands()) {
+        if (MO.isReg() && MO.isUse() &&
+            Register::isVirtualRegister(MO.getReg())) {
+          Register Reg = MO.getReg();
+          if (!Defs.count(Reg)) {
+            CycleLiveIns.insert(Reg);
+          }
+        }
+      }
+    }
+  }
+}
+
+/// If sinking a chain or series of chains introduces fewer liveins than defs,
+/// it is considered profitable. Remove any chains that are unprofitable.
+void MachineSinking::FilterChains(const MachineCycle *Cycle,
+                                  std::vector<Chain> &Chains) {
+  std::vector<Chain> FilteredChains;
+
+  // Record chain live-in frequency.
+  SmallDenseMap<Register, unsigned> RegToLiveInCount;
+  for (Chain &C : Chains) {
+    for (const Register &LiveIn : C.LiveIns) {
+      RegToLiveInCount[LiveIn]++;
+    }
+  }
+  // Add a chain to the list of FilteredChains if it has more defs than new
+  // liveins. Sharing a livein with another chain is considered as not adding a
+  // new livein.
+  for (Chain &C : Chains) {
+    assert(C.Defs.size() > 0 && "Chain with no defs?");
+    // Create a pressure set diff to see how sinking this chain would affect
+    // register pressure.
+    C.PSetDiff = SmallVector<int, 8>(TRI->getNumRegPressureSets(), 0);
+    unsigned NumNewLiveIns = 0;
+    for (const Register &LiveIn : C.LiveIns) {
+      // If exactly one chain has this register as a livein and it is not
+      // already live in the cycle, consider this as adding a new livein if
+      // sunk.
+      if (RegToLiveInCount[LiveIn] == 1) {
+        NumNewLiveIns++;
+        // Increase register pressure for this new livein.
+        const auto *RC = MRI->getRegClass(LiveIn);
+        const int *PSetID = TRI->getRegClassPressureSets(RC);
+        for (; *PSetID != -1; ++PSetID)
+          C.PSetDiff[*PSetID] += TRI->getRegClassWeight(RC).RegWeight;
+      }
+    }
+    for (const Register &Def : C.Defs) {
+      // Decrease register pressure for this def.
+      const auto *RC = MRI->getRegClass(Def);
+      const int *PSetID = TRI->getRegClassPressureSets(RC);
+      for (; *PSetID != -1; ++PSetID)
+        C.PSetDiff[*PSetID] -= TRI->getRegClassWeight(RC).RegWeight;
+    }
+    if (C.Defs.size() > NumNewLiveIns)
+      FilteredChains.push_back(C);
+  }
+  Chains = FilteredChains;
+}
+
+void MachineSinking::CalculateMaxSetPressures(
+    const MachineBasicBlock *MBB, const DenseSet<Register> &LiveIns,
+    std::vector<unsigned> &MaxSetPressure) {
+  assert(MaxSetPressure.size() == TRI->getNumRegPressureSets());
+  // Create a list of live regs to pre-populate the RPTracker.
+  SmallVector<VRegMaskOrUnit, 8> LiveInRegs;
+  // TODO: Track lanes of liveins
+  for (unsigned Reg : LiveIns)
+    LiveInRegs.emplace_back(Reg, LaneBitmask::getAll());
+  RegionPressure Pressure;
+  RegPressureTracker RPTracker(Pressure);
+  RPTracker.init(MBB->getParent(), &RegClassInfo, nullptr, MBB, MBB->end(),
+                 /*TrackLaneMasks*/ false, /*TrackUntiedDefs=*/true);
+  RPTracker.addLiveRegs(LiveInRegs);
+  for (auto MII = MBB->instr_end(), MIE = MBB->instr_begin(); MII != MIE;
+       --MII) {
+    const MachineInstr &MI = *std::prev(MII);
+    if (MI.isDebugInstr() || MI.isPseudoProbe())
+      continue;
+    RegisterOperands RegOpers;
+    RegOpers.collect(MI, *TRI, *MRI, false, false);
+    RPTracker.recedeSkipDebugValues();
+    assert(&*RPTracker.getPos() == &MI && "RPTracker sync error!");
+    RPTracker.recede(RegOpers);
+  }
+  RPTracker.closeRegion();
+  for (unsigned I = 0; I < TRI->getNumRegPressureSets(); I++)
+    MaxSetPressure[I] =
+        std::max(MaxSetPressure[I], RPTracker.getPressure().MaxSetPressure[I]);
+}
+
+/// Estimate max register pressure in the MachineCycle by traversing each
+/// MachineBasicBlock with a RegPressureTracker, prepopulating the tracker with
+/// pre-calculated LiveIns.
+void MachineSinking::CalculateMaxSetPressures(
+    const MachineCycle *Cycle, const DenseSet<Register> &CycleLiveIns,
+    std::vector<unsigned> &MaxSetPressure) {
+
+  assert(MaxSetPressure.size() == TRI->getNumRegPressureSets());
+  for (const MachineBasicBlock *MBB : Cycle->blocks())
+    CalculateMaxSetPressures(MBB, CycleLiveIns, MaxSetPressure);
+
+  const DenseSet<Register> NoLiveIns;
+  CalculateMaxSetPressures(Cycle->getCyclePreheader(), NoLiveIns,
+                           MaxSetPressure);
+}
+
 PreservedAnalyses
 MachineSinkingPass::run(MachineFunction &MF,
                         MachineFunctionAnalysisManager &MFAM) {
@@ -873,63 +1185,77 @@ bool MachineSinking::run(MachineFunction &MF) {
   }
 
   if (SinkInstsIntoCycle) {
-    SmallVector<MachineCycle *, 8> Cycles(CI->toplevel_cycles());
     SchedModel.init(STI);
-    bool HasHighPressure;
-
-    DenseMap<SinkItem, MachineInstr *> SunkInstrs;
-
-    enum CycleSinkStage { COPY, LOW_LATENCY, AGGRESSIVE, END };
-    for (unsigned Stage = CycleSinkStage::COPY; Stage != CycleSinkStage::END;
-         ++Stage, SunkInstrs.clear()) {
-      HasHighPressure = false;
-
-      for (auto *Cycle : Cycles) {
-        MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
-        if (!Preheader) {
-          LLVM_DEBUG(dbgs() << "CycleSink: Can't find preheader\n");
-          continue;
+    SmallVector<MachineCycle *, 8> Cycles(CI->toplevel_begin(),
+                                          CI->toplevel_end());
+    for (auto *Cycle : Cycles) {
+      MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
+      if (!Preheader) {
+        LLVM_DEBUG(dbgs() << "CycleSink: Can't find preheader\n");
+        continue;
+      }
+      SmallVector<MachineInstr *, 8> Candidates;
+      FindCycleSinkCandidates(Cycle, Preheader, Candidates);
+
+      // Pre-calculate liveins to the cycle. This is used to calculate the max
+      // reg pressure in a loop, and determining how many liveins sinking a
+      // chain would introduce.
+      DenseSet<Register> CycleLiveIns;
+      calculateCycleLiveIns(Cycle, CycleLiveIns);
+
+      // Initalise MaxPressureSet accross the cycle
+      std::vector<unsigned> CycleMaxSetPressure;
+      for (unsigned I = 0; I < TRI->getNumRegPressureSets(); I++)
+        CycleMaxSetPressure.push_back(0);
+      CalculateMaxSetPressures(Cycle, CycleLiveIns, CycleMaxSetPressure);
+
+      std::vector<Chain> Chains;
+      SortIntoUseDefChains(Candidates, Chains, CycleLiveIns);
+      FilterChains(Cycle, Chains);
+
+      // Calculate how much over the max set pressure we are for each set.
+      std::vector<int> OverMaxSetPressure;
+      for (unsigned I = 0; I < TRI->getNumRegPressureSets(); I++) {
+        unsigned Limit = TRI->getRegPressureSetLimit(MF, I);
+        unsigned MaxPressure = CycleMaxSetPressure[I];
+        OverMaxSetPressure.push_back(((int)MaxPressure) - ((int)Limit));
+        LLVM_DEBUG(dbgs() << "CycleSink: Pressure set " << I << ". Limit: "
+                          << Limit << ". MaxPressure: " << MaxPressure << "\n");
+      }
+      // Prioritise chains that introduce the smallest latency in the cycle.
+      llvm::sort(Chains, [](const Chain &A, const Chain &B) {
+        return (A.TotalLatency * A.NumUses) < (B.TotalLatency * B.NumUses);
+      });
+      // Helper function to determine if sinking a chain will reduce a PSet
+      // that is over the max.
+      auto ReducesCritialPSet = [&](Chain &C) {
+        for (unsigned PSetID = 0; PSetID < TRI->getNumRegPressureSets();
+             PSetID++) {
+          if (OverMaxSetPressure[PSetID] > 0 && C.PSetDiff[PSetID] < 0)
+            return true;
         }
-        SmallVector<MachineInstr *, 8> Candidates;
-        FindCycleSinkCandidates(Cycle, Preheader, Candidates);
-
-        unsigned i = 0;
-
-        // Walk the candidates in reverse order so that we start with the use
-        // of a def-use chain, if there is any.
-        // TODO: Sort the candidates using a cost-model.
-        for (MachineInstr *I : llvm::reverse(Candidates)) {
-          // CycleSinkStage::COPY: Sink a limited number of copies
-          if (Stage == CycleSinkStage::COPY) {
-            if (i++ == SinkIntoCycleLimit) {
-              LLVM_DEBUG(dbgs()
-                         << "CycleSink:   Limit reached of instructions to "
-                            "be analyzed.");
-              break;
-            }
-
-            if (!I->isCopy())
-              continue;
-          }
-
-          // CycleSinkStage::LOW_LATENCY: sink unlimited number of instructions
-          // which the target specifies as low-latency
-          if (Stage == CycleSinkStage::LOW_LATENCY &&
-              !TII->hasLowDefLatency(SchedModel, *I, 0))
-            continue;
-
-          if (!aggressivelySinkIntoCycle(Cycle, *I, SunkInstrs))
-            continue;
+        return false;
+      };
+      auto IsCopyOrLowLatency = [&](const Chain &C) {
+        for (const MachineInstr *MI : C.Instrs)
+          if (!MI->isCopy() && !TII->hasLowDefLatency(SchedModel, *MI, 0))
+            return false;
+        return true;
+      };
+      // Sink the chains if they reduce a critical pressure set.
+      for (auto &C : Chains) {
+        // If the instruction is low latency, sink anyway.
+        if (!IsCopyOrLowLatency(C) && !ReducesCritialPSet(C))
+          continue;
+        if (NumCycleSunk + C.Instrs.size() >= SinkIntoCycleLimit) {
+          LLVM_DEBUG(dbgs() << "CycleSink:   Not sinking Chain as will exceed "
+                               "Instruction Sink limit");
+          break;
+        } else if (SinkChainIntoCycle(Cycle, C, OverMaxSetPressure)) {
+          NumCycleSunk += C.Instrs.size();
           EverMadeChange = true;
-          ++NumCycleSunk;
         }
-
-        // Recalculate the pressure after sinking
-        if (!HasHighPressure)
-          HasHighPressure = registerPressureExceedsLimit(*Preheader);
       }
-      if (!HasHighPressure)
-        break;
     }
   }
 
@@ -1242,21 +1568,6 @@ bool MachineSinking::registerPressureSetExceedsLimit(
   return false;
 }
 
-// Recalculate RP and check if any pressure set exceeds the set limit.
-bool MachineSinking::registerPressureExceedsLimit(
-    const MachineBasicBlock &MBB) {
-  std::vector<unsigned> BBRegisterPressure = getBBRegisterPressure(MBB, false);
-
-  for (unsigned PS = 0; PS < BBRegisterPressure.size(); ++PS) {
-    if (BBRegisterPressure[PS] >=
-        TRI->getRegPressureSetLimit(*MBB.getParent(), PS)) {
-      return true;
-    }
-  }
-
-  return false;
-}
-
 /// isProfitableToSinkTo - Return true if it is profitable to sink MI.
 bool MachineSinking::isProfitableToSinkTo(Register Reg, MachineInstr &MI,
                                           MachineBasicBlock *MBB,
@@ -1735,97 +2046,203 @@ bool MachineSinking::hasStoreBetween(MachineBasicBlock *From,
   return HasAliasedStore;
 }
 
-/// Aggressively sink instructions into cycles. This will aggressively try to
-/// sink all instructions in the top-most preheaders in an attempt to reduce RP.
-/// In particular, it will sink into multiple successor blocks without limits
-/// based on the amount of sinking, or the type of ops being sunk (so long as
-/// they are safe to sink).
-bool MachineSinking::aggressivelySinkIntoCycle(
-    MachineCycle *Cycle, MachineInstr &I,
-    DenseMap<SinkItem, MachineInstr *> &SunkInstrs) {
-  // TODO: support instructions with multiple defs
-  if (I.getNumDefs() > 1)
-    return false;
-
-  LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Finding sink block for: " << I);
-  assert(Cycle->getCyclePreheader() && "Cycle sink needs a preheader block");
-  SmallVector<std::pair<RegSubRegPair, MachineInstr *>> Uses;
-
-  MachineOperand &DefMO = I.getOperand(0);
-  for (MachineInstr &MI : MRI->use_instructions(DefMO.getReg())) {
-    Uses.push_back({{DefMO.getReg(), DefMO.getSubReg()}, &MI});
-  }
-
-  for (std::pair<RegSubRegPair, MachineInstr *> Entry : Uses) {
-    MachineInstr *MI = Entry.second;
-    LLVM_DEBUG(dbgs() << "AggressiveCycleSink:   Analysing use: " << MI);
-    if (MI->isPHI()) {
-      LLVM_DEBUG(
-          dbgs() << "AggressiveCycleSink:   Not attempting to sink for PHI.\n");
-      continue;
-    }
-    // We cannot sink before the prologue
-    if (MI->isPosition() || TII->isBasicBlockPrologue(*MI)) {
-      LLVM_DEBUG(dbgs() << "AggressiveCycleSink:   Use is BasicBlock prologue, "
-                           "can't sink.\n");
-      continue;
+MachineInstr *MachineSinking::getEarliestSinkPoint(MachineInstr *MI1,
+                                                   MachineInstr *MI2) {
+  MachineBasicBlock *MBB1 = MI1->getParent();
+  MachineBasicBlock *MBB2 = MI2->getParent();
+  if (MBB1 != MBB2) {
+    // If there are two uses in different blocks, set sink point to common
+    // dominator.
+    MachineBasicBlock *SinkBlock = DT->findNearestCommonDominator(MBB1, MBB2);
+    if (!SinkBlock) {
+      LLVM_DEBUG(dbgs() << "CycleSink:   Can't find nearest dominator\n");
+      return nullptr;
     }
-    if (!Cycle->contains(MI->getParent())) {
-      LLVM_DEBUG(
-          dbgs() << "AggressiveCycleSink:   Use not in cycle, can't sink.\n");
-      continue;
+    if (SinkBlock == MBB1)
+      return MI1;
+    else if (SinkBlock == MBB2)
+      return MI2;
+    else
+      return &*SinkBlock->SkipPHIsAndLabels(SinkBlock->begin());
+  } else {
+    // If two uses have the same basic block, set sink point to whichever
+    // instruction appears first.
+    for (const MachineInstr &MI : *MBB1) {
+      if (&MI == MI1) {
+        return MI1;
+      } else if (&MI == MI2) {
+        return MI2;
+      }
     }
+  }
+  return nullptr;
+}
 
-    MachineBasicBlock *SinkBlock = MI->getParent();
-    MachineInstr *NewMI = nullptr;
-    SinkItem MapEntry(&I, SinkBlock);
+/// Sink instructions into cycles if profitable. This especially tries to
+/// prevent register spills caused by register pressure if there is little to no
+/// overhead moving instructions into cycles.
+bool MachineSinking::SinkChainIntoCycle(MachineCycle *Cycle, Chain &C,
+                                        std::vector<int> &OverMaxSetPressure) {
+  LLVM_DEBUG(dbgs() << "CycleSink: Finding sink block for Chain:\n";
+             for (auto *MI : C.Instrs) dbgs() << *MI << "\n";);
+  MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
+  assert(Preheader && "Cycle sink needs a preheader block");
+  DenseMap<MachineInstr *, MachineInstr *> SinkLocations;
+  SmallPtrSet<MachineInstr *, 4> VisitedMI;
+
+  // First, check all instructions in the Chain can be sunk, recording their
+  // sink location. Go in reverse order to account for uses within the chain.
+  for (MachineInstr *I : llvm::reverse(C.Instrs)) {
+    VisitedMI.insert(I);
+    MachineInstr *SinkToMI = nullptr;
+    MachineOperand &DefMO = I->getOperand(0);
+    for (MachineInstr &MI : MRI->use_instructions(DefMO.getReg())) {
+      if (VisitedMI.contains(&MI)) {
+        // Use is in the chain. Find the earliest sink point between our
+        // current sink point, if we have one, and the use's sink point.
+        if (SinkToMI) {
+          SinkToMI = getEarliestSinkPoint(SinkToMI, SinkLocations[&MI]);
+          if (!SinkToMI) {
+            LLVM_DEBUG(dbgs()
+                       << "CycleSink:   Could not get earliest sink point of "
+                       << SinkToMI << " and " << SinkLocations[&MI]);
+            return false;
+          }
+        } else {
+          SinkToMI = SinkLocations[&MI];
+        }
+        continue;
+      }
+      LLVM_DEBUG(dbgs() << "CycleSink:   Analysing use: " << MI);
 
-    auto SI = SunkInstrs.find(MapEntry);
+      if (!Cycle->contains(MI.getParent())) {
+        LLVM_DEBUG(dbgs() << "CycleSink:   Use not in cycle, can't sink.\n");
+        return false;
+      }
 
-    // Check for the case in which we have already sunk a copy of this
-    // instruction into the user block.
-    if (SI != SunkInstrs.end()) {
-      LLVM_DEBUG(dbgs() << "AggressiveCycleSink:   Already sunk to block: "
-                        << printMBBReference(*SinkBlock) << "\n");
-      NewMI = SI->second;
+      if (MI.isPHI())
+        return false;
+
+      if (MI.getParent() == Cycle->getCyclePreheader()) {
+        LLVM_DEBUG(
+            dbgs() << "CycleSink: Not sinking, sink block is the preheader\n");
+        return false;
+      }
+
+      if (!SinkToMI) {
+        SinkToMI = &MI;
+        LLVM_DEBUG(dbgs() << "CycleSink:   Setting sink MI to: " << MI << "\n");
+        continue;
+      }
+      SinkToMI = getEarliestSinkPoint(&MI, SinkToMI);
+      if (!SinkToMI)
+        return false;
     }
+    LLVM_DEBUG(dbgs() << "CycleSink:   Setting sink location  for " << *I
+                      << "to : " << *SinkToMI << "\n");
+    SinkLocations[I] = SinkToMI;
+  }
 
-    // Create a copy of the instruction in the use block.
-    if (!NewMI) {
-      LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Sinking instruction to block: "
-                        << printMBBReference(*SinkBlock) << "\n");
-
-      NewMI = I.getMF()->CloneMachineInstr(&I);
-      if (DefMO.getReg().isVirtual()) {
-        const TargetRegisterClass *TRC = MRI->getRegClass(DefMO.getReg());
-        Register DestReg = MRI->createVirtualRegister(TRC);
-        NewMI->substituteRegister(DefMO.getReg(), DestReg, DefMO.getSubReg(),
-                                  *TRI);
+  // Sink/clone all the instructions. Going in reverse because uses within the
+  // chain need to be cloned before defs to easily update uses.
+  // However, this does make it difficult when multiple instructions have the
+  // same sink point, as they will be sunk in reverse order, reversing the
+  // order of the instructions. Account for this by keeping track of how many
+  // times an instruction has been sunk to a location.
+  SmallPtrSet<MachineInstr *, 4> ToDelete;
+  SmallPtrSet<MachineInstr *, 4> ToClone;
+  SmallDenseMap<MachineInstr *, unsigned> NumSunkToSinkPoint;
+  SmallDenseMap<MachineBasicBlock *, MachineInstr *> EarliestClonePointInBlock;
+  for (MachineInstr *I : llvm::reverse(C.Instrs)) {
+    auto SinkLocation = SinkLocations[I];
+
+    bool CloneUses = true;
+    SmallDenseMap<MachineBasicBlock *, MachineInstr *> FirstUseInBlock;
+    // Set of users will act as a worklist when it comes to cloning.
+    SmallPtrSet<MachineInstr *, 4> UserMIs;
+    MachineOperand &DefMO = I->getOperand(0);
+    // If the number of uses from the def exceeds a threshold, determined
+    // by opt param CloneUsesThreshold, don't clone as the spiller is much
+    // more likely to prioritise other values to spill.
+    if (C.NumUses > CloneUsesThreshold) {
+      LLVM_DEBUG(dbgs() << "CycleSink: Chain with too many uses (" << C.NumUses
+                        << "). Not considering cloning.\n");
+      CloneUses = false;
+      continue;
+    }
+    // Add all users to a worklist, recording the first user in each block as
+    // to limit cloning to once per block to avoid excessive cloning.
+    for (MachineInstr &UserMI : MRI->use_instructions(DefMO.getReg())) {
+      MachineBasicBlock *UserBlock = UserMI.getParent();
+      UserMIs.insert(&UserMI);
+      auto It = FirstUseInBlock.find(UserBlock);
+      MachineInstr *SinkPoint = It == FirstUseInBlock.end()
+                                    ? &UserMI
+                                    : getEarliestSinkPoint(It->second, &UserMI);
+      FirstUseInBlock[UserBlock] = SinkPoint;
+    }
+    // Sink if there's only one clone point
+    if (!CloneUses || FirstUseInBlock.size() == 1) {
+      LLVM_DEBUG(dbgs() << "CycleSink: Sinking instruction " << *I << "\n");
+      // Adjust the sink location due to iterating in reverse.
+      auto It = NumSunkToSinkPoint.find(SinkLocation);
+      if (It != NumSunkToSinkPoint.end()) {
+        auto BlockIt = MachineBasicBlock::iterator(SinkLocation);
+        unsigned Step = It->second;
+        while (Step--)
+          --BlockIt;
+        NumSunkToSinkPoint[SinkLocation] = It->second + 1;
+        SinkLocation = &*BlockIt;
+      } else {
+        NumSunkToSinkPoint[SinkLocation] = 1;
+      }
+      SinkLocation->getParent()->splice(SinkLocation, Preheader, I);
+    } else {
+      LLVM_DEBUG(dbgs() << "CycleSink: Cloning instruction " << *I << "\n");
+      for (MachineInstr *UserMI : UserMIs) {
+        MachineBasicBlock *UserMBB = UserMI->getParent();
+        // Only clone to the earliest user in the block.
+        if (FirstUseInBlock[UserMBB] != UserMI || ToDelete.contains(UserMI))
+          continue;
+        // Actually do the cloning, keeping track of the new registers.
+        MachineInstr *Clone = I->getMF()->CloneMachineInstr(I);
+        Register DefReg = I->getOperand(0).getReg();
+        Register NewVReg = MRI->createVirtualRegister(MRI->getRegClass(DefReg));
+        Clone->getOperand(0).setReg(NewVReg);
+        UserMBB->insert(UserMI, Clone);
+        ToDelete.insert(I);
+        // Update registers for all users within this block.
+        for (MachineInstr *UserMI2 : UserMIs)
+          if (UserMI2->getParent() == UserMBB)
+            for (MachineOperand &MO : UserMI2->operands())
+              if (MO.isReg() && MO.getReg() == DefReg)
+                MO.setReg(NewVReg);
+        // Adjust future instructions that want to sink to this location.
+        // Account for the fact we may be cloning to an already cloned or
+        // sunk MI, in which case find the original sink point and increment.
+        auto UserSinkLoc = SinkLocations.find(UserMI);
+        MachineInstr *OriginalSinkLoc =
+            UserSinkLoc != SinkLocations.end() ? UserSinkLoc->second : UserMI;
+        auto It = NumSunkToSinkPoint.find(OriginalSinkLoc);
+        NumSunkToSinkPoint[OriginalSinkLoc] =
+            It == NumSunkToSinkPoint.end() ? 1 : It->second + 1;
+        SinkLocations[Clone] = OriginalSinkLoc;
       }
-      SinkBlock->insert(SinkBlock->SkipPHIsAndLabels(SinkBlock->begin()),
-                        NewMI);
-      SunkInstrs.insert({MapEntry, NewMI});
     }
 
     // Conservatively clear any kill flags on uses of sunk instruction
-    for (MachineOperand &MO : NewMI->all_uses()) {
-      assert(MO.isReg() && MO.isUse());
-      RegsToClearKillFlags.insert(MO.getReg());
+    for (MachineOperand &MO : I->operands()) {
+      if (MO.isReg() && MO.readsReg())
+        RegsToClearKillFlags.insert(MO.getReg());
     }
+  }
 
-    // The instruction is moved from its basic block, so do not retain the
-    // debug information.
-    assert(!NewMI->isDebugInstr() && "Should not sink debug inst");
-    NewMI->setDebugLoc(DebugLoc());
+  // Adjust max set pressure
+  for (unsigned PSetID = 0; PSetID < TRI->getNumRegPressureSets(); PSetID++)
+    OverMaxSetPressure[PSetID] += C.PSetDiff[PSetID];
 
-    // Replace the use with the newly created virtual register.
-    RegSubRegPair &UseReg = Entry.first;
-    MI->substituteRegister(UseReg.Reg, NewMI->getOperand(0).getReg(),
-                           UseReg.SubReg, *TRI);
-  }
-  // If we have replaced all uses, then delete the dead instruction
-  if (I.isDead(*MRI))
-    I.eraseFromParent();
+  for (MachineInstr *MI : ToDelete)
+    MI->eraseFromParent();
   return true;
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-cycle.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-cycle.mir
index c3e333baf6080..c8932d8c258e5 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-cycle.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-cycle.mir
@@ -1,6 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
 # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 --sink-insts-to-avoid-spills=1 --stop-after=machine-sink -o -  %s | FileCheck -check-prefixes=GFX9-SUNK %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 --sink-insts-to-avoid-spills=1 -mattr=+wavefrontsize64 --stop-after=machine-sink -o -  %s | FileCheck -check-prefixes=GFX10-SUNK %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 --sink-insts-to-avoid-spills=1 -machine-sink-cycle-limit=10000 -mattr=+wavefrontsize64 --stop-after=machine-sink -o -  %s | FileCheck -check-prefixes=GFX10-SUNK %s
 
 ---
 name:            test_sink_copy
@@ -29,24 +29,24 @@ body:             |
   ; GFX9-SUNK-NEXT: bb.2:
   ; GFX9-SUNK-NEXT:   successors: %bb.4(0x80000000)
   ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
   ; GFX9-SUNK-NEXT:   [[COPY:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
   ; GFX9-SUNK-NEXT:   [[COPY1:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
   ; GFX9-SUNK-NEXT:   [[COPY2:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
   ; GFX9-SUNK-NEXT:   [[COPY3:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
   ; GFX9-SUNK-NEXT:   [[COPY4:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
-  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
   ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[COPY]], implicit [[COPY1]], implicit [[COPY2]], implicit [[COPY3]], implicit [[COPY4]]
   ; GFX9-SUNK-NEXT:   S_BRANCH %bb.4
   ; GFX9-SUNK-NEXT: {{  $}}
   ; GFX9-SUNK-NEXT: bb.3:
   ; GFX9-SUNK-NEXT:   successors: %bb.4(0x80000000)
   ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
   ; GFX9-SUNK-NEXT:   [[COPY5:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
   ; GFX9-SUNK-NEXT:   [[COPY6:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
   ; GFX9-SUNK-NEXT:   [[COPY7:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
   ; GFX9-SUNK-NEXT:   [[COPY8:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
   ; GFX9-SUNK-NEXT:   [[COPY9:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
-  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
   ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[COPY5]], implicit [[COPY6]], implicit [[COPY7]], implicit [[COPY8]], implicit [[COPY9]]
   ; GFX9-SUNK-NEXT:   S_BRANCH %bb.4
   ; GFX9-SUNK-NEXT: {{  $}}
@@ -81,24 +81,24 @@ body:             |
   ; GFX10-SUNK-NEXT: bb.2:
   ; GFX10-SUNK-NEXT:   successors: %bb.4(0x80000000)
   ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
   ; GFX10-SUNK-NEXT:   [[COPY:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
   ; GFX10-SUNK-NEXT:   [[COPY1:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
   ; GFX10-SUNK-NEXT:   [[COPY2:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
   ; GFX10-SUNK-NEXT:   [[COPY3:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
   ; GFX10-SUNK-NEXT:   [[COPY4:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
-  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
   ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[COPY]], implicit [[COPY1]], implicit [[COPY2]], implicit [[COPY3]], implicit [[COPY4]]
   ; GFX10-SUNK-NEXT:   S_BRANCH %bb.4
   ; GFX10-SUNK-NEXT: {{  $}}
   ; GFX10-SUNK-NEXT: bb.3:
   ; GFX10-SUNK-NEXT:   successors: %bb.4(0x80000000)
   ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
   ; GFX10-SUNK-NEXT:   [[COPY5:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
   ; GFX10-SUNK-NEXT:   [[COPY6:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
   ; GFX10-SUNK-NEXT:   [[COPY7:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
   ; GFX10-SUNK-NEXT:   [[COPY8:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
   ; GFX10-SUNK-NEXT:   [[COPY9:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
-  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
   ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[COPY5]], implicit [[COPY6]], implicit [[COPY7]], implicit [[COPY8]], implicit [[COPY9]]
   ; GFX10-SUNK-NEXT:   S_BRANCH %bb.4
   ; GFX10-SUNK-NEXT: {{  $}}
@@ -183,12 +183,12 @@ body:             |
   ; GFX9-SUNK-NEXT: bb.2:
   ; GFX9-SUNK-NEXT:   successors: %bb.4(0x80000000)
   ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
   ; GFX9-SUNK-NEXT:   [[COPY:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
   ; GFX9-SUNK-NEXT:   [[COPY1:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
   ; GFX9-SUNK-NEXT:   [[COPY2:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
   ; GFX9-SUNK-NEXT:   [[COPY3:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
   ; GFX9-SUNK-NEXT:   [[COPY4:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
-  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
   ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[COPY]], implicit [[COPY1]], implicit [[COPY2]], implicit [[COPY3]], implicit [[COPY4]]
   ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]]
   ; GFX9-SUNK-NEXT:   S_BRANCH %bb.4
@@ -196,12 +196,12 @@ body:             |
   ; GFX9-SUNK-NEXT: bb.3:
   ; GFX9-SUNK-NEXT:   successors: %bb.4(0x80000000)
   ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
   ; GFX9-SUNK-NEXT:   [[COPY5:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
   ; GFX9-SUNK-NEXT:   [[COPY6:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
   ; GFX9-SUNK-NEXT:   [[COPY7:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
   ; GFX9-SUNK-NEXT:   [[COPY8:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
   ; GFX9-SUNK-NEXT:   [[COPY9:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
-  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
   ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[COPY5]], implicit [[COPY6]], implicit [[COPY7]], implicit [[COPY8]], implicit [[COPY9]]
   ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]]
   ; GFX9-SUNK-NEXT:   S_BRANCH %bb.4
@@ -237,36 +237,36 @@ body:             |
   ; GFX10-SUNK-NEXT: bb.2:
   ; GFX10-SUNK-NEXT:   successors: %bb.4(0x80000000)
   ; GFX10-SUNK-NEXT: {{  $}}
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
   ; GFX10-SUNK-NEXT:   [[COPY:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
   ; GFX10-SUNK-NEXT:   [[COPY1:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
   ; GFX10-SUNK-NEXT:   [[COPY2:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
   ; GFX10-SUNK-NEXT:   [[COPY3:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
   ; GFX10-SUNK-NEXT:   [[COPY4:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
-  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
   ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[COPY]], implicit [[COPY1]], implicit [[COPY2]], implicit [[COPY3]], implicit [[COPY4]]
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
   ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]]
   ; GFX10-SUNK-NEXT:   S_BRANCH %bb.4
   ; GFX10-SUNK-NEXT: {{  $}}
   ; GFX10-SUNK-NEXT: bb.3:
   ; GFX10-SUNK-NEXT:   successors: %bb.4(0x80000000)
   ; GFX10-SUNK-NEXT: {{  $}}
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
   ; GFX10-SUNK-NEXT:   [[COPY5:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
   ; GFX10-SUNK-NEXT:   [[COPY6:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
   ; GFX10-SUNK-NEXT:   [[COPY7:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
   ; GFX10-SUNK-NEXT:   [[COPY8:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
   ; GFX10-SUNK-NEXT:   [[COPY9:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
-  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
   ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[COPY5]], implicit [[COPY6]], implicit [[COPY7]], implicit [[COPY8]], implicit [[COPY9]]
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
   ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_9]]
   ; GFX10-SUNK-NEXT:   S_BRANCH %bb.4
   ; GFX10-SUNK-NEXT: {{  $}}
@@ -412,57 +412,57 @@ body:             |
   ; GFX10-SUNK-NEXT: bb.2:
   ; GFX10-SUNK-NEXT:   successors: %bb.4(0x80000000)
   ; GFX10-SUNK-NEXT: {{  $}}
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
   ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
-  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]], implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_9]], implicit [[V_ADD_U32_e64_10]], implicit [[V_ADD_U32_e64_11]], implicit [[V_ADD_U32_e64_12]], implicit [[V_ADD_U32_e64_13]], implicit [[V_ADD_U32_e64_14]], implicit [[V_ADD_U32_e64_15]], implicit [[V_ADD_U32_e64_16]], implicit [[V_ADD_U32_e64_17]], implicit [[V_ADD_U32_e64_18]], implicit [[V_ADD_U32_e64_19]], implicit [[V_ADD_U32_e64_20]]
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_11]], implicit [[V_ADD_U32_e64_20]], implicit [[V_ADD_U32_e64_19]], implicit [[V_ADD_U32_e64_18]], implicit [[V_ADD_U32_e64_17]], implicit [[V_ADD_U32_e64_16]], implicit [[V_ADD_U32_e64_15]], implicit [[V_ADD_U32_e64_14]], implicit [[V_ADD_U32_e64_13]], implicit [[V_ADD_U32_e64_12]], implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_10]], implicit [[V_ADD_U32_e64_9]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_4]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_1]]
   ; GFX10-SUNK-NEXT:   S_BRANCH %bb.4
   ; GFX10-SUNK-NEXT: {{  $}}
   ; GFX10-SUNK-NEXT: bb.3:
   ; GFX10-SUNK-NEXT:   successors: %bb.4(0x80000000)
   ; GFX10-SUNK-NEXT: {{  $}}
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_25:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_31:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_32:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_33:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_34:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_35:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_36:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_37:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_38:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_39:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_40:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_41:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
   ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
-  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_21]], implicit [[V_ADD_U32_e64_22]], implicit [[V_ADD_U32_e64_23]], implicit [[V_ADD_U32_e64_24]], implicit [[V_ADD_U32_e64_25]], implicit [[V_ADD_U32_e64_26]], implicit [[V_ADD_U32_e64_27]], implicit [[V_ADD_U32_e64_28]], implicit [[V_ADD_U32_e64_29]], implicit [[V_ADD_U32_e64_30]], implicit [[V_ADD_U32_e64_31]], implicit [[V_ADD_U32_e64_32]], implicit [[V_ADD_U32_e64_33]], implicit [[V_ADD_U32_e64_34]], implicit [[V_ADD_U32_e64_35]], implicit [[V_ADD_U32_e64_36]], implicit [[V_ADD_U32_e64_37]], implicit [[V_ADD_U32_e64_38]], implicit [[V_ADD_U32_e64_39]], implicit [[V_ADD_U32_e64_40]], implicit [[V_ADD_U32_e64_41]]
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_25:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_31:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_32:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_33:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_34:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_35:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_36:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_37:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_38:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_39:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_40:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_41:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_32]], implicit [[V_ADD_U32_e64_41]], implicit [[V_ADD_U32_e64_40]], implicit [[V_ADD_U32_e64_39]], implicit [[V_ADD_U32_e64_38]], implicit [[V_ADD_U32_e64_37]], implicit [[V_ADD_U32_e64_36]], implicit [[V_ADD_U32_e64_35]], implicit [[V_ADD_U32_e64_34]], implicit [[V_ADD_U32_e64_33]], implicit [[V_ADD_U32_e64_21]], implicit [[V_ADD_U32_e64_31]], implicit [[V_ADD_U32_e64_30]], implicit [[V_ADD_U32_e64_29]], implicit [[V_ADD_U32_e64_28]], implicit [[V_ADD_U32_e64_27]], implicit [[V_ADD_U32_e64_26]], implicit [[V_ADD_U32_e64_25]], implicit [[V_ADD_U32_e64_24]], implicit [[V_ADD_U32_e64_23]], implicit [[V_ADD_U32_e64_22]]
   ; GFX10-SUNK-NEXT:   S_BRANCH %bb.4
   ; GFX10-SUNK-NEXT: {{  $}}
   ; GFX10-SUNK-NEXT: bb.4:
@@ -543,17 +543,6 @@ body:             |
   ; GFX9-SUNK-NEXT:   [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
   ; GFX9-SUNK-NEXT:   [[DEF3:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
   ; GFX9-SUNK-NEXT:   [[DEF4:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
-  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.1
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.1:
-  ; GFX9-SUNK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
-  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.3
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.2:
-  ; GFX9-SUNK-NEXT:   successors: %bb.4(0x80000000)
-  ; GFX9-SUNK-NEXT: {{  $}}
   ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
   ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
   ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
@@ -567,56 +556,43 @@ body:             |
   ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
   ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
   ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_25:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.1
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.1:
+  ; GFX9-SUNK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.3
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.2:
+  ; GFX9-SUNK-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
   ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
-  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]], implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_9]], implicit [[V_ADD_U32_e64_10]], implicit [[V_ADD_U32_e64_11]], implicit [[V_ADD_U32_e64_12]], implicit [[V_ADD_U32_e64_13]], implicit [[V_ADD_U32_e64_14]], implicit [[V_ADD_U32_e64_15]], implicit [[V_ADD_U32_e64_16]], implicit [[V_ADD_U32_e64_17]], implicit [[V_ADD_U32_e64_18]], implicit [[V_ADD_U32_e64_19]], implicit [[V_ADD_U32_e64_20]], implicit [[V_ADD_U32_e64_21]], implicit [[V_ADD_U32_e64_22]], implicit [[V_ADD_U32_e64_23]], implicit [[V_ADD_U32_e64_24]], implicit [[V_ADD_U32_e64_25]], implicit [[V_ADD_U32_e64_26]]
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_25:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]], implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_9]], implicit [[V_ADD_U32_e64_10]], implicit [[V_ADD_U32_e64_11]], implicit [[V_ADD_U32_e64_12]], implicit [[V_ADD_U32_e64_24]], implicit [[V_ADD_U32_e64_13]], implicit [[V_ADD_U32_e64_14]], implicit [[V_ADD_U32_e64_15]], implicit [[V_ADD_U32_e64_16]], implicit [[V_ADD_U32_e64_17]], implicit [[V_ADD_U32_e64_18]], implicit [[V_ADD_U32_e64_19]], implicit [[V_ADD_U32_e64_20]], implicit [[V_ADD_U32_e64_21]], implicit [[V_ADD_U32_e64_22]], implicit [[V_ADD_U32_e64_23]], implicit [[V_ADD_U32_e64_26]], implicit [[V_ADD_U32_e64_25]]
   ; GFX9-SUNK-NEXT:   S_BRANCH %bb.4
   ; GFX9-SUNK-NEXT: {{  $}}
   ; GFX9-SUNK-NEXT: bb.3:
   ; GFX9-SUNK-NEXT:   successors: %bb.4(0x80000000)
   ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_31:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_32:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_33:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_34:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_35:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_36:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_37:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_38:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_39:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_40:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_41:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_42:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_43:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_44:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_45:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_46:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_47:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_48:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_49:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_50:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_51:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_52:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_53:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
   ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
-  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_27]], implicit [[V_ADD_U32_e64_28]], implicit [[V_ADD_U32_e64_29]], implicit [[V_ADD_U32_e64_30]], implicit [[V_ADD_U32_e64_31]], implicit [[V_ADD_U32_e64_32]], implicit [[V_ADD_U32_e64_33]], implicit [[V_ADD_U32_e64_34]], implicit [[V_ADD_U32_e64_35]], implicit [[V_ADD_U32_e64_36]], implicit [[V_ADD_U32_e64_37]], implicit [[V_ADD_U32_e64_38]], implicit [[V_ADD_U32_e64_39]], implicit [[V_ADD_U32_e64_40]], implicit [[V_ADD_U32_e64_41]], implicit [[V_ADD_U32_e64_42]], implicit [[V_ADD_U32_e64_43]], implicit [[V_ADD_U32_e64_44]], implicit [[V_ADD_U32_e64_45]], implicit [[V_ADD_U32_e64_46]], implicit [[V_ADD_U32_e64_47]], implicit [[V_ADD_U32_e64_48]], implicit [[V_ADD_U32_e64_49]], implicit [[V_ADD_U32_e64_50]], implicit [[V_ADD_U32_e64_51]], implicit [[V_ADD_U32_e64_52]], implicit [[V_ADD_U32_e64_53]]
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]], implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_9]], implicit [[V_ADD_U32_e64_10]], implicit [[V_ADD_U32_e64_11]], implicit [[V_ADD_U32_e64_12]], implicit [[V_ADD_U32_e64_27]], implicit [[V_ADD_U32_e64_13]], implicit [[V_ADD_U32_e64_14]], implicit [[V_ADD_U32_e64_15]], implicit [[V_ADD_U32_e64_16]], implicit [[V_ADD_U32_e64_17]], implicit [[V_ADD_U32_e64_18]], implicit [[V_ADD_U32_e64_19]], implicit [[V_ADD_U32_e64_20]], implicit [[V_ADD_U32_e64_21]], implicit [[V_ADD_U32_e64_22]], implicit [[V_ADD_U32_e64_23]], implicit [[V_ADD_U32_e64_29]], implicit [[V_ADD_U32_e64_28]]
   ; GFX9-SUNK-NEXT:   S_BRANCH %bb.4
   ; GFX9-SUNK-NEXT: {{  $}}
   ; GFX9-SUNK-NEXT: bb.4:
@@ -650,69 +626,69 @@ body:             |
   ; GFX10-SUNK-NEXT: bb.2:
   ; GFX10-SUNK-NEXT:   successors: %bb.4(0x80000000)
   ; GFX10-SUNK-NEXT: {{  $}}
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_25:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
   ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
-  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]], implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_9]], implicit [[V_ADD_U32_e64_10]], implicit [[V_ADD_U32_e64_11]], implicit [[V_ADD_U32_e64_12]], implicit [[V_ADD_U32_e64_13]], implicit [[V_ADD_U32_e64_14]], implicit [[V_ADD_U32_e64_15]], implicit [[V_ADD_U32_e64_16]], implicit [[V_ADD_U32_e64_17]], implicit [[V_ADD_U32_e64_18]], implicit [[V_ADD_U32_e64_19]], implicit [[V_ADD_U32_e64_20]], implicit [[V_ADD_U32_e64_21]], implicit [[V_ADD_U32_e64_22]], implicit [[V_ADD_U32_e64_23]], implicit [[V_ADD_U32_e64_24]], implicit [[V_ADD_U32_e64_25]], implicit [[V_ADD_U32_e64_26]]
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_25:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_14]], implicit [[V_ADD_U32_e64_26]], implicit [[V_ADD_U32_e64_25]], implicit [[V_ADD_U32_e64_24]], implicit [[V_ADD_U32_e64_23]], implicit [[V_ADD_U32_e64_22]], implicit [[V_ADD_U32_e64_21]], implicit [[V_ADD_U32_e64_20]], implicit [[V_ADD_U32_e64_19]], implicit [[V_ADD_U32_e64_18]], implicit [[V_ADD_U32_e64_17]], implicit [[V_ADD_U32_e64_16]], implicit [[V_ADD_U32_e64_15]], implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_13]], implicit [[V_ADD_U32_e64_12]], implicit [[V_ADD_U32_e64_11]], implicit [[V_ADD_U32_e64_10]], implicit [[V_ADD_U32_e64_9]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_4]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_1]]
   ; GFX10-SUNK-NEXT:   S_BRANCH %bb.4
   ; GFX10-SUNK-NEXT: {{  $}}
   ; GFX10-SUNK-NEXT: bb.3:
   ; GFX10-SUNK-NEXT:   successors: %bb.4(0x80000000)
   ; GFX10-SUNK-NEXT: {{  $}}
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_31:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_32:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_33:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_34:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_35:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_36:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_37:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_38:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_39:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_40:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_41:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_42:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_43:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_44:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_45:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_46:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_47:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_48:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_49:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_50:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_51:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_52:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_53:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
   ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
-  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_27]], implicit [[V_ADD_U32_e64_28]], implicit [[V_ADD_U32_e64_29]], implicit [[V_ADD_U32_e64_30]], implicit [[V_ADD_U32_e64_31]], implicit [[V_ADD_U32_e64_32]], implicit [[V_ADD_U32_e64_33]], implicit [[V_ADD_U32_e64_34]], implicit [[V_ADD_U32_e64_35]], implicit [[V_ADD_U32_e64_36]], implicit [[V_ADD_U32_e64_37]], implicit [[V_ADD_U32_e64_38]], implicit [[V_ADD_U32_e64_39]], implicit [[V_ADD_U32_e64_40]], implicit [[V_ADD_U32_e64_41]], implicit [[V_ADD_U32_e64_42]], implicit [[V_ADD_U32_e64_43]], implicit [[V_ADD_U32_e64_44]], implicit [[V_ADD_U32_e64_45]], implicit [[V_ADD_U32_e64_46]], implicit [[V_ADD_U32_e64_47]], implicit [[V_ADD_U32_e64_48]], implicit [[V_ADD_U32_e64_49]], implicit [[V_ADD_U32_e64_50]], implicit [[V_ADD_U32_e64_51]], implicit [[V_ADD_U32_e64_52]], implicit [[V_ADD_U32_e64_53]]
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_31:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_32:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_33:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_34:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_35:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_36:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_37:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_38:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_39:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_40:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_41:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_42:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_43:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_44:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_45:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_46:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_47:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_48:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_49:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_50:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_51:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_52:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_53:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_41]], implicit [[V_ADD_U32_e64_53]], implicit [[V_ADD_U32_e64_52]], implicit [[V_ADD_U32_e64_51]], implicit [[V_ADD_U32_e64_50]], implicit [[V_ADD_U32_e64_49]], implicit [[V_ADD_U32_e64_48]], implicit [[V_ADD_U32_e64_47]], implicit [[V_ADD_U32_e64_46]], implicit [[V_ADD_U32_e64_45]], implicit [[V_ADD_U32_e64_44]], implicit [[V_ADD_U32_e64_43]], implicit [[V_ADD_U32_e64_42]], implicit [[V_ADD_U32_e64_27]], implicit [[V_ADD_U32_e64_40]], implicit [[V_ADD_U32_e64_39]], implicit [[V_ADD_U32_e64_38]], implicit [[V_ADD_U32_e64_37]], implicit [[V_ADD_U32_e64_36]], implicit [[V_ADD_U32_e64_35]], implicit [[V_ADD_U32_e64_34]], implicit [[V_ADD_U32_e64_33]], implicit [[V_ADD_U32_e64_32]], implicit [[V_ADD_U32_e64_31]], implicit [[V_ADD_U32_e64_30]], implicit [[V_ADD_U32_e64_29]], implicit [[V_ADD_U32_e64_28]]
   ; GFX10-SUNK-NEXT:   S_BRANCH %bb.4
   ; GFX10-SUNK-NEXT: {{  $}}
   ; GFX10-SUNK-NEXT: bb.4:
@@ -801,6 +777,29 @@ body:             |
   ; GFX9-SUNK-NEXT:   [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
   ; GFX9-SUNK-NEXT:   [[DEF3:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
   ; GFX9-SUNK-NEXT:   [[DEF4:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
   ; GFX9-SUNK-NEXT:   [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; GFX9-SUNK-NEXT:   [[DEF6:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
   ; GFX9-SUNK-NEXT:   [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
@@ -818,69 +817,23 @@ body:             |
   ; GFX9-SUNK-NEXT: bb.2:
   ; GFX9-SUNK-NEXT:   successors: %bb.4(0x80000000)
   ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_25:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
   ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
-  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]], implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_9]], implicit [[V_ADD_U32_e64_10]], implicit [[V_ADD_U32_e64_11]], implicit [[V_ADD_U32_e64_12]], implicit [[V_ADD_U32_e64_13]], implicit [[V_ADD_U32_e64_14]], implicit [[V_ADD_U32_e64_15]], implicit [[V_ADD_U32_e64_16]], implicit [[V_ADD_U32_e64_17]], implicit [[V_ADD_U32_e64_18]], implicit [[V_ADD_U32_e64_19]], implicit [[V_ADD_U32_e64_20]], implicit [[V_ADD_U32_e64_21]], implicit [[V_ADD_U32_e64_22]], implicit [[V_ADD_U32_e64_23]], implicit [[V_ADD_U32_e64_24]], implicit [[V_ADD_U32_e64_25]], implicit [[V_ADD_U32_e64_26]]
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_25:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]], implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_9]], implicit [[V_ADD_U32_e64_10]], implicit [[V_ADD_U32_e64_11]], implicit [[V_ADD_U32_e64_12]], implicit [[V_ADD_U32_e64_23]], implicit [[V_ADD_U32_e64_13]], implicit [[V_ADD_U32_e64_14]], implicit [[V_ADD_U32_e64_15]], implicit [[V_ADD_U32_e64_16]], implicit [[V_ADD_U32_e64_17]], implicit [[V_ADD_U32_e64_18]], implicit [[V_ADD_U32_e64_19]], implicit [[V_ADD_U32_e64_20]], implicit [[V_ADD_U32_e64_21]], implicit [[V_ADD_U32_e64_22]], implicit [[V_ADD_U32_e64_26]], implicit [[V_ADD_U32_e64_25]], implicit [[V_ADD_U32_e64_24]]
   ; GFX9-SUNK-NEXT:   S_BRANCH %bb.4
   ; GFX9-SUNK-NEXT: {{  $}}
   ; GFX9-SUNK-NEXT: bb.3:
   ; GFX9-SUNK-NEXT:   successors: %bb.4(0x80000000)
   ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_31:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_32:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_33:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_34:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_35:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_36:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_37:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_38:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_39:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_40:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_41:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_42:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_43:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_44:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_45:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_46:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_47:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_48:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_49:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_50:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_51:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_52:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_53:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
   ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
-  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_27]], implicit [[V_ADD_U32_e64_28]], implicit [[V_ADD_U32_e64_29]], implicit [[V_ADD_U32_e64_30]], implicit [[V_ADD_U32_e64_31]], implicit [[V_ADD_U32_e64_32]], implicit [[V_ADD_U32_e64_33]], implicit [[V_ADD_U32_e64_34]], implicit [[V_ADD_U32_e64_35]], implicit [[V_ADD_U32_e64_36]], implicit [[V_ADD_U32_e64_37]], implicit [[V_ADD_U32_e64_38]], implicit [[V_ADD_U32_e64_39]], implicit [[V_ADD_U32_e64_40]], implicit [[V_ADD_U32_e64_41]], implicit [[V_ADD_U32_e64_42]], implicit [[V_ADD_U32_e64_43]], implicit [[V_ADD_U32_e64_44]], implicit [[V_ADD_U32_e64_45]], implicit [[V_ADD_U32_e64_46]], implicit [[V_ADD_U32_e64_47]], implicit [[V_ADD_U32_e64_48]], implicit [[V_ADD_U32_e64_49]], implicit [[V_ADD_U32_e64_50]], implicit [[V_ADD_U32_e64_51]], implicit [[V_ADD_U32_e64_52]], implicit [[V_ADD_U32_e64_53]]
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]], implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_9]], implicit [[V_ADD_U32_e64_10]], implicit [[V_ADD_U32_e64_11]], implicit [[V_ADD_U32_e64_12]], implicit [[V_ADD_U32_e64_27]], implicit [[V_ADD_U32_e64_13]], implicit [[V_ADD_U32_e64_14]], implicit [[V_ADD_U32_e64_15]], implicit [[V_ADD_U32_e64_16]], implicit [[V_ADD_U32_e64_17]], implicit [[V_ADD_U32_e64_18]], implicit [[V_ADD_U32_e64_19]], implicit [[V_ADD_U32_e64_20]], implicit [[V_ADD_U32_e64_21]], implicit [[V_ADD_U32_e64_22]], implicit [[V_ADD_U32_e64_30]], implicit [[V_ADD_U32_e64_29]], implicit [[V_ADD_U32_e64_28]]
   ; GFX9-SUNK-NEXT:   S_BRANCH %bb.4
   ; GFX9-SUNK-NEXT: {{  $}}
   ; GFX9-SUNK-NEXT: bb.4:
@@ -920,69 +873,69 @@ body:             |
   ; GFX10-SUNK-NEXT: bb.2:
   ; GFX10-SUNK-NEXT:   successors: %bb.4(0x80000000)
   ; GFX10-SUNK-NEXT: {{  $}}
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_25:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
   ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
-  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]], implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_9]], implicit [[V_ADD_U32_e64_10]], implicit [[V_ADD_U32_e64_11]], implicit [[V_ADD_U32_e64_12]], implicit [[V_ADD_U32_e64_13]], implicit [[V_ADD_U32_e64_14]], implicit [[V_ADD_U32_e64_15]], implicit [[V_ADD_U32_e64_16]], implicit [[V_ADD_U32_e64_17]], implicit [[V_ADD_U32_e64_18]], implicit [[V_ADD_U32_e64_19]], implicit [[V_ADD_U32_e64_20]], implicit [[V_ADD_U32_e64_21]], implicit [[V_ADD_U32_e64_22]], implicit [[V_ADD_U32_e64_23]], implicit [[V_ADD_U32_e64_24]], implicit [[V_ADD_U32_e64_25]], implicit [[V_ADD_U32_e64_26]]
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_25:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_14]], implicit [[V_ADD_U32_e64_26]], implicit [[V_ADD_U32_e64_25]], implicit [[V_ADD_U32_e64_24]], implicit [[V_ADD_U32_e64_23]], implicit [[V_ADD_U32_e64_22]], implicit [[V_ADD_U32_e64_21]], implicit [[V_ADD_U32_e64_20]], implicit [[V_ADD_U32_e64_19]], implicit [[V_ADD_U32_e64_18]], implicit [[V_ADD_U32_e64_17]], implicit [[V_ADD_U32_e64_16]], implicit [[V_ADD_U32_e64_15]], implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_13]], implicit [[V_ADD_U32_e64_12]], implicit [[V_ADD_U32_e64_11]], implicit [[V_ADD_U32_e64_10]], implicit [[V_ADD_U32_e64_9]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_4]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_1]]
   ; GFX10-SUNK-NEXT:   S_BRANCH %bb.4
   ; GFX10-SUNK-NEXT: {{  $}}
   ; GFX10-SUNK-NEXT: bb.3:
   ; GFX10-SUNK-NEXT:   successors: %bb.4(0x80000000)
   ; GFX10-SUNK-NEXT: {{  $}}
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_31:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_32:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_33:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_34:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_35:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_36:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_37:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_38:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_39:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_40:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_41:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_42:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_43:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_44:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_45:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_46:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_47:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_48:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_49:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_50:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_51:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_52:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_53:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
   ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
-  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_27]], implicit [[V_ADD_U32_e64_28]], implicit [[V_ADD_U32_e64_29]], implicit [[V_ADD_U32_e64_30]], implicit [[V_ADD_U32_e64_31]], implicit [[V_ADD_U32_e64_32]], implicit [[V_ADD_U32_e64_33]], implicit [[V_ADD_U32_e64_34]], implicit [[V_ADD_U32_e64_35]], implicit [[V_ADD_U32_e64_36]], implicit [[V_ADD_U32_e64_37]], implicit [[V_ADD_U32_e64_38]], implicit [[V_ADD_U32_e64_39]], implicit [[V_ADD_U32_e64_40]], implicit [[V_ADD_U32_e64_41]], implicit [[V_ADD_U32_e64_42]], implicit [[V_ADD_U32_e64_43]], implicit [[V_ADD_U32_e64_44]], implicit [[V_ADD_U32_e64_45]], implicit [[V_ADD_U32_e64_46]], implicit [[V_ADD_U32_e64_47]], implicit [[V_ADD_U32_e64_48]], implicit [[V_ADD_U32_e64_49]], implicit [[V_ADD_U32_e64_50]], implicit [[V_ADD_U32_e64_51]], implicit [[V_ADD_U32_e64_52]], implicit [[V_ADD_U32_e64_53]]
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_31:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_32:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_33:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_34:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_35:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_36:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_37:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_38:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_39:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_40:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_41:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_42:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_43:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_44:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_45:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_46:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_47:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_48:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_49:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_50:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_51:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_52:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_53:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_41]], implicit [[V_ADD_U32_e64_53]], implicit [[V_ADD_U32_e64_52]], implicit [[V_ADD_U32_e64_51]], implicit [[V_ADD_U32_e64_50]], implicit [[V_ADD_U32_e64_49]], implicit [[V_ADD_U32_e64_48]], implicit [[V_ADD_U32_e64_47]], implicit [[V_ADD_U32_e64_46]], implicit [[V_ADD_U32_e64_45]], implicit [[V_ADD_U32_e64_44]], implicit [[V_ADD_U32_e64_43]], implicit [[V_ADD_U32_e64_42]], implicit [[V_ADD_U32_e64_27]], implicit [[V_ADD_U32_e64_40]], implicit [[V_ADD_U32_e64_39]], implicit [[V_ADD_U32_e64_38]], implicit [[V_ADD_U32_e64_37]], implicit [[V_ADD_U32_e64_36]], implicit [[V_ADD_U32_e64_35]], implicit [[V_ADD_U32_e64_34]], implicit [[V_ADD_U32_e64_33]], implicit [[V_ADD_U32_e64_32]], implicit [[V_ADD_U32_e64_31]], implicit [[V_ADD_U32_e64_30]], implicit [[V_ADD_U32_e64_29]], implicit [[V_ADD_U32_e64_28]]
   ; GFX10-SUNK-NEXT:   S_BRANCH %bb.4
   ; GFX10-SUNK-NEXT: {{  $}}
   ; GFX10-SUNK-NEXT: bb.4:
@@ -1078,19 +1031,6 @@ body:             |
   ; GFX9-SUNK-NEXT:   [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
   ; GFX9-SUNK-NEXT:   [[DEF3:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
   ; GFX9-SUNK-NEXT:   [[DEF4:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
-  ; GFX9-SUNK-NEXT:   [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[DEF]].sub2, [[DEF1]].sub4, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.1
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.1:
-  ; GFX9-SUNK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_CO_U32_e64_]], implicit [[V_ADD_CO_U32_e64_1]]
-  ; GFX9-SUNK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
-  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.3
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.2:
-  ; GFX9-SUNK-NEXT:   successors: %bb.4(0x80000000)
-  ; GFX9-SUNK-NEXT: {{  $}}
   ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
   ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
   ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
@@ -1104,56 +1044,46 @@ body:             |
   ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
   ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
   ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_25:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[DEF]].sub2, [[DEF1]].sub4, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.1
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.1:
+  ; GFX9-SUNK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_CO_U32_e64_]], implicit [[V_ADD_CO_U32_e64_1]]
+  ; GFX9-SUNK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.3
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.2:
+  ; GFX9-SUNK-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
   ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
-  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]], implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_9]], implicit [[V_ADD_U32_e64_10]], implicit [[V_ADD_U32_e64_11]], implicit [[V_ADD_U32_e64_12]], implicit [[V_ADD_U32_e64_13]], implicit [[V_ADD_U32_e64_14]], implicit [[V_ADD_U32_e64_15]], implicit [[V_ADD_U32_e64_16]], implicit [[V_ADD_U32_e64_17]], implicit [[V_ADD_U32_e64_18]], implicit [[V_ADD_U32_e64_19]], implicit [[V_ADD_U32_e64_20]], implicit [[V_ADD_U32_e64_21]], implicit [[V_ADD_U32_e64_22]], implicit [[V_ADD_U32_e64_23]], implicit [[V_ADD_U32_e64_24]], implicit [[V_ADD_U32_e64_25]], implicit [[V_ADD_U32_e64_26]]
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_25:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]], implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_9]], implicit [[V_ADD_U32_e64_10]], implicit [[V_ADD_U32_e64_11]], implicit [[V_ADD_U32_e64_12]], implicit [[V_ADD_U32_e64_23]], implicit [[V_ADD_U32_e64_13]], implicit [[V_ADD_U32_e64_14]], implicit [[V_ADD_U32_e64_15]], implicit [[V_ADD_U32_e64_16]], implicit [[V_ADD_U32_e64_17]], implicit [[V_ADD_U32_e64_18]], implicit [[V_ADD_U32_e64_19]], implicit [[V_ADD_U32_e64_20]], implicit [[V_ADD_U32_e64_21]], implicit [[V_ADD_U32_e64_22]], implicit [[V_ADD_U32_e64_26]], implicit [[V_ADD_U32_e64_25]], implicit [[V_ADD_U32_e64_24]]
   ; GFX9-SUNK-NEXT:   S_BRANCH %bb.4
   ; GFX9-SUNK-NEXT: {{  $}}
   ; GFX9-SUNK-NEXT: bb.3:
   ; GFX9-SUNK-NEXT:   successors: %bb.4(0x80000000)
   ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_31:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_32:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_33:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_34:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_35:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_36:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_37:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_38:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_39:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_40:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_41:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_42:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_43:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_44:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_45:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_46:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_47:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_48:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_49:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_50:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_51:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_52:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_53:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
   ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
-  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_27]], implicit [[V_ADD_U32_e64_28]], implicit [[V_ADD_U32_e64_29]], implicit [[V_ADD_U32_e64_30]], implicit [[V_ADD_U32_e64_31]], implicit [[V_ADD_U32_e64_32]], implicit [[V_ADD_U32_e64_33]], implicit [[V_ADD_U32_e64_34]], implicit [[V_ADD_U32_e64_35]], implicit [[V_ADD_U32_e64_36]], implicit [[V_ADD_U32_e64_37]], implicit [[V_ADD_U32_e64_38]], implicit [[V_ADD_U32_e64_39]], implicit [[V_ADD_U32_e64_40]], implicit [[V_ADD_U32_e64_41]], implicit [[V_ADD_U32_e64_42]], implicit [[V_ADD_U32_e64_43]], implicit [[V_ADD_U32_e64_44]], implicit [[V_ADD_U32_e64_45]], implicit [[V_ADD_U32_e64_46]], implicit [[V_ADD_U32_e64_47]], implicit [[V_ADD_U32_e64_48]], implicit [[V_ADD_U32_e64_49]], implicit [[V_ADD_U32_e64_50]], implicit [[V_ADD_U32_e64_51]], implicit [[V_ADD_U32_e64_52]], implicit [[V_ADD_U32_e64_53]]
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]], implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_9]], implicit [[V_ADD_U32_e64_10]], implicit [[V_ADD_U32_e64_11]], implicit [[V_ADD_U32_e64_12]], implicit [[V_ADD_U32_e64_27]], implicit [[V_ADD_U32_e64_13]], implicit [[V_ADD_U32_e64_14]], implicit [[V_ADD_U32_e64_15]], implicit [[V_ADD_U32_e64_16]], implicit [[V_ADD_U32_e64_17]], implicit [[V_ADD_U32_e64_18]], implicit [[V_ADD_U32_e64_19]], implicit [[V_ADD_U32_e64_20]], implicit [[V_ADD_U32_e64_21]], implicit [[V_ADD_U32_e64_22]], implicit [[V_ADD_U32_e64_30]], implicit [[V_ADD_U32_e64_29]], implicit [[V_ADD_U32_e64_28]]
   ; GFX9-SUNK-NEXT:   S_BRANCH %bb.4
   ; GFX9-SUNK-NEXT: {{  $}}
   ; GFX9-SUNK-NEXT: bb.4:
@@ -1189,69 +1119,69 @@ body:             |
   ; GFX10-SUNK-NEXT: bb.2:
   ; GFX10-SUNK-NEXT:   successors: %bb.4(0x80000000)
   ; GFX10-SUNK-NEXT: {{  $}}
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_25:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
   ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
-  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]], implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_9]], implicit [[V_ADD_U32_e64_10]], implicit [[V_ADD_U32_e64_11]], implicit [[V_ADD_U32_e64_12]], implicit [[V_ADD_U32_e64_13]], implicit [[V_ADD_U32_e64_14]], implicit [[V_ADD_U32_e64_15]], implicit [[V_ADD_U32_e64_16]], implicit [[V_ADD_U32_e64_17]], implicit [[V_ADD_U32_e64_18]], implicit [[V_ADD_U32_e64_19]], implicit [[V_ADD_U32_e64_20]], implicit [[V_ADD_U32_e64_21]], implicit [[V_ADD_U32_e64_22]], implicit [[V_ADD_U32_e64_23]], implicit [[V_ADD_U32_e64_24]], implicit [[V_ADD_U32_e64_25]], implicit [[V_ADD_U32_e64_26]]
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_25:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_14]], implicit [[V_ADD_U32_e64_26]], implicit [[V_ADD_U32_e64_25]], implicit [[V_ADD_U32_e64_24]], implicit [[V_ADD_U32_e64_23]], implicit [[V_ADD_U32_e64_22]], implicit [[V_ADD_U32_e64_21]], implicit [[V_ADD_U32_e64_20]], implicit [[V_ADD_U32_e64_19]], implicit [[V_ADD_U32_e64_18]], implicit [[V_ADD_U32_e64_17]], implicit [[V_ADD_U32_e64_16]], implicit [[V_ADD_U32_e64_15]], implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_13]], implicit [[V_ADD_U32_e64_12]], implicit [[V_ADD_U32_e64_11]], implicit [[V_ADD_U32_e64_10]], implicit [[V_ADD_U32_e64_9]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_4]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_1]]
   ; GFX10-SUNK-NEXT:   S_BRANCH %bb.4
   ; GFX10-SUNK-NEXT: {{  $}}
   ; GFX10-SUNK-NEXT: bb.3:
   ; GFX10-SUNK-NEXT:   successors: %bb.4(0x80000000)
   ; GFX10-SUNK-NEXT: {{  $}}
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_31:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_32:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_33:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_34:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_35:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_36:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_37:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_38:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_39:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_40:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_41:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_42:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_43:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_44:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_45:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_46:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_47:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_48:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_49:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_50:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_51:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_52:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_53:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
   ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
-  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_27]], implicit [[V_ADD_U32_e64_28]], implicit [[V_ADD_U32_e64_29]], implicit [[V_ADD_U32_e64_30]], implicit [[V_ADD_U32_e64_31]], implicit [[V_ADD_U32_e64_32]], implicit [[V_ADD_U32_e64_33]], implicit [[V_ADD_U32_e64_34]], implicit [[V_ADD_U32_e64_35]], implicit [[V_ADD_U32_e64_36]], implicit [[V_ADD_U32_e64_37]], implicit [[V_ADD_U32_e64_38]], implicit [[V_ADD_U32_e64_39]], implicit [[V_ADD_U32_e64_40]], implicit [[V_ADD_U32_e64_41]], implicit [[V_ADD_U32_e64_42]], implicit [[V_ADD_U32_e64_43]], implicit [[V_ADD_U32_e64_44]], implicit [[V_ADD_U32_e64_45]], implicit [[V_ADD_U32_e64_46]], implicit [[V_ADD_U32_e64_47]], implicit [[V_ADD_U32_e64_48]], implicit [[V_ADD_U32_e64_49]], implicit [[V_ADD_U32_e64_50]], implicit [[V_ADD_U32_e64_51]], implicit [[V_ADD_U32_e64_52]], implicit [[V_ADD_U32_e64_53]]
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_31:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_32:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_33:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_34:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_35:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_36:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_37:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_38:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_39:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_40:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_41:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_42:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_43:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_44:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_45:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_46:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_47:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_48:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_49:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_50:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_51:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_52:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_53:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_41]], implicit [[V_ADD_U32_e64_53]], implicit [[V_ADD_U32_e64_52]], implicit [[V_ADD_U32_e64_51]], implicit [[V_ADD_U32_e64_50]], implicit [[V_ADD_U32_e64_49]], implicit [[V_ADD_U32_e64_48]], implicit [[V_ADD_U32_e64_47]], implicit [[V_ADD_U32_e64_46]], implicit [[V_ADD_U32_e64_45]], implicit [[V_ADD_U32_e64_44]], implicit [[V_ADD_U32_e64_43]], implicit [[V_ADD_U32_e64_42]], implicit [[V_ADD_U32_e64_27]], implicit [[V_ADD_U32_e64_40]], implicit [[V_ADD_U32_e64_39]], implicit [[V_ADD_U32_e64_38]], implicit [[V_ADD_U32_e64_37]], implicit [[V_ADD_U32_e64_36]], implicit [[V_ADD_U32_e64_35]], implicit [[V_ADD_U32_e64_34]], implicit [[V_ADD_U32_e64_33]], implicit [[V_ADD_U32_e64_32]], implicit [[V_ADD_U32_e64_31]], implicit [[V_ADD_U32_e64_30]], implicit [[V_ADD_U32_e64_29]], implicit [[V_ADD_U32_e64_28]]
   ; GFX10-SUNK-NEXT:   S_BRANCH %bb.4
   ; GFX10-SUNK-NEXT: {{  $}}
   ; GFX10-SUNK-NEXT: bb.4:
@@ -1346,12 +1276,12 @@ body:             |
   ; GFX9-SUNK-NEXT:   [[DEF7:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
   ; GFX9-SUNK-NEXT:   [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; GFX9-SUNK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF8]], [[DEF9]], 0, implicit $exec
   ; GFX9-SUNK-NEXT:   S_BRANCH %bb.1
   ; GFX9-SUNK-NEXT: {{  $}}
   ; GFX9-SUNK-NEXT: bb.1:
   ; GFX9-SUNK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
   ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF8]], [[DEF9]], 0, implicit $exec
   ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]], implicit [[DEF5]], implicit [[DEF6]], implicit [[DEF7]]
   ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]]
   ; GFX9-SUNK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
@@ -1376,12 +1306,12 @@ body:             |
   ; GFX10-SUNK-NEXT:   [[DEF7:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
   ; GFX10-SUNK-NEXT:   [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; GFX10-SUNK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF8]], [[DEF9]], 0, implicit $exec
   ; GFX10-SUNK-NEXT:   S_BRANCH %bb.1
   ; GFX10-SUNK-NEXT: {{  $}}
   ; GFX10-SUNK-NEXT: bb.1:
   ; GFX10-SUNK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
   ; GFX10-SUNK-NEXT: {{  $}}
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF8]], [[DEF9]], 0, implicit $exec
   ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]], implicit [[DEF5]], implicit [[DEF6]], implicit [[DEF7]]
   ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]]
   ; GFX10-SUNK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
@@ -1438,13 +1368,13 @@ body:             |
   ; GFX9-SUNK-NEXT:   [[DEF7:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
   ; GFX9-SUNK-NEXT:   [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; GFX9-SUNK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF8]], [[DEF9]], 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_MUL_U32_U24_e64_:%[0-9]+]]:vgpr_32 = V_MUL_U32_U24_e64 [[V_ADD_U32_e64_]], [[DEF9]], 0, implicit $exec
   ; GFX9-SUNK-NEXT:   S_BRANCH %bb.1
   ; GFX9-SUNK-NEXT: {{  $}}
   ; GFX9-SUNK-NEXT: bb.1:
   ; GFX9-SUNK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
   ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF8]], [[DEF9]], 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_MUL_U32_U24_e64_:%[0-9]+]]:vgpr_32 = V_MUL_U32_U24_e64 [[V_ADD_U32_e64_]], [[DEF9]], 0, implicit $exec
   ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]], implicit [[DEF5]], implicit [[DEF6]], implicit [[DEF7]]
   ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_MUL_U32_U24_e64_]]
   ; GFX9-SUNK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
@@ -1469,13 +1399,13 @@ body:             |
   ; GFX10-SUNK-NEXT:   [[DEF7:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
   ; GFX10-SUNK-NEXT:   [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; GFX10-SUNK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF8]], [[DEF9]], 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_MUL_U32_U24_e64_:%[0-9]+]]:vgpr_32 = V_MUL_U32_U24_e64 [[V_ADD_U32_e64_]], [[DEF9]], 0, implicit $exec
   ; GFX10-SUNK-NEXT:   S_BRANCH %bb.1
   ; GFX10-SUNK-NEXT: {{  $}}
   ; GFX10-SUNK-NEXT: bb.1:
   ; GFX10-SUNK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
   ; GFX10-SUNK-NEXT: {{  $}}
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF8]], [[DEF9]], 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_MUL_U32_U24_e64_:%[0-9]+]]:vgpr_32 = V_MUL_U32_U24_e64 [[V_ADD_U32_e64_]], [[DEF9]], 0, implicit $exec
   ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]], implicit [[DEF5]], implicit [[DEF6]], implicit [[DEF7]]
   ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_MUL_U32_U24_e64_]]
   ; GFX10-SUNK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
@@ -1676,7 +1606,6 @@ body:             |
   ; GFX9-SUNK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; GFX9-SUNK-NEXT:   [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF9]], [[DEF9]], 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF10]], [[DEF10]], 0, implicit $exec
   ; GFX9-SUNK-NEXT:   S_BRANCH %bb.1
   ; GFX9-SUNK-NEXT: {{  $}}
   ; GFX9-SUNK-NEXT: bb.1:
@@ -1689,6 +1618,7 @@ body:             |
   ; GFX9-SUNK-NEXT: bb.2:
   ; GFX9-SUNK-NEXT:   successors: %bb.4(0x80000000)
   ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF10]], [[DEF10]], 0, implicit $exec
   ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]]
   ; GFX9-SUNK-NEXT:   S_BRANCH %bb.4
   ; GFX9-SUNK-NEXT: {{  $}}
@@ -1736,9 +1666,9 @@ body:             |
   ; GFX10-SUNK-NEXT: bb.2:
   ; GFX10-SUNK-NEXT:   successors: %bb.4(0x80000000)
   ; GFX10-SUNK-NEXT: {{  $}}
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF9]], [[DEF9]], 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF10]], [[DEF10]], 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]]
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF10]], [[DEF10]], 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF9]], [[DEF9]], 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_]]
   ; GFX10-SUNK-NEXT:   S_BRANCH %bb.4
   ; GFX10-SUNK-NEXT: {{  $}}
   ; GFX10-SUNK-NEXT: bb.3:
@@ -1813,8 +1743,6 @@ body:             |
   ; GFX9-SUNK-NEXT:   [[DEF3:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
   ; GFX9-SUNK-NEXT:   [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; GFX9-SUNK-NEXT:   [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF4]], [[DEF5]], 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_MUL_U32_U24_e64_:%[0-9]+]]:vgpr_32 = V_MUL_U32_U24_e64 [[V_ADD_U32_e64_]], [[DEF5]], 0, implicit $exec
   ; GFX9-SUNK-NEXT:   S_BRANCH %bb.1
   ; GFX9-SUNK-NEXT: {{  $}}
   ; GFX9-SUNK-NEXT: bb.1:
@@ -1825,6 +1753,8 @@ body:             |
   ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, 12 /* clobber */, def %9
   ; GFX9-SUNK-NEXT:   [[COPY:%[0-9]+]]:vreg_256_align2 = COPY %8
   ; GFX9-SUNK-NEXT:   [[COPY1:%[0-9]+]]:vreg_256_align2 = COPY %9
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF4]], [[DEF5]], 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_MUL_U32_U24_e64_:%[0-9]+]]:vgpr_32 = V_MUL_U32_U24_e64 [[V_ADD_U32_e64_]], [[DEF5]], 0, implicit $exec
   ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_MUL_U32_U24_e64_]], implicit %8, implicit %9, implicit [[COPY]], implicit [[COPY1]], implicit [[DEF4]], implicit [[DEF5]]
   ; GFX9-SUNK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
   ; GFX9-SUNK-NEXT:   S_BRANCH %bb.2
@@ -1852,13 +1782,13 @@ body:             |
   ; GFX10-SUNK-NEXT: bb.1:
   ; GFX10-SUNK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
   ; GFX10-SUNK-NEXT: {{  $}}
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF4]], [[DEF5]], 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_MUL_U32_U24_e64_:%[0-9]+]]:vgpr_32 = V_MUL_U32_U24_e64 [[V_ADD_U32_e64_]], [[DEF5]], 0, implicit $exec
   ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]]
   ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, 12 /* clobber */, def %8
   ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, 12 /* clobber */, def %9
   ; GFX10-SUNK-NEXT:   [[COPY:%[0-9]+]]:vreg_256 = COPY %8
   ; GFX10-SUNK-NEXT:   [[COPY1:%[0-9]+]]:vreg_256 = COPY %9
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF4]], [[DEF5]], 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_MUL_U32_U24_e64_:%[0-9]+]]:vgpr_32 = V_MUL_U32_U24_e64 [[V_ADD_U32_e64_]], [[DEF5]], 0, implicit $exec
   ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_MUL_U32_U24_e64_]], implicit %8, implicit %9, implicit [[COPY]], implicit [[COPY1]], implicit [[DEF4]], implicit [[DEF5]]
   ; GFX10-SUNK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
   ; GFX10-SUNK-NEXT:   S_BRANCH %bb.2
diff --git a/llvm/test/CodeGen/SystemZ/machinelicm-sunk-kill-flags.mir b/llvm/test/CodeGen/SystemZ/machinelicm-sunk-kill-flags.mir
index f23afe52f97de..e3986831d996b 100644
--- a/llvm/test/CodeGen/SystemZ/machinelicm-sunk-kill-flags.mir
+++ b/llvm/test/CodeGen/SystemZ/machinelicm-sunk-kill-flags.mir
@@ -3,19 +3,6 @@
 
 # Test kill flags are cleared after sinking operations into cycle during MachineLICM.
 
---- |
-  target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-a:8:16-n32:64"
-  target triple = "s390x-unknown-linux-gnu"
-
-  %0 = type <{ i32, [3 x i8] }>
-
-  @b = external dso_local global [1 x %0], align 2
-
-  define dso_local void @c() local_unnamed_addr {
-    ret void
-  }
-
-...
 ---
 name:            c
 tracksRegLiveness: true
@@ -24,54 +11,26 @@ body:             |
   ; CHECK: bb.0:
   ; CHECK-NEXT:   successors: %bb.1(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[LARL:%[0-9]+]]:addr64bit = LARL @b
-  ; CHECK-NEXT:   [[LA:%[0-9]+]]:gr64bit = LA killed [[LARL]], 49, $noreg
-  ; CHECK-NEXT:   [[LGHI:%[0-9]+]]:gr64bit = LGHI 7
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:gr64bit = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:gr64bit = IMPLICIT_DEF
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.1(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gr64bit = COPY [[LA]]
-  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0
-  ; CHECK-NEXT:   $r2d = COPY [[DEF]]
-  ; CHECK-NEXT:   $r3d = COPY [[COPY]]
-  ; CHECK-NEXT:   $r4d = COPY [[LGHI]]
-  ; CHECK-NEXT:   CallBRASL &memcpy, $r2d, $r3d, $r4d, csr_systemz_elf, implicit-def dead $r14d, implicit-def dead $cc, implicit $fpc, implicit-def $r2d
-  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0
-  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0
-  ; CHECK-NEXT:   $r2d = COPY [[DEF1]]
-  ; CHECK-NEXT:   $r3d = COPY [[COPY]]
-  ; CHECK-NEXT:   $r4d = COPY [[LGHI]]
-  ; CHECK-NEXT:   CallBRASL &memcpy, $r2d, $r3d, $r4d, csr_systemz_elf, implicit-def dead $r14d, implicit-def dead $cc, implicit $fpc, implicit-def $r2d
-  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gr64bit = COPY [[DEF]]
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gr64bit = COPY [[COPY]]
+  ; CHECK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[COPY1]], implicit [[DEF]]
   ; CHECK-NEXT:   J %bb.1
   bb.0:
     successors: %bb.1(0x80000000)
 
-    %1:addr64bit = LARL @b
-    %0:gr64bit = LA killed %1, 49, $noreg
-    %2:gr64bit = LGHI 7
-    %3:gr64bit = IMPLICIT_DEF
-    %5:gr64bit = IMPLICIT_DEF
-    %6:gr64bit = COPY killed %0
+    %7:gr64bit = IMPLICIT_DEF
+    %8:gr64bit = COPY %7
+    %9:gr64bit = COPY killed %8
 
   bb.1:
     successors: %bb.1(0x80000000)
 
-    ADJCALLSTACKDOWN 0, 0
-    $r2d = COPY %3
-    $r3d = COPY %6
-    $r4d = COPY %2
-    CallBRASL &memcpy, $r2d, $r3d, $r4d, csr_systemz_elf, implicit-def dead $r14d, implicit-def dead $cc, implicit $fpc, implicit-def $r2d
-    ADJCALLSTACKUP 0, 0
-    ADJCALLSTACKDOWN 0, 0
-    $r2d = COPY %5
-    $r3d = COPY %6
-    $r4d = COPY %2
-    CallBRASL &memcpy, $r2d, $r3d, $r4d, csr_systemz_elf, implicit-def dead $r14d, implicit-def dead $cc, implicit $fpc, implicit-def $r2d
-    ADJCALLSTACKUP 0, 0
+    INLINEASM &"", 1 , implicit %9, implicit %7
     J %bb.1
 
 ...



More information about the llvm-commits mailing list