[llvm] 4a36e96 - RegAllocGreedy: Account for reserved registers in num regs heuristic

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Tue Sep 14 18:00:50 PDT 2021


Author: Matt Arsenault
Date: 2021-09-14T21:00:29-04:00
New Revision: 4a36e96c3fc2a9128097bfc4f907ccebc5dc66af

URL: https://github.com/llvm/llvm-project/commit/4a36e96c3fc2a9128097bfc4f907ccebc5dc66af
DIFF: https://github.com/llvm/llvm-project/commit/4a36e96c3fc2a9128097bfc4f907ccebc5dc66af.diff

LOG: RegAllocGreedy: Account for reserved registers in num regs heuristic

This simple heuristic combines the estimated live range length with
the number of registers in the class to decide which allocation
heuristic to use. It was taking the raw number of registers in the
class, even though not all of them may be available. AMDGPU relies
heavily on dynamically reserving registers based on user attributes to
satisfy occupancy constraints, so the raw number is highly misleading.
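
For reference, a simplified sketch of the check in RAGreedy::enqueue()
(the exact change is in the RegAllocGreedy.cpp hunk below); RCI is the
pass's RegisterClassInfo, which already excludes reserved registers:

    // Sketch only: pick the global strategy when the live range is long
    // relative to the number of *allocatable* registers in the class.
    const TargetRegisterClass &RC = *MRI->getRegClass(Reg);
    unsigned NumRegs = RCI.getNumAllocatableRegs(&RC); // was RC.getNumRegs()
    bool ForceGlobal = !TRI->reverseLocalAssignment() &&
                       (Size / SlotIndex::InstrDist) > (2 * NumRegs);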

There are still a few problems here. In the original testcase that
made me notice this, the live range size is incorrect after the
scheduler rearranges instructions, since the instructions don't have
the original InstrDist offsets. Additionally, I think it would be more
appropriate to use the number of disjointly allocatable registers in
the class. For the AMDGPU register tuples, there are a large number of
registers in each tuple class, but only a small fraction can actually
be allocated at the same time since they all overlap with each
other. It seems we do not have a query that corresponds to the number
of independently allocatable registers. Relatedly, I'm still debugging
some allocation failures where overlapping tuples do not seem to be
handled correctly.
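
To make the "disjointly allocatable" idea concrete, here is a purely
hypothetical helper (no such query exists in TargetRegisterInfo or
RegisterClassInfo today, and the numbers in the comment are only
illustrative):

    // Hypothetical approximation, not an existing LLVM API: a class of
    // N-wide register tuples built from overlapping base registers can
    // only supply about AllocatableBaseRegs / N non-overlapping
    // assignments at once, even though it enumerates far more tuples.
    static unsigned approxDisjointAllocatable(unsigned AllocatableBaseRegs,
                                              unsigned TupleWidth) {
      return TupleWidth ? AllocatableBaseRegs / TupleWidth : 0;
    }
    // e.g. with 128 allocatable VGPRs, a 4-VGPR tuple class yields
    // roughly 128 / 4 = 32 disjoint tuples.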

The test changes are mostly noise. There are a handful of x86 tests
that look like regressions with an additional spill, and a handful
that now avoid a spill. The worst-looking regression is likely
test/Thumb2/mve-vld4.ll, which introduces a few additional
spills. test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll
shows a massive improvement by completely eliminating a large number
of spills inside a loop.

Added: 
    llvm/test/CodeGen/AMDGPU/greedy-global-heuristic.mir

Modified: 
    llvm/lib/CodeGen/RegAllocGreedy.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
    llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
    llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
    llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
    llvm/test/CodeGen/AMDGPU/frem.ll
    llvm/test/CodeGen/AMDGPU/half.ll
    llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
    llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
    llvm/test/CodeGen/AMDGPU/load-global-i16.ll
    llvm/test/CodeGen/AMDGPU/sdiv64.ll
    llvm/test/CodeGen/AMDGPU/shl.ll
    llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll
    llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir
    llvm/test/CodeGen/AMDGPU/srl.ll
    llvm/test/CodeGen/ARM/fptosi-sat-scalar.ll
    llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll
    llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll
    llvm/test/CodeGen/Hexagon/reg-scavengebug-2.ll
    llvm/test/CodeGen/Mips/cconv/vector.ll
    llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll
    llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll
    llvm/test/CodeGen/RISCV/rv32zbp.ll
    llvm/test/CodeGen/RISCV/rv64zbp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
    llvm/test/CodeGen/RISCV/stack-store-check.ll
    llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll
    llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll
    llvm/test/CodeGen/Thumb2/mve-simple-arith.ll
    llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll
    llvm/test/CodeGen/X86/2007-10-12-SpillerUnfold1.ll
    llvm/test/CodeGen/X86/2008-04-16-ReMatBug.ll
    llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll
    llvm/test/CodeGen/X86/abs.ll
    llvm/test/CodeGen/X86/avx512-calling-conv.ll
    llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll
    llvm/test/CodeGen/X86/avx512-select.ll
    llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
    llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
    llvm/test/CodeGen/X86/bitreverse.ll
    llvm/test/CodeGen/X86/bool-vector.ll
    llvm/test/CodeGen/X86/bswap.ll
    llvm/test/CodeGen/X86/build-vector-128.ll
    llvm/test/CodeGen/X86/clear-highbits.ll
    llvm/test/CodeGen/X86/combine-sbb.ll
    llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
    llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
    llvm/test/CodeGen/X86/fp128-cast.ll
    llvm/test/CodeGen/X86/fptosi-sat-scalar.ll
    llvm/test/CodeGen/X86/fshr.ll
    llvm/test/CodeGen/X86/funnel-shift-rot.ll
    llvm/test/CodeGen/X86/funnel-shift.ll
    llvm/test/CodeGen/X86/gather-addresses.ll
    llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
    llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
    llvm/test/CodeGen/X86/horizontal-reduce-smax.ll
    llvm/test/CodeGen/X86/horizontal-reduce-smin.ll
    llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
    llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
    llvm/test/CodeGen/X86/i128-mul.ll
    llvm/test/CodeGen/X86/i128-sdiv.ll
    llvm/test/CodeGen/X86/i256-add.ll
    llvm/test/CodeGen/X86/i64-to-float.ll
    llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll
    llvm/test/CodeGen/X86/known-signbits-vector.ll
    llvm/test/CodeGen/X86/legalize-shl-vec.ll
    llvm/test/CodeGen/X86/load-combine.ll
    llvm/test/CodeGen/X86/masked_gather_scatter.ll
    llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll
    llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll
    llvm/test/CodeGen/X86/mmx-arith.ll
    llvm/test/CodeGen/X86/mul-constant-i64.ll
    llvm/test/CodeGen/X86/mul-constant-result.ll
    llvm/test/CodeGen/X86/mul-i1024.ll
    llvm/test/CodeGen/X86/mul-i256.ll
    llvm/test/CodeGen/X86/mul-i512.ll
    llvm/test/CodeGen/X86/mul128.ll
    llvm/test/CodeGen/X86/neg-abs.ll
    llvm/test/CodeGen/X86/nontemporal.ll
    llvm/test/CodeGen/X86/nosse-vector.ll
    llvm/test/CodeGen/X86/overflow.ll
    llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
    llvm/test/CodeGen/X86/popcnt.ll
    llvm/test/CodeGen/X86/pr31088.ll
    llvm/test/CodeGen/X86/pr32284.ll
    llvm/test/CodeGen/X86/pr32329.ll
    llvm/test/CodeGen/X86/pr32610.ll
    llvm/test/CodeGen/X86/pr34080-2.ll
    llvm/test/CodeGen/X86/pr46527.ll
    llvm/test/CodeGen/X86/sadd_sat.ll
    llvm/test/CodeGen/X86/scheduler-backtracking.ll
    llvm/test/CodeGen/X86/sdiv_fix.ll
    llvm/test/CodeGen/X86/sdiv_fix_sat.ll
    llvm/test/CodeGen/X86/select.ll
    llvm/test/CodeGen/X86/setcc-wide-types.ll
    llvm/test/CodeGen/X86/shrink_vmul.ll
    llvm/test/CodeGen/X86/smax.ll
    llvm/test/CodeGen/X86/smin.ll
    llvm/test/CodeGen/X86/smul_fix.ll
    llvm/test/CodeGen/X86/smul_fix_sat.ll
    llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
    llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
    llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
    llvm/test/CodeGen/X86/sshl_sat.ll
    llvm/test/CodeGen/X86/sshl_sat_vec.ll
    llvm/test/CodeGen/X86/stack-align-memcpy.ll
    llvm/test/CodeGen/X86/statepoint-vreg-unlimited-tied-opnds.ll
    llvm/test/CodeGen/X86/subvector-broadcast.ll
    llvm/test/CodeGen/X86/uadd_sat.ll
    llvm/test/CodeGen/X86/udiv_fix_sat.ll
    llvm/test/CodeGen/X86/umax.ll
    llvm/test/CodeGen/X86/umin.ll
    llvm/test/CodeGen/X86/umul-with-overflow.ll
    llvm/test/CodeGen/X86/umul_fix.ll
    llvm/test/CodeGen/X86/umul_fix_sat.ll
    llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll
    llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
    llvm/test/CodeGen/X86/ushl_sat.ll
    llvm/test/CodeGen/X86/ushl_sat_vec.ll
    llvm/test/CodeGen/X86/usub_sat.ll
    llvm/test/CodeGen/X86/vec-strict-cmp-128.ll
    llvm/test/CodeGen/X86/vec-strict-cmp-sub128.ll
    llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
    llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll
    llvm/test/CodeGen/X86/vec_shift4.ll
    llvm/test/CodeGen/X86/vec_umulo.ll
    llvm/test/CodeGen/X86/vector-fshl-128.ll
    llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
    llvm/test/CodeGen/X86/vector-fshr-128.ll
    llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
    llvm/test/CodeGen/X86/vector-gep.ll
    llvm/test/CodeGen/X86/vector-idiv-v2i32.ll
    llvm/test/CodeGen/X86/vector-lzcnt-128.ll
    llvm/test/CodeGen/X86/vector-rotate-128.ll
    llvm/test/CodeGen/X86/vector-sext.ll
    llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
    llvm/test/CodeGen/X86/vector-shift-shl-256.ll
    llvm/test/CodeGen/X86/vector-trunc-ssat.ll
    llvm/test/CodeGen/X86/vector-tzcnt-128.ll
    llvm/test/CodeGen/X86/vshift-6.ll
    llvm/test/CodeGen/X86/widen_cast-4.ll
    llvm/test/CodeGen/X86/xmulo.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index 688c543ac6274..258946a03208d 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -762,7 +762,7 @@ void RAGreedy::enqueue(PQueue &CurQueue, LiveInterval *LI) {
     bool ReverseLocal = TRI->reverseLocalAssignment();
     const TargetRegisterClass &RC = *MRI->getRegClass(Reg);
     bool ForceGlobal = !ReverseLocal &&
-      (Size / SlotIndex::InstrDist) > (2 * RC.getNumRegs());
+      (Size / SlotIndex::InstrDist) > (2 * RCI.getNumAllocatableRegs(&RC));
 
     if (ExtraRegInfo[Reg].Stage == RS_Assign && !ForceGlobal && !LI->empty() &&
         LIS->intervalIsInOneMBB(*LI)) {

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
index d4c1670b1c56d..3e8b22638324d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
@@ -507,238 +507,197 @@ define i64 @v_extract_v32i64_varidx(<32 x i64> addrspace(1)* %ptr, i32 %idx) {
 ; GCN-NEXT:    s_mov_b32 s4, s33
 ; GCN-NEXT:    s_add_i32 s33, s32, 0x3fc0
 ; GCN-NEXT:    s_and_b32 s33, s33, 0xffffc000
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
-; GCN-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[11:14], v[0:1], off offset:32
-; GCN-NEXT:    global_load_dwordx4 v[15:18], v[0:1], off offset:48
-; GCN-NEXT:    global_load_dwordx4 v[19:22], v[0:1], off offset:64
-; GCN-NEXT:    global_load_dwordx4 v[23:26], v[0:1], off offset:80
-; GCN-NEXT:    global_load_dwordx4 v[27:30], v[0:1], off offset:96
-; GCN-NEXT:    global_load_dwordx4 v[31:34], v[0:1], off offset:112
-; GCN-NEXT:    global_load_dwordx4 v[35:38], v[0:1], off offset:128
-; GCN-NEXT:    global_load_dwordx4 v[39:42], v[0:1], off offset:144
-; GCN-NEXT:    global_load_dwordx4 v[43:46], v[0:1], off offset:160
-; GCN-NEXT:    global_load_dwordx4 v[47:50], v[0:1], off offset:176
+; GCN-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off offset:16
+; GCN-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:32
+; GCN-NEXT:    global_load_dwordx4 v[24:27], v[0:1], off offset:48
+; GCN-NEXT:    global_load_dwordx4 v[28:31], v[0:1], off offset:64
+; GCN-NEXT:    global_load_dwordx4 v[32:35], v[0:1], off offset:80
+; GCN-NEXT:    global_load_dwordx4 v[36:39], v[0:1], off offset:96
+; GCN-NEXT:    global_load_dwordx4 v[40:43], v[0:1], off offset:112
+; GCN-NEXT:    global_load_dwordx4 v[44:47], v[0:1], off offset:128
+; GCN-NEXT:    global_load_dwordx4 v[48:51], v[0:1], off offset:144
+; GCN-NEXT:    global_load_dwordx4 v[52:55], v[0:1], off offset:160
+; GCN-NEXT:    global_load_dwordx4 v[56:59], v[0:1], off offset:176
+; GCN-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off offset:192
 ; GCN-NEXT:    s_add_i32 s32, s32, 0x10000
 ; GCN-NEXT:    s_add_i32 s32, s32, 0xffff0000
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v35, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:648 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:680 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill
-; GCN-NEXT:    global_load_dwordx4 v[47:50], v[0:1], off offset:192
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
-; GCN-NEXT:    global_load_dwordx4 v[51:54], v[0:1], off offset:208
+; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v21, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v22, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
+; GCN-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off offset:208
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
-; GCN-NEXT:    global_load_dwordx4 v[55:58], v[0:1], off offset:224
-; GCN-NEXT:    global_load_dwordx4 v[59:62], v[0:1], off offset:240
-; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:256
-; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:260
-; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:264
-; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:268
-; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:272
-; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:276
-; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:280
-; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:284
-; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:288
-; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:292
-; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:296
-; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:300
-; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:304
-; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:308
-; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:312
-; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:316
-; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:320
-; GCN-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:324
-; GCN-NEXT:    buffer_store_dword v21, off, s[0:3], s33 offset:328
-; GCN-NEXT:    buffer_store_dword v22, off, s[0:3], s33 offset:332
-; GCN-NEXT:    buffer_store_dword v23, off, s[0:3], s33 offset:336
-; GCN-NEXT:    buffer_store_dword v24, off, s[0:3], s33 offset:340
-; GCN-NEXT:    buffer_store_dword v25, off, s[0:3], s33 offset:344
-; GCN-NEXT:    buffer_store_dword v26, off, s[0:3], s33 offset:348
-; GCN-NEXT:    buffer_store_dword v27, off, s[0:3], s33 offset:352
-; GCN-NEXT:    buffer_store_dword v28, off, s[0:3], s33 offset:356
-; GCN-NEXT:    buffer_store_dword v29, off, s[0:3], s33 offset:360
-; GCN-NEXT:    buffer_store_dword v30, off, s[0:3], s33 offset:364
-; GCN-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:368
-; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s33 offset:372
-; GCN-NEXT:    buffer_store_dword v33, off, s[0:3], s33 offset:376
-; GCN-NEXT:    buffer_store_dword v34, off, s[0:3], s33 offset:380
-; GCN-NEXT:    buffer_store_dword v35, off, s[0:3], s33 offset:384
-; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:388
-; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:392
-; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:396
-; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:400
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:404
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:408
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:412
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:416
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:420
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:424
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:428
-; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:652 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:676 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
+; GCN-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:224
+; GCN-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:240
 ; GCN-NEXT:    v_and_b32_e32 v0, 31, v2
 ; GCN-NEXT:    v_lshrrev_b32_e64 v2, 6, s33
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GCN-NEXT:    v_add_u32_e32 v2, 0x100, v2
 ; GCN-NEXT:    v_add_u32_e32 v1, v2, v0
+; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:256
+; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:260
+; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:264
+; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:268
+; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:272
+; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:276
+; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:280
+; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:284
+; GCN-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:288
+; GCN-NEXT:    buffer_store_dword v21, off, s[0:3], s33 offset:292
+; GCN-NEXT:    buffer_store_dword v22, off, s[0:3], s33 offset:296
+; GCN-NEXT:    buffer_store_dword v23, off, s[0:3], s33 offset:300
+; GCN-NEXT:    buffer_store_dword v24, off, s[0:3], s33 offset:304
+; GCN-NEXT:    buffer_store_dword v25, off, s[0:3], s33 offset:308
+; GCN-NEXT:    buffer_store_dword v26, off, s[0:3], s33 offset:312
+; GCN-NEXT:    buffer_store_dword v27, off, s[0:3], s33 offset:316
+; GCN-NEXT:    buffer_store_dword v28, off, s[0:3], s33 offset:320
+; GCN-NEXT:    buffer_store_dword v29, off, s[0:3], s33 offset:324
+; GCN-NEXT:    buffer_store_dword v30, off, s[0:3], s33 offset:328
+; GCN-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:332
+; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s33 offset:336
+; GCN-NEXT:    buffer_store_dword v33, off, s[0:3], s33 offset:340
+; GCN-NEXT:    buffer_store_dword v34, off, s[0:3], s33 offset:344
+; GCN-NEXT:    buffer_store_dword v35, off, s[0:3], s33 offset:348
+; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:352
+; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:356
+; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:360
+; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:364
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:368
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:372
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:376
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:380
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:384
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:388
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:392
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:396
+; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:400
+; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:404
+; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:408
+; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:412
+; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:416
+; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:420
+; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:424
+; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:428
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:432
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:436
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:440
+; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:444
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v23, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v24, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v25, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v26, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v27, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v28, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v29, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v30, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v12, v15
-; GCN-NEXT:    v_mov_b32_e32 v13, v16
-; GCN-NEXT:    v_mov_b32_e32 v14, v17
-; GCN-NEXT:    v_mov_b32_e32 v15, v18
-; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:432
-; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:436
-; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:440
-; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:444
-; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v3, v16
+; GCN-NEXT:    v_mov_b32_e32 v4, v17
+; GCN-NEXT:    v_mov_b32_e32 v5, v18
+; GCN-NEXT:    v_mov_b32_e32 v6, v19
 ; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:448
 ; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:452
 ; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:456
 ; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:460
-; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v23, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v24, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v25, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v26, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v27, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v28, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v29, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v30, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v4, v7
-; GCN-NEXT:    v_mov_b32_e32 v5, v8
-; GCN-NEXT:    v_mov_b32_e32 v6, v9
-; GCN-NEXT:    v_mov_b32_e32 v7, v10
+; GCN-NEXT:    v_mov_b32_e32 v4, v20
+; GCN-NEXT:    v_mov_b32_e32 v5, v21
+; GCN-NEXT:    v_mov_b32_e32 v6, v22
+; GCN-NEXT:    v_mov_b32_e32 v7, v23
 ; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:464
 ; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:468
 ; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:472
 ; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:476
-; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:480
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:484
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:488
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:492
-; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:496
-; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:500
-; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:504
-; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:508
+; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:480
+; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:484
+; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:488
+; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:492
+; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:496
+; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:500
+; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:504
+; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:508
 ; GCN-NEXT:    buffer_load_dword v0, v1, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
-; GCN-NEXT:    buffer_load_dword v62, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b32 s33, s4
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index fd97da40302d9..97858b3dae67c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -1076,10 +1076,11 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
 define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX7-LABEL: s_mul_i256:
 ; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_mov_b32 s16, s0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s8
-; GFX7-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GFX7-NEXT:    v_mul_hi_u32 v0, s16, v0
 ; GFX7-NEXT:    s_mul_i32 s17, s1, s8
-; GFX7-NEXT:    s_mul_i32 s18, s0, s9
+; GFX7-NEXT:    s_mul_i32 s18, s16, s9
 ; GFX7-NEXT:    s_add_u32 s17, s17, s18
 ; GFX7-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s17, v0
@@ -1092,11 +1093,11 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX7-NEXT:    s_add_u32 s17, s17, s18
 ; GFX7-NEXT:    v_mul_hi_u32 v2, v2, s8
 ; GFX7-NEXT:    s_cselect_b32 s18, 1, 0
-; GFX7-NEXT:    s_mul_i32 s19, s0, s10
+; GFX7-NEXT:    s_mul_i32 s19, s16, s10
 ; GFX7-NEXT:    s_and_b32 s18, s18, 1
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX7-NEXT:    s_add_u32 s17, s17, s19
-; GFX7-NEXT:    v_mul_hi_u32 v4, s0, v3
+; GFX7-NEXT:    v_mul_hi_u32 v4, s16, v3
 ; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    s_and_b32 s19, s19, 1
 ; GFX7-NEXT:    v_add_i32_e32 v2, vcc, s17, v2
@@ -1120,7 +1121,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    v_mul_hi_u32 v5, v4, s8
 ; GFX7-NEXT:    s_and_b32 s19, s19, 1
-; GFX7-NEXT:    s_mul_i32 s20, s0, s11
+; GFX7-NEXT:    s_mul_i32 s20, s16, s11
 ; GFX7-NEXT:    s_add_i32 s18, s18, s19
 ; GFX7-NEXT:    s_add_u32 s17, s17, s20
 ; GFX7-NEXT:    v_mul_hi_u32 v3, s1, v3
@@ -1130,7 +1131,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX7-NEXT:    v_mov_b32_e32 v6, s10
 ; GFX7-NEXT:    s_add_i32 s18, s18, s19
 ; GFX7-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX7-NEXT:    v_mul_hi_u32 v7, s0, v6
+; GFX7-NEXT:    v_mul_hi_u32 v7, s16, v6
 ; GFX7-NEXT:    v_add_i32_e32 v8, vcc, s18, v8
 ; GFX7-NEXT:    s_mul_i32 s17, s4, s8
 ; GFX7-NEXT:    s_mul_i32 s18, s3, s9
@@ -1157,7 +1158,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    v_mul_hi_u32 v7, v5, s8
 ; GFX7-NEXT:    s_and_b32 s19, s19, 1
-; GFX7-NEXT:    s_mul_i32 s21, s0, s12
+; GFX7-NEXT:    s_mul_i32 s21, s16, s12
 ; GFX7-NEXT:    s_add_i32 s18, s18, s19
 ; GFX7-NEXT:    s_add_u32 s17, s17, s21
 ; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
@@ -1177,7 +1178,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX7-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GFX7-NEXT:    s_mul_i32 s19, s3, s10
 ; GFX7-NEXT:    s_and_b32 s18, s18, 1
-; GFX7-NEXT:    v_mul_hi_u32 v10, s0, v9
+; GFX7-NEXT:    v_mul_hi_u32 v10, s16, v9
 ; GFX7-NEXT:    v_add_i32_e32 v7, vcc, v11, v7
 ; GFX7-NEXT:    s_add_u32 s17, s17, s19
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
@@ -1203,7 +1204,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    v_mul_hi_u32 v8, v7, s8
 ; GFX7-NEXT:    s_and_b32 s19, s19, 1
-; GFX7-NEXT:    s_mul_i32 s22, s0, s13
+; GFX7-NEXT:    s_mul_i32 s22, s16, s13
 ; GFX7-NEXT:    s_add_i32 s18, s18, s19
 ; GFX7-NEXT:    s_add_u32 s17, s17, s22
 ; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
@@ -1232,7 +1233,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX7-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX7-NEXT:    s_mul_i32 s20, s3, s11
 ; GFX7-NEXT:    s_add_i32 s18, s18, s19
-; GFX7-NEXT:    v_mul_hi_u32 v13, s0, v12
+; GFX7-NEXT:    v_mul_hi_u32 v13, s16, v12
 ; GFX7-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
 ; GFX7-NEXT:    s_add_u32 s17, s17, s20
 ; GFX7-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
@@ -1258,7 +1259,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    v_mul_hi_u32 v10, v8, s8
 ; GFX7-NEXT:    s_and_b32 s19, s19, 1
-; GFX7-NEXT:    s_mul_i32 s23, s0, s14
+; GFX7-NEXT:    s_mul_i32 s23, s16, s14
 ; GFX7-NEXT:    s_add_i32 s18, s18, s19
 ; GFX7-NEXT:    s_add_u32 s17, s17, s23
 ; GFX7-NEXT:    v_mul_hi_u32 v11, v7, s9
@@ -1280,7 +1281,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v5, v13
 ; GFX7-NEXT:    v_mov_b32_e32 v15, s13
 ; GFX7-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GFX7-NEXT:    v_mul_hi_u32 v16, s0, v15
+; GFX7-NEXT:    v_mul_hi_u32 v16, s16, v15
 ; GFX7-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
 ; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v5, v14
 ; GFX7-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
@@ -1290,40 +1291,40 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX7-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
 ; GFX7-NEXT:    s_mul_i32 s7, s7, s8
 ; GFX7-NEXT:    s_mul_i32 s17, s6, s9
-; GFX7-NEXT:    v_mov_b32_e32 v13, s14
-; GFX7-NEXT:    s_mul_i32 s16, s0, s8
 ; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; GFX7-NEXT:    s_mul_i32 s5, s5, s10
-; GFX7-NEXT:    s_mul_i32 s15, s0, s15
-; GFX7-NEXT:    v_mul_hi_u32 v13, s0, v13
-; GFX7-NEXT:    s_add_i32 s0, s7, s17
-; GFX7-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX7-NEXT:    s_mul_i32 s4, s4, s11
-; GFX7-NEXT:    s_add_i32 s0, s0, s5
-; GFX7-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
 ; GFX7-NEXT:    s_mul_i32 s11, s3, s12
-; GFX7-NEXT:    v_mov_b32_e32 v10, s6
-; GFX7-NEXT:    s_add_i32 s0, s0, s4
 ; GFX7-NEXT:    s_mul_i32 s12, s2, s13
-; GFX7-NEXT:    v_mul_hi_u32 v10, v10, s8
-; GFX7-NEXT:    s_add_i32 s0, s0, s11
 ; GFX7-NEXT:    s_mul_i32 s13, s1, s14
+; GFX7-NEXT:    v_mul_hi_u32 v11, s2, v12
+; GFX7-NEXT:    v_mul_hi_u32 v12, s1, v15
+; GFX7-NEXT:    s_add_i32 s1, s7, s17
+; GFX7-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX7-NEXT:    s_add_i32 s1, s1, s5
+; GFX7-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
+; GFX7-NEXT:    v_mov_b32_e32 v10, s6
+; GFX7-NEXT:    s_add_i32 s1, s1, s4
+; GFX7-NEXT:    v_mul_hi_u32 v10, v10, s8
+; GFX7-NEXT:    s_add_i32 s1, s1, s11
 ; GFX7-NEXT:    v_mul_hi_u32 v8, v8, s9
-; GFX7-NEXT:    s_add_i32 s0, s0, s12
+; GFX7-NEXT:    s_add_i32 s1, s1, s12
+; GFX7-NEXT:    s_mul_i32 s15, s16, s15
 ; GFX7-NEXT:    v_mul_hi_u32 v7, v7, s10
-; GFX7-NEXT:    s_add_i32 s0, s0, s13
+; GFX7-NEXT:    s_add_i32 s1, s1, s13
 ; GFX7-NEXT:    v_mul_hi_u32 v9, s3, v9
-; GFX7-NEXT:    s_add_i32 s0, s0, s15
-; GFX7-NEXT:    v_mul_hi_u32 v11, s2, v12
-; GFX7-NEXT:    v_add_i32_e32 v10, vcc, s0, v10
-; GFX7-NEXT:    v_mul_hi_u32 v12, s1, v15
+; GFX7-NEXT:    s_add_i32 s1, s1, s15
+; GFX7-NEXT:    v_add_i32_e32 v10, vcc, s1, v10
+; GFX7-NEXT:    v_mov_b32_e32 v13, s14
 ; GFX7-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
+; GFX7-NEXT:    v_mul_hi_u32 v13, s16, v13
 ; GFX7-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
 ; GFX7-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; GFX7-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
 ; GFX7-NEXT:    v_add_i32_e32 v7, vcc, v7, v12
 ; GFX7-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
 ; GFX7-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GFX7-NEXT:    s_mul_i32 s0, s0, s8
 ; GFX7-NEXT:    v_readfirstlane_b32 s1, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX7-NEXT:    v_readfirstlane_b32 s3, v2
@@ -1331,15 +1332,15 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX7-NEXT:    v_readfirstlane_b32 s5, v4
 ; GFX7-NEXT:    v_readfirstlane_b32 s6, v5
 ; GFX7-NEXT:    v_readfirstlane_b32 s7, v6
-; GFX7-NEXT:    s_mov_b32 s0, s16
 ; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_mul_i256:
 ; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_mov_b32 s16, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s8
-; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GFX8-NEXT:    v_mul_hi_u32 v0, s16, v0
 ; GFX8-NEXT:    s_mul_i32 s17, s1, s8
-; GFX8-NEXT:    s_mul_i32 s18, s0, s9
+; GFX8-NEXT:    s_mul_i32 s18, s16, s9
 ; GFX8-NEXT:    s_add_u32 s17, s17, s18
 ; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s17, v0
@@ -1352,11 +1353,11 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX8-NEXT:    s_add_u32 s17, s17, s18
 ; GFX8-NEXT:    v_mul_hi_u32 v2, v2, s8
 ; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
-; GFX8-NEXT:    s_mul_i32 s19, s0, s10
+; GFX8-NEXT:    s_mul_i32 s19, s16, s10
 ; GFX8-NEXT:    s_and_b32 s18, s18, 1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX8-NEXT:    s_add_u32 s17, s17, s19
-; GFX8-NEXT:    v_mul_hi_u32 v4, s0, v3
+; GFX8-NEXT:    v_mul_hi_u32 v4, s16, v3
 ; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    s_and_b32 s19, s19, 1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s17, v2
@@ -1380,7 +1381,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    v_mul_hi_u32 v5, v4, s8
 ; GFX8-NEXT:    s_and_b32 s19, s19, 1
-; GFX8-NEXT:    s_mul_i32 s20, s0, s11
+; GFX8-NEXT:    s_mul_i32 s20, s16, s11
 ; GFX8-NEXT:    s_add_i32 s18, s18, s19
 ; GFX8-NEXT:    s_add_u32 s17, s17, s20
 ; GFX8-NEXT:    v_mul_hi_u32 v3, s1, v3
@@ -1390,7 +1391,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX8-NEXT:    v_mov_b32_e32 v6, s10
 ; GFX8-NEXT:    s_add_i32 s18, s18, s19
 ; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX8-NEXT:    v_mul_hi_u32 v7, s0, v6
+; GFX8-NEXT:    v_mul_hi_u32 v7, s16, v6
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, s18, v8
 ; GFX8-NEXT:    s_mul_i32 s17, s4, s8
 ; GFX8-NEXT:    s_mul_i32 s18, s3, s9
@@ -1417,7 +1418,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    v_mul_hi_u32 v7, v5, s8
 ; GFX8-NEXT:    s_and_b32 s19, s19, 1
-; GFX8-NEXT:    s_mul_i32 s21, s0, s12
+; GFX8-NEXT:    s_mul_i32 s21, s16, s12
 ; GFX8-NEXT:    s_add_i32 s18, s18, s19
 ; GFX8-NEXT:    s_add_u32 s17, s17, s21
 ; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
@@ -1437,7 +1438,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GFX8-NEXT:    s_mul_i32 s19, s3, s10
 ; GFX8-NEXT:    s_and_b32 s18, s18, 1
-; GFX8-NEXT:    v_mul_hi_u32 v10, s0, v9
+; GFX8-NEXT:    v_mul_hi_u32 v10, s16, v9
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v11, v7
 ; GFX8-NEXT:    s_add_u32 s17, s17, s19
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v8
@@ -1463,7 +1464,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    v_mul_hi_u32 v8, v7, s8
 ; GFX8-NEXT:    s_and_b32 s19, s19, 1
-; GFX8-NEXT:    s_mul_i32 s22, s0, s13
+; GFX8-NEXT:    s_mul_i32 s22, s16, s13
 ; GFX8-NEXT:    s_add_i32 s18, s18, s19
 ; GFX8-NEXT:    s_add_u32 s17, s17, s22
 ; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
@@ -1492,7 +1493,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX8-NEXT:    s_mul_i32 s20, s3, s11
 ; GFX8-NEXT:    s_add_i32 s18, s18, s19
-; GFX8-NEXT:    v_mul_hi_u32 v13, s0, v12
+; GFX8-NEXT:    v_mul_hi_u32 v13, s16, v12
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v10, v8
 ; GFX8-NEXT:    s_add_u32 s17, s17, s20
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v11
@@ -1518,7 +1519,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    v_mul_hi_u32 v10, v8, s8
 ; GFX8-NEXT:    s_and_b32 s19, s19, 1
-; GFX8-NEXT:    s_mul_i32 s23, s0, s14
+; GFX8-NEXT:    s_mul_i32 s23, s16, s14
 ; GFX8-NEXT:    s_add_i32 s18, s18, s19
 ; GFX8-NEXT:    s_add_u32 s17, s17, s23
 ; GFX8-NEXT:    v_mul_hi_u32 v11, v7, s9
@@ -1540,7 +1541,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v13
 ; GFX8-NEXT:    v_mov_b32_e32 v15, s13
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GFX8-NEXT:    v_mul_hi_u32 v16, s0, v15
+; GFX8-NEXT:    v_mul_hi_u32 v16, s16, v15
 ; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v11
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v14
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
@@ -1550,40 +1551,40 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v11
 ; GFX8-NEXT:    s_mul_i32 s7, s7, s8
 ; GFX8-NEXT:    s_mul_i32 s17, s6, s9
-; GFX8-NEXT:    v_mov_b32_e32 v13, s14
-; GFX8-NEXT:    s_mul_i32 s16, s0, s8
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v6
 ; GFX8-NEXT:    s_mul_i32 s5, s5, s10
-; GFX8-NEXT:    s_mul_i32 s15, s0, s15
-; GFX8-NEXT:    v_mul_hi_u32 v13, s0, v13
-; GFX8-NEXT:    s_add_i32 s0, s7, s17
-; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX8-NEXT:    s_mul_i32 s4, s4, s11
-; GFX8-NEXT:    s_add_i32 s0, s0, s5
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v10, v6
 ; GFX8-NEXT:    s_mul_i32 s11, s3, s12
-; GFX8-NEXT:    v_mov_b32_e32 v10, s6
-; GFX8-NEXT:    s_add_i32 s0, s0, s4
 ; GFX8-NEXT:    s_mul_i32 s12, s2, s13
-; GFX8-NEXT:    v_mul_hi_u32 v10, v10, s8
-; GFX8-NEXT:    s_add_i32 s0, s0, s11
 ; GFX8-NEXT:    s_mul_i32 s13, s1, s14
+; GFX8-NEXT:    v_mul_hi_u32 v11, s2, v12
+; GFX8-NEXT:    v_mul_hi_u32 v12, s1, v15
+; GFX8-NEXT:    s_add_i32 s1, s7, s17
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX8-NEXT:    s_add_i32 s1, s1, s5
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v10, v6
+; GFX8-NEXT:    v_mov_b32_e32 v10, s6
+; GFX8-NEXT:    s_add_i32 s1, s1, s4
+; GFX8-NEXT:    v_mul_hi_u32 v10, v10, s8
+; GFX8-NEXT:    s_add_i32 s1, s1, s11
 ; GFX8-NEXT:    v_mul_hi_u32 v8, v8, s9
-; GFX8-NEXT:    s_add_i32 s0, s0, s12
+; GFX8-NEXT:    s_add_i32 s1, s1, s12
+; GFX8-NEXT:    s_mul_i32 s15, s16, s15
 ; GFX8-NEXT:    v_mul_hi_u32 v7, v7, s10
-; GFX8-NEXT:    s_add_i32 s0, s0, s13
+; GFX8-NEXT:    s_add_i32 s1, s1, s13
 ; GFX8-NEXT:    v_mul_hi_u32 v9, s3, v9
-; GFX8-NEXT:    s_add_i32 s0, s0, s15
-; GFX8-NEXT:    v_mul_hi_u32 v11, s2, v12
-; GFX8-NEXT:    v_add_u32_e32 v10, vcc, s0, v10
-; GFX8-NEXT:    v_mul_hi_u32 v12, s1, v15
+; GFX8-NEXT:    s_add_i32 s1, s1, s15
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, s1, v10
+; GFX8-NEXT:    v_mov_b32_e32 v13, s14
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v10, v8
+; GFX8-NEXT:    v_mul_hi_u32 v13, s16, v13
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v8, v7
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v9
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v11
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v12
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v13
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v7, v6
+; GFX8-NEXT:    s_mul_i32 s0, s0, s8
 ; GFX8-NEXT:    v_readfirstlane_b32 s1, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX8-NEXT:    v_readfirstlane_b32 s3, v2
@@ -1591,16 +1592,16 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX8-NEXT:    v_readfirstlane_b32 s5, v4
 ; GFX8-NEXT:    v_readfirstlane_b32 s6, v5
 ; GFX8-NEXT:    v_readfirstlane_b32 s7, v6
-; GFX8-NEXT:    s_mov_b32 s0, s16
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_mul_i256:
 ; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s16, s0
 ; GFX9-NEXT:    s_mul_i32 s17, s1, s8
-; GFX9-NEXT:    s_mul_i32 s18, s0, s9
+; GFX9-NEXT:    s_mul_i32 s18, s16, s9
 ; GFX9-NEXT:    s_add_u32 s17, s17, s18
 ; GFX9-NEXT:    s_cselect_b32 s18, 1, 0
-; GFX9-NEXT:    s_mul_hi_u32 s19, s0, s8
+; GFX9-NEXT:    s_mul_hi_u32 s19, s16, s8
 ; GFX9-NEXT:    s_and_b32 s18, s18, 1
 ; GFX9-NEXT:    s_add_u32 s17, s17, s19
 ; GFX9-NEXT:    s_cselect_b32 s19, 1, 0
@@ -1610,7 +1611,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX9-NEXT:    s_mul_i32 s20, s1, s9
 ; GFX9-NEXT:    s_add_u32 s19, s19, s20
 ; GFX9-NEXT:    s_cselect_b32 s20, 1, 0
-; GFX9-NEXT:    s_mul_i32 s21, s0, s10
+; GFX9-NEXT:    s_mul_i32 s21, s16, s10
 ; GFX9-NEXT:    s_and_b32 s20, s20, 1
 ; GFX9-NEXT:    s_add_u32 s19, s19, s21
 ; GFX9-NEXT:    s_cselect_b32 s21, 1, 0
@@ -1620,7 +1621,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX9-NEXT:    s_add_u32 s19, s19, s22
 ; GFX9-NEXT:    s_cselect_b32 s21, 1, 0
 ; GFX9-NEXT:    s_and_b32 s21, s21, 1
-; GFX9-NEXT:    s_mul_hi_u32 s23, s0, s9
+; GFX9-NEXT:    s_mul_hi_u32 s23, s16, s9
 ; GFX9-NEXT:    s_add_i32 s20, s20, s21
 ; GFX9-NEXT:    s_add_u32 s19, s19, s23
 ; GFX9-NEXT:    s_cselect_b32 s21, 1, 0
@@ -1639,7 +1640,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX9-NEXT:    s_add_u32 s19, s19, s22
 ; GFX9-NEXT:    s_cselect_b32 s22, 1, 0
 ; GFX9-NEXT:    s_and_b32 s22, s22, 1
-; GFX9-NEXT:    s_mul_i32 s23, s0, s11
+; GFX9-NEXT:    s_mul_i32 s23, s16, s11
 ; GFX9-NEXT:    s_add_i32 s21, s21, s22
 ; GFX9-NEXT:    s_add_u32 s19, s19, s23
 ; GFX9-NEXT:    s_cselect_b32 s22, 1, 0
@@ -1654,7 +1655,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX9-NEXT:    s_add_u32 s19, s19, s25
 ; GFX9-NEXT:    s_cselect_b32 s22, 1, 0
 ; GFX9-NEXT:    s_and_b32 s22, s22, 1
-; GFX9-NEXT:    s_mul_hi_u32 s26, s0, s10
+; GFX9-NEXT:    s_mul_hi_u32 s26, s16, s10
 ; GFX9-NEXT:    s_add_i32 s21, s21, s22
 ; GFX9-NEXT:    s_add_u32 s19, s19, s26
 ; GFX9-NEXT:    s_cselect_b32 s22, 1, 0
@@ -1678,7 +1679,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX9-NEXT:    s_add_u32 s20, s20, s24
 ; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
 ; GFX9-NEXT:    s_and_b32 s23, s23, 1
-; GFX9-NEXT:    s_mul_i32 s25, s0, s12
+; GFX9-NEXT:    s_mul_i32 s25, s16, s12
 ; GFX9-NEXT:    s_add_i32 s22, s22, s23
 ; GFX9-NEXT:    s_add_u32 s20, s20, s25
 ; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
@@ -1698,7 +1699,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX9-NEXT:    s_add_u32 s20, s20, s28
 ; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
 ; GFX9-NEXT:    s_and_b32 s23, s23, 1
-; GFX9-NEXT:    s_mul_hi_u32 s29, s0, s11
+; GFX9-NEXT:    s_mul_hi_u32 s29, s16, s11
 ; GFX9-NEXT:    s_add_i32 s22, s22, s23
 ; GFX9-NEXT:    s_add_u32 s20, s20, s29
 ; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
@@ -1727,7 +1728,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX9-NEXT:    s_add_u32 s21, s21, s26
 ; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
 ; GFX9-NEXT:    s_and_b32 s24, s24, 1
-; GFX9-NEXT:    s_mul_i32 s27, s0, s13
+; GFX9-NEXT:    s_mul_i32 s27, s16, s13
 ; GFX9-NEXT:    s_add_i32 s23, s23, s24
 ; GFX9-NEXT:    s_add_u32 s21, s21, s27
 ; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
@@ -1752,7 +1753,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX9-NEXT:    s_add_u32 s21, s21, s31
 ; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
 ; GFX9-NEXT:    s_and_b32 s24, s24, 1
-; GFX9-NEXT:    s_mul_hi_u32 s33, s0, s12
+; GFX9-NEXT:    s_mul_hi_u32 s33, s16, s12
 ; GFX9-NEXT:    s_add_i32 s23, s23, s24
 ; GFX9-NEXT:    s_add_u32 s21, s21, s33
 ; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
@@ -1786,7 +1787,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX9-NEXT:    s_add_u32 s22, s22, s28
 ; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
 ; GFX9-NEXT:    s_and_b32 s25, s25, 1
-; GFX9-NEXT:    s_mul_i32 s29, s0, s14
+; GFX9-NEXT:    s_mul_i32 s29, s16, s14
 ; GFX9-NEXT:    s_add_i32 s24, s24, s25
 ; GFX9-NEXT:    s_add_u32 s22, s22, s29
 ; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
@@ -1816,7 +1817,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX9-NEXT:    s_add_u32 s22, s22, s35
 ; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
 ; GFX9-NEXT:    s_and_b32 s25, s25, 1
-; GFX9-NEXT:    s_mul_hi_u32 s36, s0, s13
+; GFX9-NEXT:    s_mul_hi_u32 s36, s16, s13
 ; GFX9-NEXT:    s_add_i32 s24, s24, s25
 ; GFX9-NEXT:    s_add_u32 s22, s22, s36
 ; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
@@ -1838,7 +1839,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX9-NEXT:    s_add_i32 s7, s7, s27
 ; GFX9-NEXT:    s_mul_i32 s29, s1, s14
 ; GFX9-NEXT:    s_add_i32 s7, s7, s28
-; GFX9-NEXT:    s_mul_i32 s15, s0, s15
+; GFX9-NEXT:    s_mul_i32 s15, s16, s15
 ; GFX9-NEXT:    s_add_i32 s7, s7, s29
 ; GFX9-NEXT:    s_mul_hi_u32 s6, s6, s8
 ; GFX9-NEXT:    s_add_i32 s7, s7, s15
@@ -1852,12 +1853,11 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX9-NEXT:    s_add_i32 s3, s4, s3
 ; GFX9-NEXT:    s_mul_hi_u32 s1, s1, s13
 ; GFX9-NEXT:    s_add_i32 s2, s3, s2
-; GFX9-NEXT:    s_mul_i32 s16, s0, s8
-; GFX9-NEXT:    s_mul_hi_u32 s0, s0, s14
+; GFX9-NEXT:    s_mul_i32 s0, s0, s8
+; GFX9-NEXT:    s_mul_hi_u32 s8, s16, s14
 ; GFX9-NEXT:    s_add_i32 s1, s2, s1
-; GFX9-NEXT:    s_add_i32 s0, s1, s0
-; GFX9-NEXT:    s_add_i32 s7, s0, s24
-; GFX9-NEXT:    s_mov_b32 s0, s16
+; GFX9-NEXT:    s_add_i32 s1, s1, s8
+; GFX9-NEXT:    s_add_i32 s7, s1, s24
 ; GFX9-NEXT:    s_mov_b32 s1, s17
 ; GFX9-NEXT:    s_mov_b32 s2, s18
 ; GFX9-NEXT:    s_mov_b32 s3, s19

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 47e2475a73189..ecbc31ee2d8df 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -538,14 +538,14 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v13, v1, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v14, v4, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v12
-; GISEL-NEXT:    v_ashrrev_i32_e32 v5, 31, v7
+; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v7
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v5
-; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, v7, v5, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v6, v6, v5
-; GISEL-NEXT:    v_xor_b32_e32 v7, v7, v5
-; GISEL-NEXT:    v_xor_b32_e32 v4, v11, v8
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v4
+; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, v7, v4, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v6, v6, v4
+; GISEL-NEXT:    v_xor_b32_e32 v7, v7, v4
+; GISEL-NEXT:    v_xor_b32_e32 v5, v11, v8
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v8, v6
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v9, v7
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v10, 31, v3
@@ -565,7 +565,7 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_lo_u32 v14, v11, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v16, v11, v8
 ; GISEL-NEXT:    v_mul_lo_u32 v15, v11, v8
-; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
+; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v5
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
 ; GISEL-NEXT:    v_mul_lo_u32 v14, v9, v15
@@ -624,19 +624,19 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v11, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
 ; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v4
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v3, v8
 ; GISEL-NEXT:    v_mul_lo_u32 v12, v2, v9
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
-; GISEL-NEXT:    v_mul_hi_u32 v4, v2, v8
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
+; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v5, v2, v8
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v11, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v3, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v8, v3, v8
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v12, v4
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v12, v5
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v2, v9
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v11, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
@@ -644,14 +644,15 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; GISEL-NEXT:    v_mul_hi_u32 v9, v3, v9
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v11, v8
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT:    v_mul_lo_u32 v9, v7, v4
+; GISEL-NEXT:    v_mul_lo_u32 v9, v7, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v6, v8
-; GISEL-NEXT:    v_mul_hi_u32 v13, v6, v4
-; GISEL-NEXT:    v_mul_lo_u32 v12, v6, v4
+; GISEL-NEXT:    v_mul_hi_u32 v13, v6, v5
+; GISEL-NEXT:    v_mul_lo_u32 v12, v6, v5
+; GISEL-NEXT:    v_xor_b32_e32 v4, v10, v4
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v13
 ; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v12
@@ -665,7 +666,7 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v11, v7
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, 1, v4
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, 1, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, v9, v12, s[4:5]
 ; GISEL-NEXT:    v_addc_u32_e32 v12, vcc, 0, v8, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v7
@@ -680,8 +681,7 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v11, v3, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v3, v12, v6, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v4, v10, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
 ; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v4
 ; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v4
@@ -1063,32 +1063,32 @@ define i64 @v_sdiv_i64_pow2k_denom(i64 %num) {
 ; CHECK-LABEL: v_sdiv_i64_pow2k_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, 0x1000
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v3, 0x1000
 ; CHECK-NEXT:    v_cvt_f32_ubyte0_e32 v4, 0
 ; CHECK-NEXT:    s_movk_i32 s6, 0xf000
-; CHECK-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
-; CHECK-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v4
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
-; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; CHECK-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
-; CHECK-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v2
+; CHECK-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; CHECK-NEXT:    v_mac_f32_e32 v3, 0x4f800000, v4
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
+; CHECK-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
+; CHECK-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
 ; CHECK-NEXT:    v_trunc_f32_e32 v4, v4
-; CHECK-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v4
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; CHECK-NEXT:    v_mac_f32_e32 v3, 0xcf800000, v4
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v3
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v3
-; CHECK-NEXT:    v_mul_lo_u32 v5, -1, v2
+; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v2
+; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v2
+; CHECK-NEXT:    v_mul_lo_u32 v5, -1, v3
 ; CHECK-NEXT:    v_mul_lo_u32 v6, s6, v4
-; CHECK-NEXT:    v_mul_hi_u32 v8, s6, v2
-; CHECK-NEXT:    v_mul_lo_u32 v7, s6, v2
+; CHECK-NEXT:    v_mul_hi_u32 v8, s6, v3
+; CHECK-NEXT:    v_mul_lo_u32 v7, s6, v3
 ; CHECK-NEXT:    s_bfe_i32 s7, -1, 0x10000
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; CHECK-NEXT:    v_mul_lo_u32 v6, v4, v7
-; CHECK-NEXT:    v_mul_lo_u32 v8, v2, v5
-; CHECK-NEXT:    v_mul_hi_u32 v9, v2, v7
+; CHECK-NEXT:    v_mul_lo_u32 v8, v3, v5
+; CHECK-NEXT:    v_mul_hi_u32 v9, v3, v7
 ; CHECK-NEXT:    v_mul_hi_u32 v7, v4, v7
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
@@ -1096,7 +1096,7 @@ define i64 @v_sdiv_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v9, v4, v5
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; CHECK-NEXT:    v_mul_hi_u32 v8, v2, v5
+; CHECK-NEXT:    v_mul_hi_u32 v8, v3, v5
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
@@ -1107,18 +1107,18 @@ define i64 @v_sdiv_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
 ; CHECK-NEXT:    v_addc_u32_e64 v6, s[4:5], v4, v5, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v7, -1, v2
+; CHECK-NEXT:    v_mul_lo_u32 v7, -1, v3
 ; CHECK-NEXT:    v_mul_lo_u32 v8, s6, v6
-; CHECK-NEXT:    v_mul_hi_u32 v10, s6, v2
-; CHECK-NEXT:    v_mul_lo_u32 v9, s6, v2
+; CHECK-NEXT:    v_mul_hi_u32 v10, s6, v3
+; CHECK-NEXT:    v_mul_lo_u32 v9, s6, v3
 ; CHECK-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v5
 ; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v8
 ; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v10
 ; CHECK-NEXT:    v_mul_lo_u32 v8, v6, v9
-; CHECK-NEXT:    v_mul_lo_u32 v10, v2, v7
-; CHECK-NEXT:    v_mul_hi_u32 v5, v2, v9
+; CHECK-NEXT:    v_mul_lo_u32 v10, v3, v7
+; CHECK-NEXT:    v_mul_hi_u32 v5, v3, v9
 ; CHECK-NEXT:    v_mul_hi_u32 v9, v6, v9
 ; CHECK-NEXT:    s_movk_i32 s6, 0x1000
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v10
@@ -1127,7 +1127,7 @@ define i64 @v_sdiv_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_mul_lo_u32 v8, v6, v7
 ; CHECK-NEXT:    v_add_i32_e64 v5, s[4:5], v10, v5
-; CHECK-NEXT:    v_mul_hi_u32 v10, v2, v7
+; CHECK-NEXT:    v_mul_hi_u32 v10, v3, v7
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v10
@@ -1139,12 +1139,12 @@ define i64 @v_sdiv_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v9, v8
 ; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v7
 ; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
 ; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v5, v1, v2
+; CHECK-NEXT:    v_mul_lo_u32 v5, v1, v3
 ; CHECK-NEXT:    v_mul_lo_u32 v6, v0, v4
-; CHECK-NEXT:    v_mul_hi_u32 v7, v0, v2
-; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
+; CHECK-NEXT:    v_mul_hi_u32 v7, v0, v3
+; CHECK-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
@@ -1152,20 +1152,20 @@ define i64 @v_sdiv_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT:    v_mul_lo_u32 v7, v1, v4
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; CHECK-NEXT:    v_mul_hi_u32 v6, v0, v4
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v7, v3
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; CHECK-NEXT:    v_mul_hi_u32 v4, v1, v4
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
-; CHECK-NEXT:    v_mul_lo_u32 v5, 0, v2
+; CHECK-NEXT:    v_mul_lo_u32 v5, 0, v3
 ; CHECK-NEXT:    v_mul_lo_u32 v6, s6, v4
-; CHECK-NEXT:    v_mul_hi_u32 v8, s6, v2
-; CHECK-NEXT:    v_mul_lo_u32 v7, s6, v2
+; CHECK-NEXT:    v_mul_hi_u32 v8, s6, v3
+; CHECK-NEXT:    v_mul_lo_u32 v7, s6, v3
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v7
@@ -1178,7 +1178,7 @@ define i64 @v_sdiv_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[4:5]
 ; CHECK-NEXT:    v_mov_b32_e32 v7, s7
 ; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v6
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, 1, v2
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, 1, v3
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[4:5]
 ; CHECK-NEXT:    v_addc_u32_e32 v7, vcc, 0, v4, vcc
 ; CHECK-NEXT:    s_bfe_i32 s4, -1, 0x10000
@@ -1193,12 +1193,12 @@ define i64 @v_sdiv_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v6, v1, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v3
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v3
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
-; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v2
+; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v2
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
+; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = sdiv i64 %num, 4096
   ret i64 %result
@@ -1217,73 +1217,73 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    s_mov_b32 s7, s6
 ; GISEL-NEXT:    s_addc_u32 s5, 0, 0
 ; GISEL-NEXT:    s_xor_b64 s[8:9], s[4:5], s[6:7]
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, s8
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s9
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s8
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, s9
 ; GISEL-NEXT:    s_sub_u32 s11, 0, s8
 ; GISEL-NEXT:    s_cselect_b32 s4, 1, 0
 ; GISEL-NEXT:    s_and_b32 s4, s4, 1
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
 ; GISEL-NEXT:    s_cmp_lg_u32 s4, 0
 ; GISEL-NEXT:    s_subb_u32 s12, 0, s9
-; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
-; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT:    v_trunc_f32_e32 v5, v5
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
+; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
+; GISEL-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
+; GISEL-NEXT:    v_trunc_f32_e32 v6, v6
+; GISEL-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v6
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT:    v_mul_lo_u32 v7, s12, v4
-; GISEL-NEXT:    v_mul_lo_u32 v8, s11, v5
-; GISEL-NEXT:    v_mul_hi_u32 v10, s11, v4
-; GISEL-NEXT:    v_mul_lo_u32 v9, s11, v4
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT:    v_mul_lo_u32 v7, s12, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, s11, v6
+; GISEL-NEXT:    v_mul_hi_u32 v10, s11, v5
+; GISEL-NEXT:    v_mul_lo_u32 v9, s11, v5
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
-; GISEL-NEXT:    v_mul_lo_u32 v8, v5, v9
-; GISEL-NEXT:    v_mul_lo_u32 v10, v4, v7
-; GISEL-NEXT:    v_mul_hi_u32 v11, v4, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v5, v9
-; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v6
+; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v9
+; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v7
+; GISEL-NEXT:    v_mul_hi_u32 v11, v5, v9
+; GISEL-NEXT:    v_mul_hi_u32 v9, v6, v9
+; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v7
+; GISEL-NEXT:    v_mul_lo_u32 v11, v6, v7
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
-; GISEL-NEXT:    v_mul_hi_u32 v10, v4, v7
+; GISEL-NEXT:    v_mul_hi_u32 v10, v5, v7
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v7
+; GISEL-NEXT:    v_mul_hi_u32 v7, v6, v7
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT:    v_addc_u32_e64 v8, s[4:5], v5, v7, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v9, s12, v4
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT:    v_addc_u32_e64 v8, s[4:5], v6, v7, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v9, s12, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v10, s11, v8
-; GISEL-NEXT:    v_mul_hi_u32 v12, s11, v4
-; GISEL-NEXT:    v_mul_lo_u32 v11, s11, v4
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v7
+; GISEL-NEXT:    v_mul_hi_u32 v12, s11, v5
+; GISEL-NEXT:    v_mul_lo_u32 v11, s11, v5
+; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v7
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v11
-; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v9
-; GISEL-NEXT:    v_mul_hi_u32 v7, v4, v11
+; GISEL-NEXT:    v_mul_lo_u32 v12, v5, v9
+; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v11
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v8, v11
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v4
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v9
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v12, v7
-; GISEL-NEXT:    v_mul_hi_u32 v12, v4, v9
+; GISEL-NEXT:    v_mul_hi_u32 v12, v5, v9
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
@@ -1294,35 +1294,35 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v10
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v8, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v7, v1, v4
-; GISEL-NEXT:    v_mul_lo_u32 v8, v0, v5
-; GISEL-NEXT:    v_mul_hi_u32 v10, v0, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, v1, v4
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v8, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, v1, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, v0, v6
+; GISEL-NEXT:    v_mul_hi_u32 v10, v0, v5
+; GISEL-NEXT:    v_mul_hi_u32 v5, v1, v5
 ; GISEL-NEXT:    v_mov_b32_e32 v9, s9
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, v1, v5
+; GISEL-NEXT:    v_mul_lo_u32 v10, v1, v6
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT:    v_mul_hi_u32 v8, v0, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
+; GISEL-NEXT:    v_mul_hi_u32 v8, v0, v6
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
-; GISEL-NEXT:    v_mul_hi_u32 v5, v1, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
+; GISEL-NEXT:    v_mul_hi_u32 v6, v1, v6
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; GISEL-NEXT:    v_mul_lo_u32 v7, s9, v4
-; GISEL-NEXT:    v_mul_lo_u32 v8, s8, v5
-; GISEL-NEXT:    v_mul_hi_u32 v11, s8, v4
-; GISEL-NEXT:    v_mul_lo_u32 v10, s8, v4
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT:    v_mul_lo_u32 v7, s9, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, s8, v6
+; GISEL-NEXT:    v_mul_hi_u32 v11, s8, v5
+; GISEL-NEXT:    v_mul_lo_u32 v10, s8, v5
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v10
@@ -1336,8 +1336,8 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], s9, v8
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
-; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v5, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, 1, v5
+; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v6, vcc
 ; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, v7, v10, s[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
@@ -1357,79 +1357,79 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v9, v10, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
 ; GISEL-NEXT:    s_xor_b64 s[6:7], s[4:5], s[6:7]
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, s6
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s7
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s6
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, s7
 ; GISEL-NEXT:    s_sub_u32 s8, 0, s6
 ; GISEL-NEXT:    s_cselect_b32 s4, 1, 0
 ; GISEL-NEXT:    s_and_b32 s4, s4, 1
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
 ; GISEL-NEXT:    s_cmp_lg_u32 s4, 0
 ; GISEL-NEXT:    s_subb_u32 s9, 0, s7
-; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v6
-; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT:    v_trunc_f32_e32 v5, v5
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
+; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
+; GISEL-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
+; GISEL-NEXT:    v_trunc_f32_e32 v6, v6
+; GISEL-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v6
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT:    v_mul_lo_u32 v7, s9, v4
-; GISEL-NEXT:    v_mul_lo_u32 v8, s8, v5
-; GISEL-NEXT:    v_mul_hi_u32 v10, s8, v4
-; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v6, vcc
-; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v3
-; GISEL-NEXT:    v_mul_lo_u32 v9, s8, v4
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
-; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v6, vcc
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v4
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT:    v_mul_lo_u32 v7, s9, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, s8, v6
+; GISEL-NEXT:    v_mul_hi_u32 v10, s8, v5
+; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
+; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; GISEL-NEXT:    v_mul_lo_u32 v9, s8, v5
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
-; GISEL-NEXT:    v_mul_lo_u32 v8, v5, v9
-; GISEL-NEXT:    v_mul_lo_u32 v10, v4, v7
-; GISEL-NEXT:    v_mul_hi_u32 v11, v4, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v5, v9
-; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v6
+; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v9
+; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v7
+; GISEL-NEXT:    v_mul_hi_u32 v11, v5, v9
+; GISEL-NEXT:    v_mul_hi_u32 v9, v6, v9
+; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v4
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v7
+; GISEL-NEXT:    v_mul_lo_u32 v11, v6, v7
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
-; GISEL-NEXT:    v_mul_hi_u32 v10, v4, v7
+; GISEL-NEXT:    v_mul_hi_u32 v10, v5, v7
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v7
+; GISEL-NEXT:    v_mul_hi_u32 v7, v6, v7
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT:    v_addc_u32_e64 v8, s[4:5], v5, v7, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v9, s9, v4
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT:    v_addc_u32_e64 v8, s[4:5], v6, v7, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v9, s9, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v10, s8, v8
-; GISEL-NEXT:    v_mul_hi_u32 v12, s8, v4
-; GISEL-NEXT:    v_mul_lo_u32 v11, s8, v4
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v7
+; GISEL-NEXT:    v_mul_hi_u32 v12, s8, v5
+; GISEL-NEXT:    v_mul_lo_u32 v11, s8, v5
+; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v7
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v11
-; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v9
-; GISEL-NEXT:    v_mul_hi_u32 v7, v4, v11
+; GISEL-NEXT:    v_mul_lo_u32 v12, v5, v9
+; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v11
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v8, v11
-; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v6
+; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v4
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v9
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v12, v7
-; GISEL-NEXT:    v_mul_hi_u32 v12, v4, v9
+; GISEL-NEXT:    v_mul_hi_u32 v12, v5, v9
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
@@ -1440,35 +1440,35 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v10
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v8, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v7, v3, v4
-; GISEL-NEXT:    v_mul_lo_u32 v8, v2, v5
-; GISEL-NEXT:    v_mul_hi_u32 v10, v2, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, v3, v4
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v8, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, v3, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, v2, v6
+; GISEL-NEXT:    v_mul_hi_u32 v10, v2, v5
+; GISEL-NEXT:    v_mul_hi_u32 v5, v3, v5
 ; GISEL-NEXT:    v_mov_b32_e32 v9, s7
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, v3, v5
+; GISEL-NEXT:    v_mul_lo_u32 v10, v3, v6
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT:    v_mul_hi_u32 v8, v2, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
+; GISEL-NEXT:    v_mul_hi_u32 v8, v2, v6
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
-; GISEL-NEXT:    v_mul_hi_u32 v5, v3, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
+; GISEL-NEXT:    v_mul_hi_u32 v6, v3, v6
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; GISEL-NEXT:    v_mul_lo_u32 v7, s7, v4
-; GISEL-NEXT:    v_mul_lo_u32 v8, s6, v5
-; GISEL-NEXT:    v_mul_hi_u32 v11, s6, v4
-; GISEL-NEXT:    v_mul_lo_u32 v10, s6, v4
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT:    v_mul_lo_u32 v7, s7, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, s6, v6
+; GISEL-NEXT:    v_mul_hi_u32 v11, s6, v5
+; GISEL-NEXT:    v_mul_lo_u32 v10, s6, v5
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
 ; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v10
@@ -1482,8 +1482,8 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], s7, v8
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
-; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v5, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, 1, v5
+; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v6, vcc
 ; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, v7, v10, s[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
@@ -1497,34 +1497,34 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v8, v3, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v3, v9, v10, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v6
-; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v6
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v6, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v4
+; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v4
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v4
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_sdiv_v2i64_pow2k_denom:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    v_cvt_f32_u32_e32 v4, 0x1000
+; CGP-NEXT:    v_cvt_f32_u32_e32 v5, 0x1000
 ; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v6, 0
 ; CGP-NEXT:    s_movk_i32 s6, 0xf000
-; CGP-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
-; CGP-NEXT:    v_mov_b32_e32 v7, v4
+; CGP-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
+; CGP-NEXT:    v_mov_b32_e32 v7, v5
 ; CGP-NEXT:    v_mac_f32_e32 v7, 0x4f800000, v6
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v7, v7
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v5
-; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v5, vcc
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
 ; CGP-NEXT:    v_mul_f32_e32 v7, 0x5f7ffffc, v7
 ; CGP-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v7
 ; CGP-NEXT:    v_trunc_f32_e32 v8, v8
 ; CGP-NEXT:    v_mac_f32_e32 v7, 0xcf800000, v8
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; CGP-NEXT:    v_xor_b32_e32 v0, v0, v5
-; CGP-NEXT:    v_xor_b32_e32 v1, v1, v5
+; CGP-NEXT:    v_xor_b32_e32 v0, v0, v4
+; CGP-NEXT:    v_xor_b32_e32 v1, v1, v4
 ; CGP-NEXT:    v_mul_lo_u32 v9, -1, v7
 ; CGP-NEXT:    v_mul_lo_u32 v10, s6, v8
 ; CGP-NEXT:    v_mul_hi_u32 v12, s6, v7
@@ -1567,7 +1567,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v14, v7, v11
 ; CGP-NEXT:    v_mul_hi_u32 v9, v7, v13
 ; CGP-NEXT:    v_mul_hi_u32 v13, v10, v13
-; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v6
+; CGP-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v12, v9
@@ -1592,7 +1592,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v10, v0, v8
 ; CGP-NEXT:    v_mul_hi_u32 v11, v0, v7
 ; CGP-NEXT:    v_mul_hi_u32 v7, v1, v7
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v5, v5
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
@@ -1614,7 +1614,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v10, s7, v8
 ; CGP-NEXT:    v_mul_hi_u32 v12, s7, v7
 ; CGP-NEXT:    v_mul_lo_u32 v11, s7, v7
-; CGP-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; CGP-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v11
@@ -1643,33 +1643,33 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v11, v12, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc
-; CGP-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v4
+; CGP-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v5
 ; CGP-NEXT:    v_trunc_f32_e32 v7, v7
-; CGP-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v7
-; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; CGP-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v7
+; CGP-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
 ; CGP-NEXT:    v_ashrrev_i32_e32 v6, 31, v3
-; CGP-NEXT:    v_mul_lo_u32 v8, -1, v4
+; CGP-NEXT:    v_mul_lo_u32 v8, -1, v5
 ; CGP-NEXT:    v_mul_lo_u32 v9, s6, v7
-; CGP-NEXT:    v_mul_hi_u32 v11, s6, v4
-; CGP-NEXT:    v_mul_lo_u32 v10, s6, v4
+; CGP-NEXT:    v_mul_hi_u32 v11, s6, v5
+; CGP-NEXT:    v_mul_lo_u32 v10, s6, v5
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
 ; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v6, vcc
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
 ; CGP-NEXT:    v_mul_lo_u32 v9, v7, v10
-; CGP-NEXT:    v_mul_lo_u32 v11, v4, v8
-; CGP-NEXT:    v_mul_hi_u32 v12, v4, v10
+; CGP-NEXT:    v_mul_lo_u32 v11, v5, v8
+; CGP-NEXT:    v_mul_hi_u32 v12, v5, v10
 ; CGP-NEXT:    v_mul_hi_u32 v10, v7, v10
-; CGP-NEXT:    v_xor_b32_e32 v0, v0, v5
+; CGP-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v12, v7, v8
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
-; CGP-NEXT:    v_mul_hi_u32 v11, v4, v8
+; CGP-NEXT:    v_mul_hi_u32 v11, v5, v8
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
@@ -1680,18 +1680,18 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
 ; CGP-NEXT:    v_addc_u32_e64 v9, s[4:5], v7, v8, vcc
-; CGP-NEXT:    v_mul_lo_u32 v10, -1, v4
+; CGP-NEXT:    v_mul_lo_u32 v10, -1, v5
 ; CGP-NEXT:    v_mul_lo_u32 v11, s6, v9
-; CGP-NEXT:    v_mul_hi_u32 v13, s6, v4
-; CGP-NEXT:    v_mul_lo_u32 v12, s6, v4
+; CGP-NEXT:    v_mul_hi_u32 v13, s6, v5
+; CGP-NEXT:    v_mul_lo_u32 v12, s6, v5
 ; CGP-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v8
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
 ; CGP-NEXT:    v_mul_lo_u32 v11, v9, v12
-; CGP-NEXT:    v_mul_lo_u32 v13, v4, v10
-; CGP-NEXT:    v_mul_hi_u32 v8, v4, v12
+; CGP-NEXT:    v_mul_lo_u32 v13, v5, v10
+; CGP-NEXT:    v_mul_hi_u32 v8, v5, v12
 ; CGP-NEXT:    v_mul_hi_u32 v12, v9, v12
 ; CGP-NEXT:    v_xor_b32_e32 v2, v2, v6
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
@@ -1700,7 +1700,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
 ; CGP-NEXT:    v_mul_lo_u32 v11, v9, v10
 ; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v13, v8
-; CGP-NEXT:    v_mul_hi_u32 v13, v4, v10
+; CGP-NEXT:    v_mul_hi_u32 v13, v5, v10
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
@@ -1712,30 +1712,30 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v12, v11
 ; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
 ; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v9, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; CGP-NEXT:    v_xor_b32_e32 v3, v3, v6
 ; CGP-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; CGP-NEXT:    v_xor_b32_e32 v1, v1, v5
-; CGP-NEXT:    v_mul_lo_u32 v8, v3, v4
+; CGP-NEXT:    v_xor_b32_e32 v1, v1, v4
+; CGP-NEXT:    v_mul_lo_u32 v8, v3, v5
 ; CGP-NEXT:    v_mul_lo_u32 v9, v2, v7
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
-; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
-; CGP-NEXT:    v_mul_hi_u32 v5, v2, v4
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
+; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
+; CGP-NEXT:    v_mul_hi_u32 v4, v2, v5
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
+; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v8, v3, v7
-; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
+; CGP-NEXT:    v_mul_hi_u32 v5, v3, v5
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
 ; CGP-NEXT:    v_mul_hi_u32 v9, v2, v7
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
 ; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; CGP-NEXT:    v_mul_hi_u32 v7, v3, v7
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
@@ -1786,32 +1786,32 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) {
 ; CHECK-LABEL: v_sdiv_i64_oddk_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, 0x12d8fb
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v3, 0x12d8fb
 ; CHECK-NEXT:    v_cvt_f32_ubyte0_e32 v4, 0
 ; CHECK-NEXT:    s_mov_b32 s6, 0xffed2705
-; CHECK-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
-; CHECK-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v4
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
-; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; CHECK-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
-; CHECK-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v2
+; CHECK-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; CHECK-NEXT:    v_mac_f32_e32 v3, 0x4f800000, v4
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
+; CHECK-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
+; CHECK-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
 ; CHECK-NEXT:    v_trunc_f32_e32 v4, v4
-; CHECK-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v4
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; CHECK-NEXT:    v_mac_f32_e32 v3, 0xcf800000, v4
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v3
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v3
-; CHECK-NEXT:    v_mul_lo_u32 v5, -1, v2
+; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v2
+; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v2
+; CHECK-NEXT:    v_mul_lo_u32 v5, -1, v3
 ; CHECK-NEXT:    v_mul_lo_u32 v6, s6, v4
-; CHECK-NEXT:    v_mul_hi_u32 v8, s6, v2
-; CHECK-NEXT:    v_mul_lo_u32 v7, s6, v2
+; CHECK-NEXT:    v_mul_hi_u32 v8, s6, v3
+; CHECK-NEXT:    v_mul_lo_u32 v7, s6, v3
 ; CHECK-NEXT:    s_bfe_i32 s7, -1, 0x10000
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; CHECK-NEXT:    v_mul_lo_u32 v6, v4, v7
-; CHECK-NEXT:    v_mul_lo_u32 v8, v2, v5
-; CHECK-NEXT:    v_mul_hi_u32 v9, v2, v7
+; CHECK-NEXT:    v_mul_lo_u32 v8, v3, v5
+; CHECK-NEXT:    v_mul_hi_u32 v9, v3, v7
 ; CHECK-NEXT:    v_mul_hi_u32 v7, v4, v7
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
@@ -1819,7 +1819,7 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v9, v4, v5
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; CHECK-NEXT:    v_mul_hi_u32 v8, v2, v5
+; CHECK-NEXT:    v_mul_hi_u32 v8, v3, v5
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
@@ -1830,18 +1830,18 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
 ; CHECK-NEXT:    v_addc_u32_e64 v6, s[4:5], v4, v5, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v7, -1, v2
+; CHECK-NEXT:    v_mul_lo_u32 v7, -1, v3
 ; CHECK-NEXT:    v_mul_lo_u32 v8, s6, v6
-; CHECK-NEXT:    v_mul_hi_u32 v10, s6, v2
-; CHECK-NEXT:    v_mul_lo_u32 v9, s6, v2
+; CHECK-NEXT:    v_mul_hi_u32 v10, s6, v3
+; CHECK-NEXT:    v_mul_lo_u32 v9, s6, v3
 ; CHECK-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v5
 ; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v8
 ; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v10
 ; CHECK-NEXT:    v_mul_lo_u32 v8, v6, v9
-; CHECK-NEXT:    v_mul_lo_u32 v10, v2, v7
-; CHECK-NEXT:    v_mul_hi_u32 v5, v2, v9
+; CHECK-NEXT:    v_mul_lo_u32 v10, v3, v7
+; CHECK-NEXT:    v_mul_hi_u32 v5, v3, v9
 ; CHECK-NEXT:    v_mul_hi_u32 v9, v6, v9
 ; CHECK-NEXT:    s_mov_b32 s6, 0x12d8fb
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v10
@@ -1850,7 +1850,7 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_mul_lo_u32 v8, v6, v7
 ; CHECK-NEXT:    v_add_i32_e64 v5, s[4:5], v10, v5
-; CHECK-NEXT:    v_mul_hi_u32 v10, v2, v7
+; CHECK-NEXT:    v_mul_hi_u32 v10, v3, v7
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v10
@@ -1862,12 +1862,12 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v9, v8
 ; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v7
 ; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
 ; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v5, v1, v2
+; CHECK-NEXT:    v_mul_lo_u32 v5, v1, v3
 ; CHECK-NEXT:    v_mul_lo_u32 v6, v0, v4
-; CHECK-NEXT:    v_mul_hi_u32 v7, v0, v2
-; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
+; CHECK-NEXT:    v_mul_hi_u32 v7, v0, v3
+; CHECK-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
@@ -1875,20 +1875,20 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_mul_lo_u32 v7, v1, v4
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; CHECK-NEXT:    v_mul_hi_u32 v6, v0, v4
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v7, v3
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; CHECK-NEXT:    v_mul_hi_u32 v4, v1, v4
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
-; CHECK-NEXT:    v_mul_lo_u32 v5, 0, v2
+; CHECK-NEXT:    v_mul_lo_u32 v5, 0, v3
 ; CHECK-NEXT:    v_mul_lo_u32 v6, s6, v4
-; CHECK-NEXT:    v_mul_hi_u32 v8, s6, v2
-; CHECK-NEXT:    v_mul_lo_u32 v7, s6, v2
+; CHECK-NEXT:    v_mul_hi_u32 v8, s6, v3
+; CHECK-NEXT:    v_mul_lo_u32 v7, s6, v3
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v7
@@ -1901,7 +1901,7 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[4:5]
 ; CHECK-NEXT:    v_mov_b32_e32 v7, s7
 ; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v6
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, 1, v2
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, 1, v3
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[4:5]
 ; CHECK-NEXT:    v_addc_u32_e32 v7, vcc, 0, v4, vcc
 ; CHECK-NEXT:    s_bfe_i32 s4, -1, 0x10000
@@ -1916,12 +1916,12 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v6, v1, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v3
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v3
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
-; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v2
+; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v2
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
+; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = sdiv i64 %num, 1235195
   ret i64 %result
@@ -1940,73 +1940,73 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    s_mov_b32 s7, s6
 ; GISEL-NEXT:    s_addc_u32 s5, 0, 0
 ; GISEL-NEXT:    s_xor_b64 s[8:9], s[4:5], s[6:7]
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, s8
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s9
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s8
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, s9
 ; GISEL-NEXT:    s_sub_u32 s11, 0, s8
 ; GISEL-NEXT:    s_cselect_b32 s4, 1, 0
 ; GISEL-NEXT:    s_and_b32 s4, s4, 1
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
 ; GISEL-NEXT:    s_cmp_lg_u32 s4, 0
 ; GISEL-NEXT:    s_subb_u32 s12, 0, s9
-; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
-; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT:    v_trunc_f32_e32 v5, v5
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
+; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
+; GISEL-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
+; GISEL-NEXT:    v_trunc_f32_e32 v6, v6
+; GISEL-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v6
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT:    v_mul_lo_u32 v7, s12, v4
-; GISEL-NEXT:    v_mul_lo_u32 v8, s11, v5
-; GISEL-NEXT:    v_mul_hi_u32 v10, s11, v4
-; GISEL-NEXT:    v_mul_lo_u32 v9, s11, v4
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT:    v_mul_lo_u32 v7, s12, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, s11, v6
+; GISEL-NEXT:    v_mul_hi_u32 v10, s11, v5
+; GISEL-NEXT:    v_mul_lo_u32 v9, s11, v5
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
-; GISEL-NEXT:    v_mul_lo_u32 v8, v5, v9
-; GISEL-NEXT:    v_mul_lo_u32 v10, v4, v7
-; GISEL-NEXT:    v_mul_hi_u32 v11, v4, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v5, v9
-; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v6
+; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v9
+; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v7
+; GISEL-NEXT:    v_mul_hi_u32 v11, v5, v9
+; GISEL-NEXT:    v_mul_hi_u32 v9, v6, v9
+; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v7
+; GISEL-NEXT:    v_mul_lo_u32 v11, v6, v7
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
-; GISEL-NEXT:    v_mul_hi_u32 v10, v4, v7
+; GISEL-NEXT:    v_mul_hi_u32 v10, v5, v7
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v7
+; GISEL-NEXT:    v_mul_hi_u32 v7, v6, v7
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT:    v_addc_u32_e64 v8, s[4:5], v5, v7, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v9, s12, v4
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT:    v_addc_u32_e64 v8, s[4:5], v6, v7, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v9, s12, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v10, s11, v8
-; GISEL-NEXT:    v_mul_hi_u32 v12, s11, v4
-; GISEL-NEXT:    v_mul_lo_u32 v11, s11, v4
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v7
+; GISEL-NEXT:    v_mul_hi_u32 v12, s11, v5
+; GISEL-NEXT:    v_mul_lo_u32 v11, s11, v5
+; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v7
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v11
-; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v9
-; GISEL-NEXT:    v_mul_hi_u32 v7, v4, v11
+; GISEL-NEXT:    v_mul_lo_u32 v12, v5, v9
+; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v11
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v8, v11
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v4
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v9
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v12, v7
-; GISEL-NEXT:    v_mul_hi_u32 v12, v4, v9
+; GISEL-NEXT:    v_mul_hi_u32 v12, v5, v9
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
@@ -2017,35 +2017,35 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v10
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v8, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v7, v1, v4
-; GISEL-NEXT:    v_mul_lo_u32 v8, v0, v5
-; GISEL-NEXT:    v_mul_hi_u32 v10, v0, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, v1, v4
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v8, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, v1, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, v0, v6
+; GISEL-NEXT:    v_mul_hi_u32 v10, v0, v5
+; GISEL-NEXT:    v_mul_hi_u32 v5, v1, v5
 ; GISEL-NEXT:    v_mov_b32_e32 v9, s9
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, v1, v5
+; GISEL-NEXT:    v_mul_lo_u32 v10, v1, v6
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT:    v_mul_hi_u32 v8, v0, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
+; GISEL-NEXT:    v_mul_hi_u32 v8, v0, v6
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
-; GISEL-NEXT:    v_mul_hi_u32 v5, v1, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
+; GISEL-NEXT:    v_mul_hi_u32 v6, v1, v6
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; GISEL-NEXT:    v_mul_lo_u32 v7, s9, v4
-; GISEL-NEXT:    v_mul_lo_u32 v8, s8, v5
-; GISEL-NEXT:    v_mul_hi_u32 v11, s8, v4
-; GISEL-NEXT:    v_mul_lo_u32 v10, s8, v4
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT:    v_mul_lo_u32 v7, s9, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, s8, v6
+; GISEL-NEXT:    v_mul_hi_u32 v11, s8, v5
+; GISEL-NEXT:    v_mul_lo_u32 v10, s8, v5
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v10
@@ -2059,8 +2059,8 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], s9, v8
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
-; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v5, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, 1, v5
+; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v6, vcc
 ; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, v7, v10, s[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
@@ -2080,79 +2080,79 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v9, v10, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
 ; GISEL-NEXT:    s_xor_b64 s[6:7], s[4:5], s[6:7]
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, s6
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s7
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s6
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, s7
 ; GISEL-NEXT:    s_sub_u32 s8, 0, s6
 ; GISEL-NEXT:    s_cselect_b32 s4, 1, 0
 ; GISEL-NEXT:    s_and_b32 s4, s4, 1
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
 ; GISEL-NEXT:    s_cmp_lg_u32 s4, 0
 ; GISEL-NEXT:    s_subb_u32 s9, 0, s7
-; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v6
-; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT:    v_trunc_f32_e32 v5, v5
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
+; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
+; GISEL-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
+; GISEL-NEXT:    v_trunc_f32_e32 v6, v6
+; GISEL-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v6
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT:    v_mul_lo_u32 v7, s9, v4
-; GISEL-NEXT:    v_mul_lo_u32 v8, s8, v5
-; GISEL-NEXT:    v_mul_hi_u32 v10, s8, v4
-; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v6, vcc
-; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v3
-; GISEL-NEXT:    v_mul_lo_u32 v9, s8, v4
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
-; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v6, vcc
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v4
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT:    v_mul_lo_u32 v7, s9, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, s8, v6
+; GISEL-NEXT:    v_mul_hi_u32 v10, s8, v5
+; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
+; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; GISEL-NEXT:    v_mul_lo_u32 v9, s8, v5
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
-; GISEL-NEXT:    v_mul_lo_u32 v8, v5, v9
-; GISEL-NEXT:    v_mul_lo_u32 v10, v4, v7
-; GISEL-NEXT:    v_mul_hi_u32 v11, v4, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v5, v9
-; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v6
+; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v9
+; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v7
+; GISEL-NEXT:    v_mul_hi_u32 v11, v5, v9
+; GISEL-NEXT:    v_mul_hi_u32 v9, v6, v9
+; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v4
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v7
+; GISEL-NEXT:    v_mul_lo_u32 v11, v6, v7
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
-; GISEL-NEXT:    v_mul_hi_u32 v10, v4, v7
+; GISEL-NEXT:    v_mul_hi_u32 v10, v5, v7
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v7
+; GISEL-NEXT:    v_mul_hi_u32 v7, v6, v7
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT:    v_addc_u32_e64 v8, s[4:5], v5, v7, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v9, s9, v4
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT:    v_addc_u32_e64 v8, s[4:5], v6, v7, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v9, s9, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v10, s8, v8
-; GISEL-NEXT:    v_mul_hi_u32 v12, s8, v4
-; GISEL-NEXT:    v_mul_lo_u32 v11, s8, v4
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v7
+; GISEL-NEXT:    v_mul_hi_u32 v12, s8, v5
+; GISEL-NEXT:    v_mul_lo_u32 v11, s8, v5
+; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v7
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v11
-; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v9
-; GISEL-NEXT:    v_mul_hi_u32 v7, v4, v11
+; GISEL-NEXT:    v_mul_lo_u32 v12, v5, v9
+; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v11
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v8, v11
-; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v6
+; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v4
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v9
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v12, v7
-; GISEL-NEXT:    v_mul_hi_u32 v12, v4, v9
+; GISEL-NEXT:    v_mul_hi_u32 v12, v5, v9
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
@@ -2163,35 +2163,35 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v10
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v8, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v7, v3, v4
-; GISEL-NEXT:    v_mul_lo_u32 v8, v2, v5
-; GISEL-NEXT:    v_mul_hi_u32 v10, v2, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, v3, v4
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v8, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, v3, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, v2, v6
+; GISEL-NEXT:    v_mul_hi_u32 v10, v2, v5
+; GISEL-NEXT:    v_mul_hi_u32 v5, v3, v5
 ; GISEL-NEXT:    v_mov_b32_e32 v9, s7
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, v3, v5
+; GISEL-NEXT:    v_mul_lo_u32 v10, v3, v6
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT:    v_mul_hi_u32 v8, v2, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
+; GISEL-NEXT:    v_mul_hi_u32 v8, v2, v6
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
-; GISEL-NEXT:    v_mul_hi_u32 v5, v3, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
+; GISEL-NEXT:    v_mul_hi_u32 v6, v3, v6
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; GISEL-NEXT:    v_mul_lo_u32 v7, s7, v4
-; GISEL-NEXT:    v_mul_lo_u32 v8, s6, v5
-; GISEL-NEXT:    v_mul_hi_u32 v11, s6, v4
-; GISEL-NEXT:    v_mul_lo_u32 v10, s6, v4
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT:    v_mul_lo_u32 v7, s7, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, s6, v6
+; GISEL-NEXT:    v_mul_hi_u32 v11, s6, v5
+; GISEL-NEXT:    v_mul_lo_u32 v10, s6, v5
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
 ; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v10
@@ -2205,8 +2205,8 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], s7, v8
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
-; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v5, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, 1, v5
+; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v6, vcc
 ; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, v7, v10, s[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
@@ -2220,34 +2220,34 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v8, v3, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v3, v9, v10, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v6
-; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v6
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v6, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v4
+; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v4
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v4
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_sdiv_v2i64_oddk_denom:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    v_cvt_f32_u32_e32 v4, 0x12d8fb
+; CGP-NEXT:    v_cvt_f32_u32_e32 v5, 0x12d8fb
 ; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v6, 0
 ; CGP-NEXT:    s_mov_b32 s6, 0xffed2705
-; CGP-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
-; CGP-NEXT:    v_mov_b32_e32 v7, v4
+; CGP-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
+; CGP-NEXT:    v_mov_b32_e32 v7, v5
 ; CGP-NEXT:    v_mac_f32_e32 v7, 0x4f800000, v6
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v7, v7
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v5
-; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v5, vcc
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
 ; CGP-NEXT:    v_mul_f32_e32 v7, 0x5f7ffffc, v7
 ; CGP-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v7
 ; CGP-NEXT:    v_trunc_f32_e32 v8, v8
 ; CGP-NEXT:    v_mac_f32_e32 v7, 0xcf800000, v8
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; CGP-NEXT:    v_xor_b32_e32 v0, v0, v5
-; CGP-NEXT:    v_xor_b32_e32 v1, v1, v5
+; CGP-NEXT:    v_xor_b32_e32 v0, v0, v4
+; CGP-NEXT:    v_xor_b32_e32 v1, v1, v4
 ; CGP-NEXT:    v_mul_lo_u32 v9, -1, v7
 ; CGP-NEXT:    v_mul_lo_u32 v10, s6, v8
 ; CGP-NEXT:    v_mul_hi_u32 v12, s6, v7
@@ -2290,7 +2290,7 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v14, v7, v11
 ; CGP-NEXT:    v_mul_hi_u32 v9, v7, v13
 ; CGP-NEXT:    v_mul_hi_u32 v13, v10, v13
-; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v6
+; CGP-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v12, v9
@@ -2315,7 +2315,7 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v10, v0, v8
 ; CGP-NEXT:    v_mul_hi_u32 v11, v0, v7
 ; CGP-NEXT:    v_mul_hi_u32 v7, v1, v7
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v5, v5
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
@@ -2337,7 +2337,7 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v10, s7, v8
 ; CGP-NEXT:    v_mul_hi_u32 v12, s7, v7
 ; CGP-NEXT:    v_mul_lo_u32 v11, s7, v7
-; CGP-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; CGP-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v11
@@ -2366,33 +2366,33 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v11, v12, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc
-; CGP-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v4
+; CGP-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v5
 ; CGP-NEXT:    v_trunc_f32_e32 v7, v7
-; CGP-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v7
-; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; CGP-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v7
+; CGP-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
 ; CGP-NEXT:    v_ashrrev_i32_e32 v6, 31, v3
-; CGP-NEXT:    v_mul_lo_u32 v8, -1, v4
+; CGP-NEXT:    v_mul_lo_u32 v8, -1, v5
 ; CGP-NEXT:    v_mul_lo_u32 v9, s6, v7
-; CGP-NEXT:    v_mul_hi_u32 v11, s6, v4
-; CGP-NEXT:    v_mul_lo_u32 v10, s6, v4
+; CGP-NEXT:    v_mul_hi_u32 v11, s6, v5
+; CGP-NEXT:    v_mul_lo_u32 v10, s6, v5
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
 ; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v6, vcc
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
 ; CGP-NEXT:    v_mul_lo_u32 v9, v7, v10
-; CGP-NEXT:    v_mul_lo_u32 v11, v4, v8
-; CGP-NEXT:    v_mul_hi_u32 v12, v4, v10
+; CGP-NEXT:    v_mul_lo_u32 v11, v5, v8
+; CGP-NEXT:    v_mul_hi_u32 v12, v5, v10
 ; CGP-NEXT:    v_mul_hi_u32 v10, v7, v10
-; CGP-NEXT:    v_xor_b32_e32 v0, v0, v5
+; CGP-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v12, v7, v8
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
-; CGP-NEXT:    v_mul_hi_u32 v11, v4, v8
+; CGP-NEXT:    v_mul_hi_u32 v11, v5, v8
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
@@ -2403,18 +2403,18 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
 ; CGP-NEXT:    v_addc_u32_e64 v9, s[4:5], v7, v8, vcc
-; CGP-NEXT:    v_mul_lo_u32 v10, -1, v4
+; CGP-NEXT:    v_mul_lo_u32 v10, -1, v5
 ; CGP-NEXT:    v_mul_lo_u32 v11, s6, v9
-; CGP-NEXT:    v_mul_hi_u32 v13, s6, v4
-; CGP-NEXT:    v_mul_lo_u32 v12, s6, v4
+; CGP-NEXT:    v_mul_hi_u32 v13, s6, v5
+; CGP-NEXT:    v_mul_lo_u32 v12, s6, v5
 ; CGP-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v8
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
 ; CGP-NEXT:    v_mul_lo_u32 v11, v9, v12
-; CGP-NEXT:    v_mul_lo_u32 v13, v4, v10
-; CGP-NEXT:    v_mul_hi_u32 v8, v4, v12
+; CGP-NEXT:    v_mul_lo_u32 v13, v5, v10
+; CGP-NEXT:    v_mul_hi_u32 v8, v5, v12
 ; CGP-NEXT:    v_mul_hi_u32 v12, v9, v12
 ; CGP-NEXT:    v_xor_b32_e32 v2, v2, v6
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
@@ -2423,7 +2423,7 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
 ; CGP-NEXT:    v_mul_lo_u32 v11, v9, v10
 ; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v13, v8
-; CGP-NEXT:    v_mul_hi_u32 v13, v4, v10
+; CGP-NEXT:    v_mul_hi_u32 v13, v5, v10
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
@@ -2435,30 +2435,30 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v12, v11
 ; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
 ; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v9, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; CGP-NEXT:    v_xor_b32_e32 v3, v3, v6
 ; CGP-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; CGP-NEXT:    v_xor_b32_e32 v1, v1, v5
-; CGP-NEXT:    v_mul_lo_u32 v8, v3, v4
+; CGP-NEXT:    v_xor_b32_e32 v1, v1, v4
+; CGP-NEXT:    v_mul_lo_u32 v8, v3, v5
 ; CGP-NEXT:    v_mul_lo_u32 v9, v2, v7
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
-; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
-; CGP-NEXT:    v_mul_hi_u32 v5, v2, v4
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
+; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
+; CGP-NEXT:    v_mul_hi_u32 v4, v2, v5
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
+; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v8, v3, v7
-; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
+; CGP-NEXT:    v_mul_hi_u32 v5, v3, v5
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
 ; CGP-NEXT:    v_mul_hi_u32 v9, v2, v7
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
 ; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; CGP-NEXT:    v_mul_hi_u32 v7, v3, v7
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
@@ -2703,28 +2703,28 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_mov_b64 s[6:7], 0x1000
-; GISEL-NEXT:    v_lshl_b64 v[4:5], s[6:7], v4
+; GISEL-NEXT:    v_lshl_b64 v[7:8], s[6:7], v4
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v10, 31, v1
-; GISEL-NEXT:    v_ashrrev_i32_e32 v7, 31, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v4, v4, v7
-; GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v8, v4
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v9, v5
+; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v8
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v4
+; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, v8, v4, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v5, v5, v4
+; GISEL-NEXT:    v_xor_b32_e32 v7, v7, v4
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v8, v5
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v9, v7
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
 ; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v10, vcc
 ; GISEL-NEXT:    v_mac_f32_e32 v8, 0x4f800000, v9
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v8, v8
 ; GISEL-NEXT:    v_xor_b32_e32 v9, v0, v10
-; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, 0, v4
+; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, 0, v5
 ; GISEL-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v8
 ; GISEL-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v0
 ; GISEL-NEXT:    v_trunc_f32_e32 v8, v8
 ; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v8
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; GISEL-NEXT:    v_subb_u32_e32 v12, vcc, 0, v5, vcc
+; GISEL-NEXT:    v_subb_u32_e32 v12, vcc, 0, v7, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v13, v12, v0
 ; GISEL-NEXT:    v_mul_lo_u32 v14, v11, v8
 ; GISEL-NEXT:    v_mul_hi_u32 v16, v11, v0
@@ -2808,51 +2808,51 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v12, v8
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v11, v8
-; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v6
-; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v8
-; GISEL-NEXT:    v_mul_hi_u32 v14, v4, v6
-; GISEL-NEXT:    v_mul_lo_u32 v13, v4, v6
+; GISEL-NEXT:    v_mul_lo_u32 v11, v7, v6
+; GISEL-NEXT:    v_mul_lo_u32 v12, v5, v8
+; GISEL-NEXT:    v_mul_hi_u32 v14, v5, v6
+; GISEL-NEXT:    v_mul_lo_u32 v13, v5, v6
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
 ; GISEL-NEXT:    v_sub_i32_e32 v9, vcc, v9, v13
 ; GISEL-NEXT:    v_subb_u32_e64 v12, s[4:5], v17, v11, vcc
 ; GISEL-NEXT:    v_sub_i32_e64 v11, s[4:5], v17, v11
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v12, v5
-; GISEL-NEXT:    v_subb_u32_e32 v11, vcc, v11, v5, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v12, v7
+; GISEL-NEXT:    v_subb_u32_e32 v11, vcc, v11, v7, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v4
-; GISEL-NEXT:    v_sub_i32_e32 v9, vcc, v9, v4
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v5
+; GISEL-NEXT:    v_sub_i32_e32 v9, vcc, v9, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v12, v5
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v12, v7
 ; GISEL-NEXT:    v_subbrev_u32_e32 v11, vcc, 0, v11, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, v13, v14, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, 1, v6
 ; GISEL-NEXT:    v_addc_u32_e32 v14, vcc, 0, v8, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v11, v5
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v11, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v9, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v11, v5
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, v15, v4, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, 1, v13
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v9, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v11, v7
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v15, v5, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v13
 ; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v14, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, v13, v5, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v5, v14, v9, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v13, v7, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v7, v14, v9, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v12
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v6, v10, v7
-; GISEL-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
-; GISEL-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v7, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v8, v0, v7
-; GISEL-NEXT:    v_xor_b32_e32 v9, v1, v7
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v7, v10, v4
+; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v8, v0, v4
+; GISEL-NEXT:    v_xor_b32_e32 v9, v1, v4
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v0, v8
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v1, v9
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v10, 31, v3
-; GISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
-; GISEL-NEXT:    v_xor_b32_e32 v5, v5, v6
+; GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GISEL-NEXT:    v_xor_b32_e32 v6, v6, v7
 ; GISEL-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v2, v10
@@ -2877,6 +2877,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_mul_lo_u32 v16, v0, v13
 ; GISEL-NEXT:    v_mul_hi_u32 v17, v0, v15
 ; GISEL-NEXT:    v_mul_hi_u32 v15, v1, v15
+; GISEL-NEXT:    v_xor_b32_e32 v4, v10, v4
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
@@ -2929,47 +2930,47 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_addc_u32_e32 v12, vcc, 0, v1, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v13, v2, v11
 ; GISEL-NEXT:    v_mul_lo_u32 v14, v3, v12
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v4, v6
-; GISEL-NEXT:    v_mul_hi_u32 v4, v3, v11
-; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v5, v6, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v13, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v5, v2, v12
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v5, v7
+; GISEL-NEXT:    v_mul_hi_u32 v5, v3, v11
+; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v6, v7, vcc
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v13, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v6, v2, v12
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v2, v11
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; GISEL-NEXT:    v_mul_hi_u32 v6, v3, v12
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v11
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
+; GISEL-NEXT:    v_mul_hi_u32 v7, v3, v12
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v11, v6
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v11, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v2, v12
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v11, v5
-; GISEL-NEXT:    v_mul_lo_u32 v6, v9, v4
-; GISEL-NEXT:    v_mul_lo_u32 v11, v8, v5
-; GISEL-NEXT:    v_mul_hi_u32 v13, v8, v4
-; GISEL-NEXT:    v_mul_lo_u32 v12, v8, v4
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v11, v6
+; GISEL-NEXT:    v_mul_lo_u32 v7, v9, v5
+; GISEL-NEXT:    v_mul_lo_u32 v11, v8, v6
+; GISEL-NEXT:    v_mul_hi_u32 v13, v8, v5
+; GISEL-NEXT:    v_mul_lo_u32 v12, v8, v5
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
 ; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, v3, v12
-; GISEL-NEXT:    v_subb_u32_e64 v11, s[4:5], v2, v6, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v6
+; GISEL-NEXT:    v_subb_u32_e64 v11, s[4:5], v2, v7, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v7
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v9
 ; GISEL-NEXT:    v_subb_u32_e32 v2, vcc, v2, v9, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v3, v8
 ; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, v3, v8
 ; GISEL-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v11, v9
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, 1, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, v12, s[4:5]
-; GISEL-NEXT:    v_addc_u32_e32 v12, vcc, 0, v5, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, 1, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, v7, v12, s[4:5]
+; GISEL-NEXT:    v_addc_u32_e32 v12, vcc, 0, v6, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, -1, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v8
@@ -2981,10 +2982,9 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v11, v3, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v3, v12, v8, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v4, v10, v7
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
 ; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v4
 ; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v4
 ; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v4
@@ -3433,56 +3433,56 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_subb_u32_e32 v8, vcc, 0, v3, vcc
 ; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; GISEL-NEXT:    v_and_b32_e32 v0, s6, v0
+; GISEL-NEXT:    v_and_b32_e32 v5, s6, v0
+; GISEL-NEXT:    v_and_b32_e32 v0, s6, v2
 ; GISEL-NEXT:    v_and_b32_e32 v6, s6, v6
-; GISEL-NEXT:    v_and_b32_e32 v2, s6, v2
-; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT:    v_trunc_f32_e32 v5, v5
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
+; GISEL-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v4
+; GISEL-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v2
+; GISEL-NEXT:    v_trunc_f32_e32 v4, v4
+; GISEL-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v4
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GISEL-NEXT:    v_mul_lo_u32 v9, v8, v4
-; GISEL-NEXT:    v_mul_lo_u32 v10, v7, v5
-; GISEL-NEXT:    v_mul_hi_u32 v12, v7, v4
-; GISEL-NEXT:    v_mul_lo_u32 v11, v7, v4
+; GISEL-NEXT:    v_mul_lo_u32 v9, v8, v2
+; GISEL-NEXT:    v_mul_lo_u32 v10, v7, v4
+; GISEL-NEXT:    v_mul_hi_u32 v12, v7, v2
+; GISEL-NEXT:    v_mul_lo_u32 v11, v7, v2
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
-; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v11
-; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v9
-; GISEL-NEXT:    v_mul_hi_u32 v14, v4, v11
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, 0, v0
+; GISEL-NEXT:    v_mul_lo_u32 v10, v4, v11
+; GISEL-NEXT:    v_mul_lo_u32 v12, v2, v9
+; GISEL-NEXT:    v_mul_hi_u32 v14, v2, v11
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, 0, v5
 ; GISEL-NEXT:    v_addc_u32_e64 v13, s[4:5], 0, 0, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v14, v5, v9
-; GISEL-NEXT:    v_mul_hi_u32 v11, v5, v11
+; GISEL-NEXT:    v_mul_lo_u32 v14, v4, v9
+; GISEL-NEXT:    v_mul_hi_u32 v11, v4, v11
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
-; GISEL-NEXT:    v_mul_hi_u32 v12, v4, v9
+; GISEL-NEXT:    v_mul_hi_u32 v12, v2, v9
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v14, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
-; GISEL-NEXT:    v_mul_hi_u32 v9, v5, v9
+; GISEL-NEXT:    v_mul_hi_u32 v9, v4, v9
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
-; GISEL-NEXT:    v_addc_u32_e64 v10, s[4:5], v5, v9, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, v8, v4
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v10
+; GISEL-NEXT:    v_addc_u32_e64 v10, s[4:5], v4, v9, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v8, v8, v2
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v7, v10
-; GISEL-NEXT:    v_mul_lo_u32 v12, v7, v4
-; GISEL-NEXT:    v_mul_hi_u32 v7, v7, v4
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v9
+; GISEL-NEXT:    v_mul_lo_u32 v12, v7, v2
+; GISEL-NEXT:    v_mul_hi_u32 v7, v7, v2
+; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v9
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v8, v7
 ; GISEL-NEXT:    v_mul_lo_u32 v8, v10, v12
-; GISEL-NEXT:    v_mul_lo_u32 v11, v4, v7
-; GISEL-NEXT:    v_mul_hi_u32 v9, v4, v12
+; GISEL-NEXT:    v_mul_lo_u32 v11, v2, v7
+; GISEL-NEXT:    v_mul_hi_u32 v9, v2, v12
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v10, v12
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
@@ -3490,7 +3490,7 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_mul_lo_u32 v9, v10, v7
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v11, v8
-; GISEL-NEXT:    v_mul_hi_u32 v11, v4, v7
+; GISEL-NEXT:    v_mul_hi_u32 v11, v2, v7
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v11
@@ -3501,71 +3501,71 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v9
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v9
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v7, v13, v4
-; GISEL-NEXT:    v_mul_lo_u32 v8, v0, v5
-; GISEL-NEXT:    v_mul_hi_u32 v9, v0, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, v13, v4
+; GISEL-NEXT:    v_addc_u32_e32 v4, vcc, v4, v7, vcc
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
+; GISEL-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, v13, v2
+; GISEL-NEXT:    v_mul_lo_u32 v8, v5, v4
+; GISEL-NEXT:    v_mul_hi_u32 v9, v5, v2
+; GISEL-NEXT:    v_mul_hi_u32 v2, v13, v2
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v9, v13, v5
+; GISEL-NEXT:    v_mul_lo_u32 v9, v13, v4
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT:    v_mul_hi_u32 v8, v0, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
+; GISEL-NEXT:    v_mul_hi_u32 v8, v5, v4
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v9, v2
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT:    v_mul_hi_u32 v5, v13, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
+; GISEL-NEXT:    v_mul_hi_u32 v4, v13, v4
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; GISEL-NEXT:    v_mul_lo_u32 v7, v3, v4
-; GISEL-NEXT:    v_mul_lo_u32 v8, v1, v5
-; GISEL-NEXT:    v_mul_hi_u32 v10, v1, v4
-; GISEL-NEXT:    v_mul_lo_u32 v9, v1, v4
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
+; GISEL-NEXT:    v_mul_lo_u32 v7, v3, v2
+; GISEL-NEXT:    v_mul_lo_u32 v8, v1, v4
+; GISEL-NEXT:    v_mul_hi_u32 v10, v1, v2
+; GISEL-NEXT:    v_mul_lo_u32 v9, v1, v2
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v9
+; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, v5, v9
 ; GISEL-NEXT:    v_subb_u32_e64 v8, s[4:5], v13, v7, vcc
 ; GISEL-NEXT:    v_sub_i32_e64 v7, s[4:5], v13, v7
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v3
 ; GISEL-NEXT:    v_subb_u32_e32 v7, vcc, v7, v3, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v1
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v1
+; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, v5, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v8, v3
 ; GISEL-NEXT:    v_subbrev_u32_e32 v7, vcc, 0, v7, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, v9, v10, s[4:5]
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, 1, v4
-; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, 0, v5, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, 1, v2
+; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, 0, v4, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
 ; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v11, v0, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v9
-; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, 0, v10, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v11, v1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v9
+; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v10, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, 0, v6
 ; GISEL-NEXT:    v_addc_u32_e64 v7, s[4:5], 0, 0, vcc
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v11, v6
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v12, v7
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v9, v1, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v10, v3, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v9, v3, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v10, v5, vcc
 ; GISEL-NEXT:    v_mac_f32_e32 v11, 0x4f800000, v12
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v3, v11
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v11
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GISEL-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
+; GISEL-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v5
 ; GISEL-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
 ; GISEL-NEXT:    v_trunc_f32_e32 v4, v4
 ; GISEL-NEXT:    v_mac_f32_e32 v3, 0xcf800000, v4
@@ -3581,117 +3581,117 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v4, v11
 ; GISEL-NEXT:    v_mul_lo_u32 v12, v3, v9
-; GISEL-NEXT:    v_mul_hi_u32 v14, v3, v11
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0, v2
-; GISEL-NEXT:    v_addc_u32_e64 v13, s[4:5], 0, 0, vcc
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, 0, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v3, v11
+; GISEL-NEXT:    v_addc_u32_e64 v14, s[4:5], 0, 0, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v14, v4, v9
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v10, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v10, v4, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v4, v11
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v12, v0
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v3, v9
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v14, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; GISEL-NEXT:    v_mul_hi_u32 v9, v4, v9
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v10, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v10
-; GISEL-NEXT:    v_addc_u32_e64 v10, s[4:5], v4, v9, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, v8, v3
-; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v10
-; GISEL-NEXT:    v_mul_lo_u32 v12, v5, v3
-; GISEL-NEXT:    v_mul_hi_u32 v5, v5, v3
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
+; GISEL-NEXT:    v_addc_u32_e64 v3, s[4:5], v4, v9, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v8, v8, v0
+; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v3
+; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v0
+; GISEL-NEXT:    v_mul_hi_u32 v5, v5, v0
 ; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v9
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
+; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v10
 ; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v8, v5
-; GISEL-NEXT:    v_mul_lo_u32 v8, v10, v12
-; GISEL-NEXT:    v_mul_lo_u32 v11, v3, v5
-; GISEL-NEXT:    v_mul_hi_u32 v9, v3, v12
-; GISEL-NEXT:    v_mul_hi_u32 v12, v10, v12
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
+; GISEL-NEXT:    v_mul_lo_u32 v8, v3, v11
+; GISEL-NEXT:    v_mul_lo_u32 v10, v0, v5
+; GISEL-NEXT:    v_mul_hi_u32 v9, v0, v11
+; GISEL-NEXT:    v_mul_hi_u32 v11, v3, v11
+; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v9, v10, v5
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v11, v8
-; GISEL-NEXT:    v_mul_hi_u32 v11, v3, v5
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; GISEL-NEXT:    v_mul_lo_u32 v9, v3, v5
+; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v10, v8
+; GISEL-NEXT:    v_mul_hi_u32 v10, v0, v5
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
-; GISEL-NEXT:    v_mul_hi_u32 v5, v10, v5
+; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
+; GISEL-NEXT:    v_mul_hi_u32 v3, v3, v5
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v9
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v9
-; GISEL-NEXT:    v_addc_u32_e32 v4, vcc, v4, v5, vcc
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v8
-; GISEL-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v5, v13, v3
-; GISEL-NEXT:    v_mul_lo_u32 v8, v2, v4
-; GISEL-NEXT:    v_mul_hi_u32 v9, v2, v3
-; GISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 0, v0
-; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
+; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v10, v9
+; GISEL-NEXT:    v_add_i32_e64 v3, s[4:5], v3, v5
+; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v4, v3, vcc
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v0, v8
+; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v5, v14, v4
+; GISEL-NEXT:    v_mul_lo_u32 v8, v13, v3
+; GISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 0, v1
+; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v2, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v2, v13, v4
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v5, v14, v3
+; GISEL-NEXT:    v_mul_hi_u32 v4, v14, v4
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v8, v2
+; GISEL-NEXT:    v_mul_hi_u32 v8, v13, v3
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v9, v13, v4
-; GISEL-NEXT:    v_mul_hi_u32 v3, v13, v3
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; GISEL-NEXT:    v_mul_hi_u32 v8, v2, v4
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v9, v3
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v8
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT:    v_mul_hi_u32 v4, v13, v4
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT:    v_mul_lo_u32 v5, v7, v3
-; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v4
-; GISEL-NEXT:    v_mul_hi_u32 v10, v6, v3
-; GISEL-NEXT:    v_mul_lo_u32 v9, v6, v3
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v9
-; GISEL-NEXT:    v_subb_u32_e64 v8, s[4:5], v13, v5, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v5, s[4:5], v13, v5
+; GISEL-NEXT:    v_mul_hi_u32 v3, v14, v3
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; GISEL-NEXT:    v_mul_lo_u32 v4, v7, v2
+; GISEL-NEXT:    v_mul_lo_u32 v5, v6, v3
+; GISEL-NEXT:    v_mul_hi_u32 v9, v6, v2
+; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v2
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
+; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, v13, v8
+; GISEL-NEXT:    v_subb_u32_e64 v8, s[4:5], v14, v4, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v4, s[4:5], v14, v4
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v7
-; GISEL-NEXT:    v_subb_u32_e32 v5, vcc, v5, v7, vcc
+; GISEL-NEXT:    v_subb_u32_e32 v4, vcc, v4, v7, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v6
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v6
+; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, v5, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v8, v7
-; GISEL-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
+; GISEL-NEXT:    v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, v9, v10, s[4:5]
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, 1, v3
-; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v7
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, 1, v2
+; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, 0, v3, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, v11, v5, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, 1, v9
 ; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, 0, v10, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v9, v5, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, v9, v5, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v5, v10, v6, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, 0, v2
 ; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index 59db307e8a9bd..ee7f2a8caa749 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -1346,26 +1346,26 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_ashr_i32 s6, s9, 31
 ; GFX8-NEXT:    s_ashr_i32 s12, s1, 31
-; GFX8-NEXT:    s_add_u32 s8, s8, s6
+; GFX8-NEXT:    s_add_u32 s14, s8, s6
 ; GFX8-NEXT:    s_cselect_b32 s7, 1, 0
 ; GFX8-NEXT:    s_and_b32 s7, s7, 1
 ; GFX8-NEXT:    s_cmp_lg_u32 s7, 0
-; GFX8-NEXT:    s_addc_u32 s9, s9, s6
+; GFX8-NEXT:    s_addc_u32 s15, s9, s6
 ; GFX8-NEXT:    s_add_u32 s0, s0, s12
 ; GFX8-NEXT:    s_cselect_b32 s7, 1, 0
 ; GFX8-NEXT:    s_and_b32 s7, s7, 1
 ; GFX8-NEXT:    s_cmp_lg_u32 s7, 0
 ; GFX8-NEXT:    s_mov_b32 s13, s12
 ; GFX8-NEXT:    s_addc_u32 s1, s1, s12
-; GFX8-NEXT:    s_xor_b64 s[14:15], s[0:1], s[12:13]
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s15
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s14
+; GFX8-NEXT:    s_xor_b64 s[8:9], s[0:1], s[12:13]
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s9
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s8
 ; GFX8-NEXT:    s_mov_b32 s7, s6
-; GFX8-NEXT:    s_xor_b64 s[8:9], s[8:9], s[6:7]
+; GFX8-NEXT:    s_xor_b64 s[14:15], s[14:15], s[6:7]
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX8-NEXT:    s_sub_u32 s16, 0, s14
+; GFX8-NEXT:    s_sub_u32 s16, 0, s8
 ; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX8-NEXT:    s_and_b32 s0, s0, 1
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -1376,12 +1376,12 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX8-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX8-NEXT:    s_subb_u32 s17, 0, s15
+; GFX8-NEXT:    s_subb_u32 s17, 0, s9
 ; GFX8-NEXT:    v_mul_lo_u32 v2, s16, v1
 ; GFX8-NEXT:    v_mul_lo_u32 v3, s17, v0
 ; GFX8-NEXT:    v_mul_hi_u32 v5, s16, v0
 ; GFX8-NEXT:    v_mul_lo_u32 v4, s16, v0
-; GFX8-NEXT:    v_mov_b32_e32 v6, s15
+; GFX8-NEXT:    v_mov_b32_e32 v6, s9
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
 ; GFX8-NEXT:    v_mul_lo_u32 v3, v1, v4
@@ -1438,55 +1438,55 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v2, s9, v0
-; GFX8-NEXT:    v_mul_lo_u32 v3, s8, v1
-; GFX8-NEXT:    v_mul_hi_u32 v5, s8, v0
-; GFX8-NEXT:    v_mul_hi_u32 v0, s9, v0
-; GFX8-NEXT:    v_mov_b32_e32 v4, s9
+; GFX8-NEXT:    v_mul_lo_u32 v2, s15, v0
+; GFX8-NEXT:    v_mul_lo_u32 v3, s14, v1
+; GFX8-NEXT:    v_mul_hi_u32 v5, s14, v0
+; GFX8-NEXT:    v_mul_hi_u32 v0, s15, v0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s15
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v5, s9, v1
+; GFX8-NEXT:    v_mul_lo_u32 v5, s15, v1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT:    v_mul_hi_u32 v3, s8, v1
+; GFX8-NEXT:    v_mul_hi_u32 v3, s14, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v5, v0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v3
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v5, v3
-; GFX8-NEXT:    v_mul_hi_u32 v1, s9, v1
+; GFX8-NEXT:    v_mul_hi_u32 v1, s15, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT:    v_mul_lo_u32 v2, s15, v0
-; GFX8-NEXT:    v_mul_lo_u32 v3, s14, v1
-; GFX8-NEXT:    v_mul_hi_u32 v7, s14, v0
-; GFX8-NEXT:    v_mul_lo_u32 v5, s14, v0
+; GFX8-NEXT:    v_mul_lo_u32 v2, s9, v0
+; GFX8-NEXT:    v_mul_lo_u32 v3, s8, v1
+; GFX8-NEXT:    v_mul_hi_u32 v7, s8, v0
+; GFX8-NEXT:    v_mul_lo_u32 v5, s8, v0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v7
-; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s8, v5
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s14, v5
 ; GFX8-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v2, vcc
-; GFX8-NEXT:    v_sub_u32_e64 v2, s[0:1], s9, v2
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v4
+; GFX8-NEXT:    v_sub_u32_e64 v2, s[0:1], s15, v2
+; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v4
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v3
+; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v3
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s15, v4
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v4
 ; GFX8-NEXT:    v_subb_u32_e32 v2, vcc, v2, v6, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, v7, s[0:1]
-; GFX8-NEXT:    v_subrev_u32_e32 v7, vcc, s14, v3
+; GFX8-NEXT:    v_subrev_u32_e32 v7, vcc, s8, v3
 ; GFX8-NEXT:    v_subbrev_u32_e64 v8, s[0:1], 0, v2, vcc
 ; GFX8-NEXT:    v_add_u32_e64 v9, s[0:1], 1, v0
 ; GFX8-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v1, s[0:1]
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v8
+; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v8
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v7
+; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v7
 ; GFX8-NEXT:    v_subb_u32_e32 v2, vcc, v2, v6, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s15, v8
-; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s14, v7
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v8
+; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s8, v7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[0:1]
 ; GFX8-NEXT:    v_add_u32_e64 v12, s[0:1], 1, v9
 ; GFX8-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
@@ -1698,26 +1698,26 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_ashr_i32 s6, s9, 31
 ; GFX9-NEXT:    s_ashr_i32 s12, s1, 31
-; GFX9-NEXT:    s_add_u32 s8, s8, s6
+; GFX9-NEXT:    s_add_u32 s14, s8, s6
 ; GFX9-NEXT:    s_cselect_b32 s7, 1, 0
 ; GFX9-NEXT:    s_and_b32 s7, s7, 1
 ; GFX9-NEXT:    s_cmp_lg_u32 s7, 0
-; GFX9-NEXT:    s_addc_u32 s9, s9, s6
+; GFX9-NEXT:    s_addc_u32 s15, s9, s6
 ; GFX9-NEXT:    s_add_u32 s0, s0, s12
 ; GFX9-NEXT:    s_cselect_b32 s7, 1, 0
 ; GFX9-NEXT:    s_and_b32 s7, s7, 1
 ; GFX9-NEXT:    s_cmp_lg_u32 s7, 0
 ; GFX9-NEXT:    s_mov_b32 s13, s12
 ; GFX9-NEXT:    s_addc_u32 s1, s1, s12
-; GFX9-NEXT:    s_xor_b64 s[14:15], s[0:1], s[12:13]
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s15
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s14
+; GFX9-NEXT:    s_xor_b64 s[8:9], s[0:1], s[12:13]
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s9
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s6
-; GFX9-NEXT:    s_xor_b64 s[8:9], s[8:9], s[6:7]
+; GFX9-NEXT:    s_xor_b64 s[14:15], s[14:15], s[6:7]
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_sub_u32 s16, 0, s14
+; GFX9-NEXT:    s_sub_u32 s16, 0, s8
 ; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX9-NEXT:    s_and_b32 s0, s0, 1
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -1728,7 +1728,7 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX9-NEXT:    s_subb_u32 s17, 0, s15
+; GFX9-NEXT:    s_subb_u32 s17, 0, s9
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s16, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s17, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s16, v0
@@ -1785,19 +1785,19 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, s9, v0
-; GFX9-NEXT:    v_mul_lo_u32 v3, s8, v1
-; GFX9-NEXT:    v_mul_hi_u32 v4, s8, v0
-; GFX9-NEXT:    v_mul_hi_u32 v0, s9, v0
-; GFX9-NEXT:    v_mov_b32_e32 v7, s9
+; GFX9-NEXT:    v_mul_lo_u32 v2, s15, v0
+; GFX9-NEXT:    v_mul_lo_u32 v3, s14, v1
+; GFX9-NEXT:    v_mul_hi_u32 v4, s14, v0
+; GFX9-NEXT:    v_mul_hi_u32 v0, s15, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, s15
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, s9, v1
+; GFX9-NEXT:    v_mul_lo_u32 v4, s15, v1
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_mul_hi_u32 v3, s8, v1
-; GFX9-NEXT:    v_mul_hi_u32 v1, s9, v1
+; GFX9-NEXT:    v_mul_hi_u32 v3, s14, v1
+; GFX9-NEXT:    v_mul_hi_u32 v1, s15, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
@@ -1806,33 +1806,33 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT:    v_add3_u32 v1, v3, v2, v1
-; GFX9-NEXT:    v_mul_lo_u32 v2, s15, v0
-; GFX9-NEXT:    v_mul_lo_u32 v3, s14, v1
-; GFX9-NEXT:    v_mul_hi_u32 v4, s14, v0
-; GFX9-NEXT:    v_mul_lo_u32 v6, s14, v0
-; GFX9-NEXT:    v_mov_b32_e32 v5, s15
+; GFX9-NEXT:    v_mul_lo_u32 v2, s9, v0
+; GFX9-NEXT:    v_mul_lo_u32 v3, s8, v1
+; GFX9-NEXT:    v_mul_hi_u32 v4, s8, v0
+; GFX9-NEXT:    v_mul_lo_u32 v6, s8, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, s9
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v3, v4
-; GFX9-NEXT:    v_sub_co_u32_e32 v3, vcc, s8, v6
+; GFX9-NEXT:    v_sub_co_u32_e32 v3, vcc, s14, v6
 ; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v7, v2, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v4
-; GFX9-NEXT:    v_sub_u32_e32 v2, s9, v2
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v4
+; GFX9-NEXT:    v_sub_u32_e32 v2, s15, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v3
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s15, v4
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v4
 ; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v5, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[0:1]
-; GFX9-NEXT:    v_subrev_co_u32_e32 v7, vcc, s14, v3
+; GFX9-NEXT:    v_subrev_co_u32_e32 v7, vcc, s8, v3
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v8, s[0:1], 0, v2, vcc
 ; GFX9-NEXT:    v_add_co_u32_e64 v9, s[0:1], 1, v0
 ; GFX9-NEXT:    v_addc_co_u32_e64 v10, s[0:1], 0, v1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v8
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v7
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v7
 ; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v5, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s15, v8
-; GFX9-NEXT:    v_subrev_co_u32_e32 v5, vcc, s14, v7
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v8
+; GFX9-NEXT:    v_subrev_co_u32_e32 v5, vcc, s8, v7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[0:1]
 ; GFX9-NEXT:    v_add_co_u32_e64 v12, s[0:1], 1, v9
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index a2efc7cec7a6c..676178c6de26c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -21,40 +21,40 @@ define i64 @v_srem_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
 ; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v2, v0
 ; CHECK-NEXT:    v_addc_u32_e32 v2, vcc, v3, v0, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v0
+; CHECK-NEXT:    v_xor_b32_e32 v3, v1, v0
 ; CHECK-NEXT:    v_xor_b32_e32 v0, v2, v0
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, v1
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v3, v0
-; CHECK-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
-; CHECK-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, v3
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v6, v0
+; CHECK-NEXT:    v_ashrrev_i32_e32 v1, 31, v5
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v1
+; CHECK-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v6
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v4, v6
-; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v5, v6, vcc
+; CHECK-NEXT:    v_addc_u32_e32 v5, vcc, v5, v1, vcc
+; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, 0, v3
 ; CHECK-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
-; CHECK-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v2
-; CHECK-NEXT:    v_trunc_f32_e32 v5, v5
-; CHECK-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v5
+; CHECK-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v2
+; CHECK-NEXT:    v_trunc_f32_e32 v6, v6
+; CHECK-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v6
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, 0, v1
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; CHECK-NEXT:    v_subb_u32_e32 v8, vcc, 0, v0, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v9, v8, v2
-; CHECK-NEXT:    v_mul_lo_u32 v10, v7, v5
+; CHECK-NEXT:    v_mul_lo_u32 v10, v7, v6
 ; CHECK-NEXT:    v_mul_hi_u32 v12, v7, v2
 ; CHECK-NEXT:    v_mul_lo_u32 v11, v7, v2
-; CHECK-NEXT:    v_xor_b32_e32 v3, v3, v6
+; CHECK-NEXT:    v_xor_b32_e32 v4, v4, v1
 ; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
-; CHECK-NEXT:    v_mul_lo_u32 v10, v5, v11
+; CHECK-NEXT:    v_mul_lo_u32 v10, v6, v11
 ; CHECK-NEXT:    v_mul_lo_u32 v12, v2, v9
 ; CHECK-NEXT:    v_mul_hi_u32 v13, v2, v11
-; CHECK-NEXT:    v_mul_hi_u32 v11, v5, v11
-; CHECK-NEXT:    v_xor_b32_e32 v4, v4, v6
+; CHECK-NEXT:    v_mul_hi_u32 v11, v6, v11
+; CHECK-NEXT:    v_xor_b32_e32 v5, v5, v1
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
 ; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v13, v5, v9
+; CHECK-NEXT:    v_mul_lo_u32 v13, v6, v9
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
 ; CHECK-NEXT:    v_mul_hi_u32 v12, v2, v9
 ; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
@@ -62,18 +62,18 @@ define i64 @v_srem_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; CHECK-NEXT:    v_mul_hi_u32 v9, v5, v9
+; CHECK-NEXT:    v_mul_hi_u32 v9, v6, v9
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
 ; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v10
-; CHECK-NEXT:    v_addc_u32_e64 v10, s[4:5], v5, v9, vcc
+; CHECK-NEXT:    v_addc_u32_e64 v10, s[4:5], v6, v9, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v8, v8, v2
 ; CHECK-NEXT:    v_mul_lo_u32 v11, v7, v10
 ; CHECK-NEXT:    v_mul_lo_u32 v12, v7, v2
 ; CHECK-NEXT:    v_mul_hi_u32 v7, v7, v2
-; CHECK-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v9
+; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v9
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
 ; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v8, v7
 ; CHECK-NEXT:    v_mul_lo_u32 v8, v10, v12
@@ -97,67 +97,67 @@ define i64 @v_srem_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v9
 ; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v9
-; CHECK-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; CHECK-NEXT:    v_addc_u32_e32 v6, vcc, v6, v7, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
-; CHECK-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v7, v4, v2
-; CHECK-NEXT:    v_mul_lo_u32 v8, v3, v5
-; CHECK-NEXT:    v_mul_hi_u32 v9, v3, v2
-; CHECK-NEXT:    v_mul_hi_u32 v2, v4, v2
+; CHECK-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v7, v5, v2
+; CHECK-NEXT:    v_mul_lo_u32 v8, v4, v6
+; CHECK-NEXT:    v_mul_hi_u32 v9, v4, v2
+; CHECK-NEXT:    v_mul_hi_u32 v2, v5, v2
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v9, v4, v5
+; CHECK-NEXT:    v_mul_lo_u32 v9, v5, v6
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; CHECK-NEXT:    v_mul_hi_u32 v8, v3, v5
+; CHECK-NEXT:    v_mul_hi_u32 v8, v4, v6
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v9, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; CHECK-NEXT:    v_mul_hi_u32 v5, v4, v5
+; CHECK-NEXT:    v_mul_hi_u32 v6, v5, v6
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
 ; CHECK-NEXT:    v_mul_lo_u32 v7, v0, v2
-; CHECK-NEXT:    v_mul_lo_u32 v5, v1, v5
-; CHECK-NEXT:    v_mul_lo_u32 v8, v1, v2
-; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
-; CHECK-NEXT:    v_sub_i32_e32 v3, vcc, v3, v8
-; CHECK-NEXT:    v_subb_u32_e64 v5, s[4:5], v4, v2, vcc
-; CHECK-NEXT:    v_sub_i32_e64 v2, s[4:5], v4, v2
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v0
-; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v3, v1
+; CHECK-NEXT:    v_mul_lo_u32 v6, v3, v6
+; CHECK-NEXT:    v_mul_lo_u32 v8, v3, v2
+; CHECK-NEXT:    v_mul_hi_u32 v2, v3, v2
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
+; CHECK-NEXT:    v_sub_i32_e32 v4, vcc, v4, v8
+; CHECK-NEXT:    v_subb_u32_e64 v6, s[4:5], v5, v2, vcc
+; CHECK-NEXT:    v_sub_i32_e64 v2, s[4:5], v5, v2
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v0
+; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[4:5]
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v3
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], v5, v0
+; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], v6, v0
 ; CHECK-NEXT:    v_subb_u32_e32 v2, vcc, v2, v0, vcc
-; CHECK-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[4:5]
-; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, v3, v1
+; CHECK-NEXT:    v_cndmask_b32_e64 v5, v5, v7, s[4:5]
+; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, v4, v3
 ; CHECK-NEXT:    v_subbrev_u32_e64 v8, s[4:5], 0, v2, vcc
 ; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v1
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v3
 ; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
 ; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], v8, v0
 ; CHECK-NEXT:    v_subb_u32_e32 v0, vcc, v2, v0, vcc
-; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v7, v1
+; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, v7, v3
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, v9, v10, s[4:5]
 ; CHECK-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v6
-; CHECK-NEXT:    v_xor_b32_e32 v2, v0, v6
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v1, v6
-; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v2, v6, vcc
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v2, v2, v1
+; CHECK-NEXT:    v_xor_b32_e32 v3, v0, v1
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v2, v1
+; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
 ; CHECK-NEXT:    ; implicit-def: $vgpr2
 ; CHECK-NEXT:    ; implicit-def: $vgpr4
 ; CHECK-NEXT:  BB0_2: ; %Flow
@@ -205,23 +205,23 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:  ; %bb.1:
 ; CHECK-NEXT:    s_ashr_i32 s6, s3, 31
 ; CHECK-NEXT:    s_ashr_i32 s0, s5, 31
-; CHECK-NEXT:    s_add_u32 s8, s2, s6
+; CHECK-NEXT:    s_add_u32 s10, s2, s6
 ; CHECK-NEXT:    s_cselect_b32 s7, 1, 0
 ; CHECK-NEXT:    s_and_b32 s7, s7, 1
 ; CHECK-NEXT:    s_cmp_lg_u32 s7, 0
-; CHECK-NEXT:    s_addc_u32 s9, s3, s6
-; CHECK-NEXT:    s_add_u32 s10, s4, s0
+; CHECK-NEXT:    s_addc_u32 s11, s3, s6
+; CHECK-NEXT:    s_add_u32 s8, s4, s0
 ; CHECK-NEXT:    s_cselect_b32 s3, 1, 0
 ; CHECK-NEXT:    s_and_b32 s3, s3, 1
 ; CHECK-NEXT:    s_cmp_lg_u32 s3, 0
 ; CHECK-NEXT:    s_mov_b32 s1, s0
-; CHECK-NEXT:    s_addc_u32 s11, s5, s0
-; CHECK-NEXT:    s_xor_b64 s[10:11], s[10:11], s[0:1]
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v0, s10
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v1, s11
+; CHECK-NEXT:    s_addc_u32 s9, s5, s0
+; CHECK-NEXT:    s_xor_b64 s[8:9], s[8:9], s[0:1]
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v0, s8
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v1, s9
 ; CHECK-NEXT:    s_mov_b32 s7, s6
-; CHECK-NEXT:    s_xor_b64 s[8:9], s[8:9], s[6:7]
-; CHECK-NEXT:    s_sub_u32 s3, 0, s10
+; CHECK-NEXT:    s_xor_b64 s[10:11], s[10:11], s[6:7]
+; CHECK-NEXT:    s_sub_u32 s3, 0, s8
 ; CHECK-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
@@ -233,8 +233,8 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; CHECK-NEXT:    s_subb_u32 s5, 0, s11
-; CHECK-NEXT:    v_mov_b32_e32 v6, s11
+; CHECK-NEXT:    s_subb_u32 s5, 0, s9
+; CHECK-NEXT:    v_mov_b32_e32 v6, s9
 ; CHECK-NEXT:    v_mul_lo_u32 v3, s3, v1
 ; CHECK-NEXT:    v_mul_lo_u32 v2, s5, v0
 ; CHECK-NEXT:    v_mul_hi_u32 v5, s3, v0
@@ -295,53 +295,53 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v2, s9, v0
-; CHECK-NEXT:    v_mul_lo_u32 v3, s8, v1
-; CHECK-NEXT:    v_mul_hi_u32 v5, s8, v0
-; CHECK-NEXT:    v_mul_hi_u32 v0, s9, v0
-; CHECK-NEXT:    v_mov_b32_e32 v4, s9
+; CHECK-NEXT:    v_mul_lo_u32 v2, s11, v0
+; CHECK-NEXT:    v_mul_lo_u32 v3, s10, v1
+; CHECK-NEXT:    v_mul_hi_u32 v5, s10, v0
+; CHECK-NEXT:    v_mul_hi_u32 v0, s11, v0
+; CHECK-NEXT:    v_mov_b32_e32 v4, s11
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
 ; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v5, s9, v1
+; CHECK-NEXT:    v_mul_lo_u32 v5, s11, v1
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; CHECK-NEXT:    v_mul_hi_u32 v3, s8, v1
+; CHECK-NEXT:    v_mul_hi_u32 v3, s10, v1
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
 ; CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
-; CHECK-NEXT:    v_mul_hi_u32 v1, s9, v1
+; CHECK-NEXT:    v_mul_hi_u32 v1, s11, v1
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
-; CHECK-NEXT:    v_mul_lo_u32 v2, s11, v0
-; CHECK-NEXT:    v_mul_lo_u32 v1, s10, v1
-; CHECK-NEXT:    v_mul_lo_u32 v3, s10, v0
-; CHECK-NEXT:    v_mul_hi_u32 v0, s10, v0
+; CHECK-NEXT:    v_mul_lo_u32 v2, s9, v0
+; CHECK-NEXT:    v_mul_lo_u32 v1, s8, v1
+; CHECK-NEXT:    v_mul_lo_u32 v3, s8, v0
+; CHECK-NEXT:    v_mul_hi_u32 v0, s8, v0
 ; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
-; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, s8, v3
+; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, s10, v3
 ; CHECK-NEXT:    v_subb_u32_e64 v2, s[0:1], v4, v0, vcc
-; CHECK-NEXT:    v_sub_i32_e64 v0, s[0:1], s9, v0
-; CHECK-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v2
+; CHECK-NEXT:    v_sub_i32_e64 v0, s[0:1], s11, v0
+; CHECK-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[0:1]
-; CHECK-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v1
+; CHECK-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v1
 ; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[0:1]
-; CHECK-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v2
+; CHECK-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v2
 ; CHECK-NEXT:    v_subb_u32_e32 v0, vcc, v0, v6, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[0:1]
-; CHECK-NEXT:    v_subrev_i32_e32 v3, vcc, s10, v1
+; CHECK-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v1
 ; CHECK-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, s11, v0
+; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, s9, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
-; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
+; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v0
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v0
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
-; CHECK-NEXT:    v_subrev_i32_e32 v4, vcc, s10, v3
+; CHECK-NEXT:    v_subrev_i32_e32 v4, vcc, s8, v3
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
@@ -398,40 +398,40 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v8, 31, v5
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
 ; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v8, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v4, v4, v8
+; GISEL-NEXT:    v_xor_b32_e32 v9, v4, v8
 ; GISEL-NEXT:    v_xor_b32_e32 v5, v5, v8
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v8, v4
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v9, v5
-; GISEL-NEXT:    v_ashrrev_i32_e32 v10, 31, v1
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT:    v_mac_f32_e32 v8, 0x4f800000, v9
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v8, v9
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v10, v5
+; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT:    v_mac_f32_e32 v8, 0x4f800000, v10
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v8, v8
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v10, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, 0, v4
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, 0, v9
 ; GISEL-NEXT:    v_mul_f32_e32 v8, 0x5f7ffffc, v8
-; GISEL-NEXT:    v_mul_f32_e32 v9, 0x2f800000, v8
-; GISEL-NEXT:    v_trunc_f32_e32 v9, v9
-; GISEL-NEXT:    v_mac_f32_e32 v8, 0xcf800000, v9
+; GISEL-NEXT:    v_mul_f32_e32 v10, 0x2f800000, v8
+; GISEL-NEXT:    v_trunc_f32_e32 v10, v10
+; GISEL-NEXT:    v_mac_f32_e32 v8, 0xcf800000, v10
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v9, v9
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v10, v10
 ; GISEL-NEXT:    v_subb_u32_e32 v12, vcc, 0, v5, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v13, v12, v8
-; GISEL-NEXT:    v_mul_lo_u32 v14, v11, v9
+; GISEL-NEXT:    v_mul_lo_u32 v14, v11, v10
 ; GISEL-NEXT:    v_mul_hi_u32 v16, v11, v8
 ; GISEL-NEXT:    v_mul_lo_u32 v15, v11, v8
-; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v10
+; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
-; GISEL-NEXT:    v_mul_lo_u32 v14, v9, v15
+; GISEL-NEXT:    v_mul_lo_u32 v14, v10, v15
 ; GISEL-NEXT:    v_mul_lo_u32 v16, v8, v13
 ; GISEL-NEXT:    v_mul_hi_u32 v17, v8, v15
-; GISEL-NEXT:    v_mul_hi_u32 v15, v9, v15
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v10
+; GISEL-NEXT:    v_mul_hi_u32 v15, v10, v15
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v4
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v17, v9, v13
+; GISEL-NEXT:    v_mul_lo_u32 v17, v10, v13
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
 ; GISEL-NEXT:    v_mul_hi_u32 v16, v8, v13
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v17, v15
@@ -439,18 +439,18 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
-; GISEL-NEXT:    v_mul_hi_u32 v13, v9, v13
+; GISEL-NEXT:    v_mul_hi_u32 v13, v10, v13
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v14
-; GISEL-NEXT:    v_addc_u32_e64 v14, s[4:5], v9, v13, vcc
+; GISEL-NEXT:    v_addc_u32_e64 v14, s[4:5], v10, v13, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v12, v12, v8
 ; GISEL-NEXT:    v_mul_lo_u32 v15, v11, v14
 ; GISEL-NEXT:    v_mul_lo_u32 v16, v11, v8
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v11, v8
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v13
+; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
 ; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
 ; GISEL-NEXT:    v_mul_lo_u32 v12, v14, v16
@@ -474,207 +474,207 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v15, v13
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
-; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v11, vcc
+; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, v10, v11, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
-; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, 0, v10, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v1, v8
-; GISEL-NEXT:    v_mul_lo_u32 v12, v0, v9
+; GISEL-NEXT:    v_mul_lo_u32 v12, v0, v10
 ; GISEL-NEXT:    v_mul_hi_u32 v13, v0, v8
 ; GISEL-NEXT:    v_mul_hi_u32 v8, v1, v8
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v13, v1, v9
+; GISEL-NEXT:    v_mul_lo_u32 v13, v1, v10
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT:    v_mul_hi_u32 v12, v0, v9
+; GISEL-NEXT:    v_mul_hi_u32 v12, v0, v10
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v13, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT:    v_mul_hi_u32 v9, v1, v9
+; GISEL-NEXT:    v_mul_hi_u32 v10, v1, v10
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v8
-; GISEL-NEXT:    v_mul_lo_u32 v9, v4, v9
-; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v8
-; GISEL-NEXT:    v_mul_hi_u32 v8, v4, v8
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT:    v_mul_lo_u32 v10, v9, v10
+; GISEL-NEXT:    v_mul_lo_u32 v12, v9, v8
+; GISEL-NEXT:    v_mul_hi_u32 v8, v9, v8
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v12
-; GISEL-NEXT:    v_subb_u32_e64 v9, s[4:5], v1, v8, vcc
+; GISEL-NEXT:    v_subb_u32_e64 v10, s[4:5], v1, v8, vcc
 ; GISEL-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v8
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v5
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v4
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v9, v5
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v10, v5
 ; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, v8, v11, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, v0, v4
+; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, v0, v9
 ; GISEL-NEXT:    v_subbrev_u32_e64 v12, s[4:5], 0, v1, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v12, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v4
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v9
 ; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v12, v5
-; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v11, v4
+; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, v11, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, v13, v14, s[4:5]
 ; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, v11, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v11, v5, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v12, v1, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v7
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v4
-; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v7, v4, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v5, v5, v4
-; GISEL-NEXT:    v_xor_b32_e32 v4, v6, v4
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, v5
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v7, v4
-; GISEL-NEXT:    v_ashrrev_i32_e32 v8, 31, v3
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
-; GISEL-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v7
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v6
-; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v8, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v9, vcc, 0, v5
-; GISEL-NEXT:    v_mul_f32_e32 v6, 0x5f7ffffc, v6
-; GISEL-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v6
-; GISEL-NEXT:    v_trunc_f32_e32 v7, v7
-; GISEL-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v7
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GISEL-NEXT:    v_ashrrev_i32_e32 v5, 31, v7
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v5
+; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, v7, v5, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v6, v6, v5
+; GISEL-NEXT:    v_xor_b32_e32 v5, v7, v5
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v7, v6
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v8, v5
+; GISEL-NEXT:    v_ashrrev_i32_e32 v9, 31, v3
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v9
+; GISEL-NEXT:    v_mac_f32_e32 v7, 0x4f800000, v8
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v7, v7
+; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v9, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, 0, v6
+; GISEL-NEXT:    v_mul_f32_e32 v7, 0x5f7ffffc, v7
+; GISEL-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v7
+; GISEL-NEXT:    v_trunc_f32_e32 v8, v8
+; GISEL-NEXT:    v_mac_f32_e32 v7, 0xcf800000, v8
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; GISEL-NEXT:    v_subb_u32_e32 v11, vcc, 0, v4, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v12, v11, v6
-; GISEL-NEXT:    v_mul_lo_u32 v13, v9, v7
-; GISEL-NEXT:    v_mul_hi_u32 v15, v9, v6
-; GISEL-NEXT:    v_mul_lo_u32 v14, v9, v6
-; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v10
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v8, v8
+; GISEL-NEXT:    v_subb_u32_e32 v11, vcc, 0, v5, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v12, v11, v7
+; GISEL-NEXT:    v_mul_lo_u32 v13, v10, v8
+; GISEL-NEXT:    v_mul_hi_u32 v15, v10, v7
+; GISEL-NEXT:    v_mul_lo_u32 v14, v10, v7
+; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
-; GISEL-NEXT:    v_mul_lo_u32 v13, v7, v14
-; GISEL-NEXT:    v_mul_lo_u32 v15, v6, v12
-; GISEL-NEXT:    v_mul_hi_u32 v16, v6, v14
-; GISEL-NEXT:    v_mul_hi_u32 v14, v7, v14
-; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v8
+; GISEL-NEXT:    v_mul_lo_u32 v13, v8, v14
+; GISEL-NEXT:    v_mul_lo_u32 v15, v7, v12
+; GISEL-NEXT:    v_mul_hi_u32 v16, v7, v14
+; GISEL-NEXT:    v_mul_hi_u32 v14, v8, v14
+; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v9
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v16, v7, v12
+; GISEL-NEXT:    v_mul_lo_u32 v16, v8, v12
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
-; GISEL-NEXT:    v_mul_hi_u32 v15, v6, v12
+; GISEL-NEXT:    v_mul_hi_u32 v15, v7, v12
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
-; GISEL-NEXT:    v_mul_hi_u32 v12, v7, v12
+; GISEL-NEXT:    v_mul_hi_u32 v12, v8, v12
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v13
-; GISEL-NEXT:    v_addc_u32_e64 v13, s[4:5], v7, v12, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v11, v11, v6
-; GISEL-NEXT:    v_mul_lo_u32 v14, v9, v13
-; GISEL-NEXT:    v_mul_lo_u32 v15, v9, v6
-; GISEL-NEXT:    v_mul_hi_u32 v9, v9, v6
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v12
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
+; GISEL-NEXT:    v_addc_u32_e64 v13, s[4:5], v8, v12, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v11, v11, v7
+; GISEL-NEXT:    v_mul_lo_u32 v14, v10, v13
+; GISEL-NEXT:    v_mul_lo_u32 v15, v10, v7
+; GISEL-NEXT:    v_mul_hi_u32 v10, v10, v7
+; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v12
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v9
+; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v13, v15
-; GISEL-NEXT:    v_mul_lo_u32 v14, v6, v9
-; GISEL-NEXT:    v_mul_hi_u32 v12, v6, v15
+; GISEL-NEXT:    v_mul_lo_u32 v14, v7, v10
+; GISEL-NEXT:    v_mul_hi_u32 v12, v7, v15
 ; GISEL-NEXT:    v_mul_hi_u32 v15, v13, v15
-; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v8
+; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v9
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v12, v13, v9
+; GISEL-NEXT:    v_mul_lo_u32 v12, v13, v10
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v14, v11
-; GISEL-NEXT:    v_mul_hi_u32 v14, v6, v9
+; GISEL-NEXT:    v_mul_hi_u32 v14, v7, v10
 ; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v15, v14
-; GISEL-NEXT:    v_mul_hi_u32 v9, v13, v9
+; GISEL-NEXT:    v_mul_hi_u32 v10, v13, v10
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v14, v12
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
-; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, v7, v9, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
-; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v10
-; GISEL-NEXT:    v_mul_lo_u32 v9, v3, v6
-; GISEL-NEXT:    v_mul_lo_u32 v11, v2, v7
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v10, vcc
-; GISEL-NEXT:    v_mul_hi_u32 v10, v2, v6
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
+; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v8, v10, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
+; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, 0, v8, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v4
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v3, v7
-; GISEL-NEXT:    v_mul_hi_u32 v6, v3, v6
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
-; GISEL-NEXT:    v_mul_hi_u32 v11, v2, v7
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
+; GISEL-NEXT:    v_mul_lo_u32 v11, v2, v8
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v4, v2, v7
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v10, v3, v8
+; GISEL-NEXT:    v_mul_hi_u32 v7, v3, v7
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
+; GISEL-NEXT:    v_mul_hi_u32 v11, v2, v8
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
-; GISEL-NEXT:    v_mul_hi_u32 v7, v3, v7
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; GISEL-NEXT:    v_mul_lo_u32 v9, v4, v6
-; GISEL-NEXT:    v_mul_lo_u32 v7, v5, v7
-; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v6
-; GISEL-NEXT:    v_mul_hi_u32 v6, v5, v6
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GISEL-NEXT:    v_mul_hi_u32 v8, v3, v8
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; GISEL-NEXT:    v_mul_lo_u32 v8, v5, v4
+; GISEL-NEXT:    v_mul_lo_u32 v7, v6, v7
+; GISEL-NEXT:    v_mul_lo_u32 v10, v6, v4
+; GISEL-NEXT:    v_mul_hi_u32 v4, v6, v4
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
 ; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v10
-; GISEL-NEXT:    v_subb_u32_e64 v7, s[4:5], v3, v6, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v6
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v4
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e32 v9, vcc, v2, v5
+; GISEL-NEXT:    v_subb_u32_e64 v7, s[4:5], v3, v4, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v4
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v5
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v5, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, v8, s[4:5]
+; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, v2, v6
 ; GISEL-NEXT:    v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v4
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v5
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v6
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v5, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v10, v4
-; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v9, v5
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v10, v5
+; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, v8, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[4:5]
 ; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v8
-; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v8
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v8
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v8, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v9
+; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v9
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v9
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v9, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_srem_v2i64:
@@ -695,35 +695,35 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
 ; CGP-NEXT:    v_add_i32_e32 v1, vcc, v4, v0
 ; CGP-NEXT:    v_addc_u32_e32 v2, vcc, v5, v0, vcc
-; CGP-NEXT:    v_xor_b32_e32 v1, v1, v0
+; CGP-NEXT:    v_xor_b32_e32 v3, v1, v0
 ; CGP-NEXT:    v_xor_b32_e32 v0, v2, v0
-; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v1
-; CGP-NEXT:    v_cvt_f32_u32_e32 v3, v0
-; CGP-NEXT:    v_ashrrev_i32_e32 v4, 31, v11
-; CGP-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
+; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v3
+; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v0
+; CGP-NEXT:    v_ashrrev_i32_e32 v1, 31, v11
+; CGP-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v4
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v10, v4
-; CGP-NEXT:    v_addc_u32_e32 v5, vcc, v11, v4, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v10, v1
+; CGP-NEXT:    v_addc_u32_e32 v5, vcc, v11, v1, vcc
 ; CGP-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; CGP-NEXT:    v_mul_f32_e32 v10, 0x2f800000, v2
 ; CGP-NEXT:    v_trunc_f32_e32 v10, v10
 ; CGP-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v10
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v10, v10
-; CGP-NEXT:    v_sub_i32_e32 v11, vcc, 0, v1
+; CGP-NEXT:    v_sub_i32_e32 v11, vcc, 0, v3
 ; CGP-NEXT:    v_subb_u32_e32 v12, vcc, 0, v0, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v13, v12, v2
 ; CGP-NEXT:    v_mul_lo_u32 v14, v11, v10
 ; CGP-NEXT:    v_mul_hi_u32 v16, v11, v2
 ; CGP-NEXT:    v_mul_lo_u32 v15, v11, v2
-; CGP-NEXT:    v_xor_b32_e32 v3, v3, v4
+; CGP-NEXT:    v_xor_b32_e32 v4, v4, v1
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
 ; CGP-NEXT:    v_mul_lo_u32 v14, v10, v15
 ; CGP-NEXT:    v_mul_lo_u32 v16, v2, v13
 ; CGP-NEXT:    v_mul_hi_u32 v17, v2, v15
 ; CGP-NEXT:    v_mul_hi_u32 v15, v10, v15
-; CGP-NEXT:    v_xor_b32_e32 v5, v5, v4
+; CGP-NEXT:    v_xor_b32_e32 v5, v5, v1
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
@@ -775,8 +775,8 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v12
 ; CGP-NEXT:    v_addc_u32_e32 v10, vcc, 0, v10, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v11, v5, v2
-; CGP-NEXT:    v_mul_lo_u32 v12, v3, v10
-; CGP-NEXT:    v_mul_hi_u32 v13, v3, v2
+; CGP-NEXT:    v_mul_lo_u32 v12, v4, v10
+; CGP-NEXT:    v_mul_hi_u32 v13, v4, v2
 ; CGP-NEXT:    v_mul_hi_u32 v2, v5, v2
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
@@ -784,7 +784,7 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v13, v5, v10
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; CGP-NEXT:    v_mul_hi_u32 v12, v3, v10
+; CGP-NEXT:    v_mul_hi_u32 v12, v4, v10
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v13, v2
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v12
@@ -796,42 +796,42 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
 ; CGP-NEXT:    v_mul_lo_u32 v11, v0, v2
-; CGP-NEXT:    v_mul_lo_u32 v10, v1, v10
-; CGP-NEXT:    v_mul_lo_u32 v12, v1, v2
-; CGP-NEXT:    v_mul_hi_u32 v2, v1, v2
+; CGP-NEXT:    v_mul_lo_u32 v10, v3, v10
+; CGP-NEXT:    v_mul_lo_u32 v12, v3, v2
+; CGP-NEXT:    v_mul_hi_u32 v2, v3, v2
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v10, v2
-; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v3, v12
+; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v4, v12
 ; CGP-NEXT:    v_subb_u32_e64 v10, s[4:5], v5, v2, vcc
 ; CGP-NEXT:    v_sub_i32_e64 v2, s[4:5], v5, v2
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v0
 ; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v3, v1
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v3
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
 ; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v10, v0
 ; CGP-NEXT:    v_subb_u32_e32 v2, vcc, v2, v0, vcc
 ; CGP-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[4:5]
-; CGP-NEXT:    v_sub_i32_e32 v11, vcc, v3, v1
+; CGP-NEXT:    v_sub_i32_e32 v11, vcc, v4, v3
 ; CGP-NEXT:    v_subbrev_u32_e64 v12, s[4:5], 0, v2, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v12, v0
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v1
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v3
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[4:5]
 ; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v12, v0
 ; CGP-NEXT:    v_subb_u32_e32 v0, vcc, v2, v0, vcc
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v11, v1
+; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v11, v3
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, v13, v14, s[4:5]
 ; CGP-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v11, v1, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v12, v0, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v10, v0, vcc
-; CGP-NEXT:    v_xor_b32_e32 v1, v1, v4
-; CGP-NEXT:    v_xor_b32_e32 v2, v0, v4
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v1, v4
-; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v2, v4, vcc
+; CGP-NEXT:    v_xor_b32_e32 v2, v2, v1
+; CGP-NEXT:    v_xor_b32_e32 v3, v0, v1
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v2, v1
+; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
 ; CGP-NEXT:    ; implicit-def: $vgpr4
 ; CGP-NEXT:    ; implicit-def: $vgpr10
 ; CGP-NEXT:  BB2_2: ; %Flow2
@@ -870,35 +870,35 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
 ; CGP-NEXT:    v_add_i32_e32 v3, vcc, v6, v2
 ; CGP-NEXT:    v_addc_u32_e32 v4, vcc, v7, v2, vcc
-; CGP-NEXT:    v_xor_b32_e32 v3, v3, v2
+; CGP-NEXT:    v_xor_b32_e32 v5, v3, v2
 ; CGP-NEXT:    v_xor_b32_e32 v2, v4, v2
-; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v3
-; CGP-NEXT:    v_cvt_f32_u32_e32 v5, v2
-; CGP-NEXT:    v_ashrrev_i32_e32 v6, 31, v9
-; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
+; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v5
+; CGP-NEXT:    v_cvt_f32_u32_e32 v6, v2
+; CGP-NEXT:    v_ashrrev_i32_e32 v3, 31, v9
+; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v6
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v8, v6
-; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v9, v6, vcc
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v8, v3
+; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v9, v3, vcc
 ; CGP-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; CGP-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v4
 ; CGP-NEXT:    v_trunc_f32_e32 v8, v8
 ; CGP-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v8
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; CGP-NEXT:    v_sub_i32_e32 v9, vcc, 0, v3
+; CGP-NEXT:    v_sub_i32_e32 v9, vcc, 0, v5
 ; CGP-NEXT:    v_subb_u32_e32 v10, vcc, 0, v2, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v11, v10, v4
 ; CGP-NEXT:    v_mul_lo_u32 v12, v9, v8
 ; CGP-NEXT:    v_mul_hi_u32 v14, v9, v4
 ; CGP-NEXT:    v_mul_lo_u32 v13, v9, v4
-; CGP-NEXT:    v_xor_b32_e32 v5, v5, v6
+; CGP-NEXT:    v_xor_b32_e32 v6, v6, v3
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
 ; CGP-NEXT:    v_mul_lo_u32 v12, v8, v13
 ; CGP-NEXT:    v_mul_lo_u32 v14, v4, v11
 ; CGP-NEXT:    v_mul_hi_u32 v15, v4, v13
 ; CGP-NEXT:    v_mul_hi_u32 v13, v8, v13
-; CGP-NEXT:    v_xor_b32_e32 v7, v7, v6
+; CGP-NEXT:    v_xor_b32_e32 v7, v7, v3
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
@@ -950,8 +950,8 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
 ; CGP-NEXT:    v_addc_u32_e32 v8, vcc, 0, v8, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v9, v7, v4
-; CGP-NEXT:    v_mul_lo_u32 v10, v5, v8
-; CGP-NEXT:    v_mul_hi_u32 v11, v5, v4
+; CGP-NEXT:    v_mul_lo_u32 v10, v6, v8
+; CGP-NEXT:    v_mul_hi_u32 v11, v6, v4
 ; CGP-NEXT:    v_mul_hi_u32 v4, v7, v4
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
@@ -959,7 +959,7 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v11, v7, v8
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; CGP-NEXT:    v_mul_hi_u32 v10, v5, v8
+; CGP-NEXT:    v_mul_hi_u32 v10, v6, v8
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
@@ -971,42 +971,42 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; CGP-NEXT:    v_mul_lo_u32 v9, v2, v4
-; CGP-NEXT:    v_mul_lo_u32 v8, v3, v8
-; CGP-NEXT:    v_mul_lo_u32 v10, v3, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
+; CGP-NEXT:    v_mul_lo_u32 v8, v5, v8
+; CGP-NEXT:    v_mul_lo_u32 v10, v5, v4
+; CGP-NEXT:    v_mul_hi_u32 v4, v5, v4
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
-; CGP-NEXT:    v_sub_i32_e32 v5, vcc, v5, v10
+; CGP-NEXT:    v_sub_i32_e32 v6, vcc, v6, v10
 ; CGP-NEXT:    v_subb_u32_e64 v8, s[4:5], v7, v4, vcc
 ; CGP-NEXT:    v_sub_i32_e64 v4, s[4:5], v7, v4
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v2
 ; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v3
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v5
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
 ; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v8, v2
 ; CGP-NEXT:    v_subb_u32_e32 v4, vcc, v4, v2, vcc
 ; CGP-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[4:5]
-; CGP-NEXT:    v_sub_i32_e32 v9, vcc, v5, v3
+; CGP-NEXT:    v_sub_i32_e32 v9, vcc, v6, v5
 ; CGP-NEXT:    v_subbrev_u32_e64 v10, s[4:5], 0, v4, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v2
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v3
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v5
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
 ; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v10, v2
 ; CGP-NEXT:    v_subb_u32_e32 v2, vcc, v4, v2, vcc
-; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v9, v3
+; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v9, v5
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[4:5]
 ; CGP-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
-; CGP-NEXT:    v_xor_b32_e32 v3, v3, v6
-; CGP-NEXT:    v_xor_b32_e32 v4, v2, v6
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v3, v6
-; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v4, v6, vcc
+; CGP-NEXT:    v_xor_b32_e32 v4, v4, v3
+; CGP-NEXT:    v_xor_b32_e32 v5, v2, v3
+; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v4, v3
+; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v5, v3, vcc
 ; CGP-NEXT:    ; implicit-def: $vgpr6
 ; CGP-NEXT:    ; implicit-def: $vgpr8
 ; CGP-NEXT:  BB2_6: ; %Flow
@@ -1043,32 +1043,32 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) {
 ; CHECK-LABEL: v_srem_i64_pow2k_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, 0x1000
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v3, 0x1000
 ; CHECK-NEXT:    v_cvt_f32_ubyte0_e32 v4, 0
 ; CHECK-NEXT:    s_movk_i32 s6, 0xf000
-; CHECK-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
-; CHECK-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v4
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
-; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; CHECK-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
-; CHECK-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v2
+; CHECK-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; CHECK-NEXT:    v_mac_f32_e32 v3, 0x4f800000, v4
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
+; CHECK-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
+; CHECK-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
 ; CHECK-NEXT:    v_trunc_f32_e32 v4, v4
-; CHECK-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v4
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; CHECK-NEXT:    v_mac_f32_e32 v3, 0xcf800000, v4
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v3
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v3
-; CHECK-NEXT:    v_mul_lo_u32 v5, -1, v2
+; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v2
+; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v2
+; CHECK-NEXT:    v_mul_lo_u32 v5, -1, v3
 ; CHECK-NEXT:    v_mul_lo_u32 v6, s6, v4
-; CHECK-NEXT:    v_mul_hi_u32 v8, s6, v2
-; CHECK-NEXT:    v_mul_lo_u32 v7, s6, v2
+; CHECK-NEXT:    v_mul_hi_u32 v8, s6, v3
+; CHECK-NEXT:    v_mul_lo_u32 v7, s6, v3
 ; CHECK-NEXT:    s_bfe_i32 s7, -1, 0x10000
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; CHECK-NEXT:    v_mul_lo_u32 v6, v4, v7
-; CHECK-NEXT:    v_mul_lo_u32 v8, v2, v5
-; CHECK-NEXT:    v_mul_hi_u32 v9, v2, v7
+; CHECK-NEXT:    v_mul_lo_u32 v8, v3, v5
+; CHECK-NEXT:    v_mul_hi_u32 v9, v3, v7
 ; CHECK-NEXT:    v_mul_hi_u32 v7, v4, v7
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
@@ -1076,7 +1076,7 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v9, v4, v5
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; CHECK-NEXT:    v_mul_hi_u32 v8, v2, v5
+; CHECK-NEXT:    v_mul_hi_u32 v8, v3, v5
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
@@ -1087,18 +1087,18 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
 ; CHECK-NEXT:    v_addc_u32_e64 v6, s[4:5], v4, v5, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v7, -1, v2
+; CHECK-NEXT:    v_mul_lo_u32 v7, -1, v3
 ; CHECK-NEXT:    v_mul_lo_u32 v8, s6, v6
-; CHECK-NEXT:    v_mul_hi_u32 v10, s6, v2
-; CHECK-NEXT:    v_mul_lo_u32 v9, s6, v2
+; CHECK-NEXT:    v_mul_hi_u32 v10, s6, v3
+; CHECK-NEXT:    v_mul_lo_u32 v9, s6, v3
 ; CHECK-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v5
 ; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v8
 ; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v10
 ; CHECK-NEXT:    v_mul_lo_u32 v8, v6, v9
-; CHECK-NEXT:    v_mul_lo_u32 v10, v2, v7
-; CHECK-NEXT:    v_mul_hi_u32 v5, v2, v9
+; CHECK-NEXT:    v_mul_lo_u32 v10, v3, v7
+; CHECK-NEXT:    v_mul_hi_u32 v5, v3, v9
 ; CHECK-NEXT:    v_mul_hi_u32 v9, v6, v9
 ; CHECK-NEXT:    s_movk_i32 s6, 0x1000
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v10
@@ -1107,7 +1107,7 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_mul_lo_u32 v8, v6, v7
 ; CHECK-NEXT:    v_add_i32_e64 v5, s[4:5], v10, v5
-; CHECK-NEXT:    v_mul_hi_u32 v10, v2, v7
+; CHECK-NEXT:    v_mul_hi_u32 v10, v3, v7
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v10
@@ -1119,12 +1119,12 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v9, v8
 ; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v7
 ; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
 ; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v5, v1, v2
+; CHECK-NEXT:    v_mul_lo_u32 v5, v1, v3
 ; CHECK-NEXT:    v_mul_lo_u32 v6, v0, v4
-; CHECK-NEXT:    v_mul_hi_u32 v7, v0, v2
-; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
+; CHECK-NEXT:    v_mul_hi_u32 v7, v0, v3
+; CHECK-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
@@ -1132,31 +1132,31 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT:    v_mul_lo_u32 v7, v1, v4
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; CHECK-NEXT:    v_mul_hi_u32 v6, v0, v4
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v7, v3
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; CHECK-NEXT:    v_mul_hi_u32 v4, v1, v4
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
-; CHECK-NEXT:    v_mul_lo_u32 v5, 0, v2
+; CHECK-NEXT:    v_mul_lo_u32 v5, 0, v3
 ; CHECK-NEXT:    v_mul_lo_u32 v4, s6, v4
-; CHECK-NEXT:    v_mul_lo_u32 v6, s6, v2
-; CHECK-NEXT:    v_mul_hi_u32 v2, s6, v2
+; CHECK-NEXT:    v_mul_lo_u32 v6, s6, v3
+; CHECK-NEXT:    v_mul_hi_u32 v3, s6, v3
 ; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
-; CHECK-NEXT:    v_subb_u32_e64 v4, s[4:5], v1, v2, vcc
-; CHECK-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v2
+; CHECK-NEXT:    v_subb_u32_e64 v4, s[4:5], v1, v3, vcc
+; CHECK-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v3
 ; CHECK-NEXT:    v_cmp_le_u32_e64 s[4:5], s6, v0
-; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[4:5]
+; CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[4:5]
 ; CHECK-NEXT:    v_mov_b32_e32 v5, s7
 ; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
 ; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[4:5]
+; CHECK-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[4:5]
 ; CHECK-NEXT:    v_subrev_i32_e32 v5, vcc, s6, v0
 ; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
 ; CHECK-NEXT:    s_bfe_i32 s4, -1, 0x10000
@@ -1170,13 +1170,13 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
 ; CHECK-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v3
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v3
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
-; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v2
+; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v2
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
+; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = srem i64 %num, 4096
   ret i64 %result
@@ -1195,73 +1195,73 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    s_mov_b32 s7, s6
 ; GISEL-NEXT:    s_addc_u32 s5, 0, 0
 ; GISEL-NEXT:    s_xor_b64 s[8:9], s[4:5], s[6:7]
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, s8
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s9
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s8
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, s9
 ; GISEL-NEXT:    s_sub_u32 s11, 0, s8
 ; GISEL-NEXT:    s_cselect_b32 s4, 1, 0
 ; GISEL-NEXT:    s_and_b32 s4, s4, 1
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
 ; GISEL-NEXT:    s_cmp_lg_u32 s4, 0
 ; GISEL-NEXT:    s_subb_u32 s12, 0, s9
-; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
-; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT:    v_trunc_f32_e32 v5, v5
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
+; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
+; GISEL-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
+; GISEL-NEXT:    v_trunc_f32_e32 v6, v6
+; GISEL-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v6
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT:    v_mul_lo_u32 v7, s12, v4
-; GISEL-NEXT:    v_mul_lo_u32 v8, s11, v5
-; GISEL-NEXT:    v_mul_hi_u32 v10, s11, v4
-; GISEL-NEXT:    v_mul_lo_u32 v9, s11, v4
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT:    v_mul_lo_u32 v7, s12, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, s11, v6
+; GISEL-NEXT:    v_mul_hi_u32 v10, s11, v5
+; GISEL-NEXT:    v_mul_lo_u32 v9, s11, v5
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
-; GISEL-NEXT:    v_mul_lo_u32 v8, v5, v9
-; GISEL-NEXT:    v_mul_lo_u32 v10, v4, v7
-; GISEL-NEXT:    v_mul_hi_u32 v11, v4, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v5, v9
-; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v6
+; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v9
+; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v7
+; GISEL-NEXT:    v_mul_hi_u32 v11, v5, v9
+; GISEL-NEXT:    v_mul_hi_u32 v9, v6, v9
+; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v7
+; GISEL-NEXT:    v_mul_lo_u32 v11, v6, v7
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
-; GISEL-NEXT:    v_mul_hi_u32 v10, v4, v7
+; GISEL-NEXT:    v_mul_hi_u32 v10, v5, v7
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v7
+; GISEL-NEXT:    v_mul_hi_u32 v7, v6, v7
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT:    v_addc_u32_e64 v8, s[4:5], v5, v7, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v9, s12, v4
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT:    v_addc_u32_e64 v8, s[4:5], v6, v7, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v9, s12, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v10, s11, v8
-; GISEL-NEXT:    v_mul_hi_u32 v12, s11, v4
-; GISEL-NEXT:    v_mul_lo_u32 v11, s11, v4
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v7
+; GISEL-NEXT:    v_mul_hi_u32 v12, s11, v5
+; GISEL-NEXT:    v_mul_lo_u32 v11, s11, v5
+; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v7
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v11
-; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v9
-; GISEL-NEXT:    v_mul_hi_u32 v7, v4, v11
+; GISEL-NEXT:    v_mul_lo_u32 v12, v5, v9
+; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v11
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v8, v11
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v4
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v9
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v12, v7
-; GISEL-NEXT:    v_mul_hi_u32 v12, v4, v9
+; GISEL-NEXT:    v_mul_hi_u32 v12, v5, v9
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
@@ -1272,47 +1272,47 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v10
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v8, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v7, v1, v4
-; GISEL-NEXT:    v_mul_lo_u32 v8, v0, v5
-; GISEL-NEXT:    v_mul_hi_u32 v10, v0, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, v1, v4
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v8, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, v1, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, v0, v6
+; GISEL-NEXT:    v_mul_hi_u32 v10, v0, v5
+; GISEL-NEXT:    v_mul_hi_u32 v5, v1, v5
 ; GISEL-NEXT:    v_mov_b32_e32 v9, s9
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, v1, v5
+; GISEL-NEXT:    v_mul_lo_u32 v10, v1, v6
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT:    v_mul_hi_u32 v8, v0, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
+; GISEL-NEXT:    v_mul_hi_u32 v8, v0, v6
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
-; GISEL-NEXT:    v_mul_hi_u32 v5, v1, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
+; GISEL-NEXT:    v_mul_hi_u32 v6, v1, v6
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; GISEL-NEXT:    v_mul_lo_u32 v7, s9, v4
-; GISEL-NEXT:    v_mul_lo_u32 v5, s8, v5
-; GISEL-NEXT:    v_mul_lo_u32 v8, s8, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, s8, v4
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT:    v_mul_lo_u32 v7, s9, v5
+; GISEL-NEXT:    v_mul_lo_u32 v6, s8, v6
+; GISEL-NEXT:    v_mul_lo_u32 v8, s8, v5
+; GISEL-NEXT:    v_mul_hi_u32 v5, s8, v5
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
-; GISEL-NEXT:    v_subb_u32_e64 v5, s[4:5], v1, v4, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v4
-; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s9, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[4:5]
+; GISEL-NEXT:    v_subb_u32_e64 v6, s[4:5], v1, v5, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v5
+; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s9, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s8, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], s9, v5
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], s9, v6
 ; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v9, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, v7, s[4:5]
 ; GISEL-NEXT:    v_subrev_i32_e32 v7, vcc, s8, v0
 ; GISEL-NEXT:    v_subbrev_u32_e64 v8, s[4:5], 0, v1, vcc
 ; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s9, v8
@@ -1332,81 +1332,81 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    s_addc_u32 s5, 0, 0
 ; GISEL-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; GISEL-NEXT:    s_xor_b64 s[6:7], s[4:5], s[6:7]
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, s6
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s7
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s6
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, s7
 ; GISEL-NEXT:    s_sub_u32 s8, 0, s6
 ; GISEL-NEXT:    s_cselect_b32 s4, 1, 0
 ; GISEL-NEXT:    s_and_b32 s4, s4, 1
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
 ; GISEL-NEXT:    s_cmp_lg_u32 s4, 0
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
 ; GISEL-NEXT:    s_subb_u32 s9, 0, s7
-; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT:    v_trunc_f32_e32 v5, v5
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
+; GISEL-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
+; GISEL-NEXT:    v_trunc_f32_e32 v6, v6
+; GISEL-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v6
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v6
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
-; GISEL-NEXT:    v_mul_lo_u32 v7, s9, v4
-; GISEL-NEXT:    v_mul_lo_u32 v8, s8, v5
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT:    v_mul_hi_u32 v10, s8, v4
-; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v6, vcc
-; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v3
-; GISEL-NEXT:    v_mul_lo_u32 v9, s8, v4
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
-; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v6, vcc
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v4
+; GISEL-NEXT:    v_mul_lo_u32 v7, s9, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, s8, v6
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT:    v_mul_hi_u32 v10, s8, v5
+; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
+; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; GISEL-NEXT:    v_mul_lo_u32 v9, s8, v5
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
-; GISEL-NEXT:    v_mul_lo_u32 v8, v5, v9
-; GISEL-NEXT:    v_mul_lo_u32 v10, v4, v7
-; GISEL-NEXT:    v_mul_hi_u32 v11, v4, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v5, v9
-; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v6
+; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v9
+; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v7
+; GISEL-NEXT:    v_mul_hi_u32 v11, v5, v9
+; GISEL-NEXT:    v_mul_hi_u32 v9, v6, v9
+; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v4
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v7
+; GISEL-NEXT:    v_mul_lo_u32 v11, v6, v7
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
-; GISEL-NEXT:    v_mul_hi_u32 v10, v4, v7
+; GISEL-NEXT:    v_mul_hi_u32 v10, v5, v7
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v7
+; GISEL-NEXT:    v_mul_hi_u32 v7, v6, v7
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT:    v_addc_u32_e64 v8, s[4:5], v5, v7, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v9, s9, v4
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT:    v_addc_u32_e64 v8, s[4:5], v6, v7, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v9, s9, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v10, s8, v8
-; GISEL-NEXT:    v_mul_hi_u32 v12, s8, v4
-; GISEL-NEXT:    v_mul_lo_u32 v11, s8, v4
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v7
+; GISEL-NEXT:    v_mul_hi_u32 v12, s8, v5
+; GISEL-NEXT:    v_mul_lo_u32 v11, s8, v5
+; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v7
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v11
-; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v9
-; GISEL-NEXT:    v_mul_hi_u32 v7, v4, v11
+; GISEL-NEXT:    v_mul_lo_u32 v12, v5, v9
+; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v11
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v8, v11
-; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v6
+; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v4
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v9
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v12, v7
-; GISEL-NEXT:    v_mul_hi_u32 v12, v4, v9
+; GISEL-NEXT:    v_mul_hi_u32 v12, v5, v9
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
@@ -1417,47 +1417,47 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v10
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v8, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v7, v3, v4
-; GISEL-NEXT:    v_mul_lo_u32 v8, v2, v5
-; GISEL-NEXT:    v_mul_hi_u32 v10, v2, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, v3, v4
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v8, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, v3, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, v2, v6
+; GISEL-NEXT:    v_mul_hi_u32 v10, v2, v5
+; GISEL-NEXT:    v_mul_hi_u32 v5, v3, v5
 ; GISEL-NEXT:    v_mov_b32_e32 v9, s7
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, v3, v5
+; GISEL-NEXT:    v_mul_lo_u32 v10, v3, v6
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT:    v_mul_hi_u32 v8, v2, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
+; GISEL-NEXT:    v_mul_hi_u32 v8, v2, v6
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
-; GISEL-NEXT:    v_mul_hi_u32 v5, v3, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
+; GISEL-NEXT:    v_mul_hi_u32 v6, v3, v6
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; GISEL-NEXT:    v_mul_lo_u32 v7, s7, v4
-; GISEL-NEXT:    v_mul_lo_u32 v5, s6, v5
-; GISEL-NEXT:    v_mul_lo_u32 v8, s6, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, s6, v4
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT:    v_mul_lo_u32 v7, s7, v5
+; GISEL-NEXT:    v_mul_lo_u32 v6, s6, v6
+; GISEL-NEXT:    v_mul_lo_u32 v8, s6, v5
+; GISEL-NEXT:    v_mul_hi_u32 v5, s6, v5
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v8
-; GISEL-NEXT:    v_subb_u32_e64 v5, s[4:5], v3, v4, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v4
-; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s7, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[4:5]
+; GISEL-NEXT:    v_subb_u32_e64 v6, s[4:5], v3, v5, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v5
+; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s7, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s6, v2
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], s7, v5
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], s7, v6
 ; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v9, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, v7, s[4:5]
 ; GISEL-NEXT:    v_subrev_i32_e32 v7, vcc, s6, v2
 ; GISEL-NEXT:    v_subbrev_u32_e64 v8, s[4:5], 0, v3, vcc
 ; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s7, v8
@@ -1472,35 +1472,35 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
 ; GISEL-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v6
-; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v6
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v6, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v4
+; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v4
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v4
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_srem_v2i64_pow2k_denom:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    v_cvt_f32_u32_e32 v4, 0x1000
+; CGP-NEXT:    v_cvt_f32_u32_e32 v5, 0x1000
 ; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v6, 0
 ; CGP-NEXT:    s_movk_i32 s6, 0xf000
-; CGP-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
-; CGP-NEXT:    v_mov_b32_e32 v7, v4
+; CGP-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
+; CGP-NEXT:    v_mov_b32_e32 v7, v5
 ; CGP-NEXT:    v_mac_f32_e32 v7, 0x4f800000, v6
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v7, v7
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v5
-; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v5, vcc
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
 ; CGP-NEXT:    v_mul_f32_e32 v7, 0x5f7ffffc, v7
 ; CGP-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v7
 ; CGP-NEXT:    v_trunc_f32_e32 v8, v8
 ; CGP-NEXT:    v_mac_f32_e32 v7, 0xcf800000, v8
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; CGP-NEXT:    v_xor_b32_e32 v0, v0, v5
-; CGP-NEXT:    v_xor_b32_e32 v1, v1, v5
+; CGP-NEXT:    v_xor_b32_e32 v0, v0, v4
+; CGP-NEXT:    v_xor_b32_e32 v1, v1, v4
 ; CGP-NEXT:    v_mul_lo_u32 v9, -1, v7
 ; CGP-NEXT:    v_mul_lo_u32 v10, s6, v8
 ; CGP-NEXT:    v_mul_hi_u32 v12, s6, v7
@@ -1543,7 +1543,7 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v14, v7, v11
 ; CGP-NEXT:    v_mul_hi_u32 v9, v7, v13
 ; CGP-NEXT:    v_mul_hi_u32 v13, v10, v13
-; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v6
+; CGP-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v12, v9
@@ -1568,7 +1568,7 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v10, v0, v8
 ; CGP-NEXT:    v_mul_hi_u32 v11, v0, v7
 ; CGP-NEXT:    v_mul_hi_u32 v7, v1, v7
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v5, v5
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
@@ -1590,7 +1590,7 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v8, s7, v8
 ; CGP-NEXT:    v_mul_lo_u32 v10, s7, v7
 ; CGP-NEXT:    v_mul_hi_u32 v7, s7, v7
-; CGP-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; CGP-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v10
@@ -1616,34 +1616,34 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v12, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; CGP-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v4
+; CGP-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v5
 ; CGP-NEXT:    v_trunc_f32_e32 v7, v7
-; CGP-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v7
-; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; CGP-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v7
+; CGP-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v8, -1, v4
+; CGP-NEXT:    v_mul_lo_u32 v8, -1, v5
 ; CGP-NEXT:    v_mul_lo_u32 v9, s6, v7
-; CGP-NEXT:    v_mul_hi_u32 v11, s6, v4
+; CGP-NEXT:    v_mul_hi_u32 v11, s6, v5
 ; CGP-NEXT:    v_ashrrev_i32_e32 v6, 31, v3
-; CGP-NEXT:    v_mul_lo_u32 v10, s6, v4
+; CGP-NEXT:    v_mul_lo_u32 v10, s6, v5
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
 ; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v6, vcc
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
 ; CGP-NEXT:    v_mul_lo_u32 v9, v7, v10
-; CGP-NEXT:    v_mul_lo_u32 v11, v4, v8
-; CGP-NEXT:    v_mul_hi_u32 v12, v4, v10
+; CGP-NEXT:    v_mul_lo_u32 v11, v5, v8
+; CGP-NEXT:    v_mul_hi_u32 v12, v5, v10
 ; CGP-NEXT:    v_mul_hi_u32 v10, v7, v10
-; CGP-NEXT:    v_xor_b32_e32 v0, v0, v5
+; CGP-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v12, v7, v8
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
-; CGP-NEXT:    v_mul_hi_u32 v11, v4, v8
+; CGP-NEXT:    v_mul_hi_u32 v11, v5, v8
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
@@ -1654,18 +1654,18 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
 ; CGP-NEXT:    v_addc_u32_e64 v9, s[4:5], v7, v8, vcc
-; CGP-NEXT:    v_mul_lo_u32 v10, -1, v4
+; CGP-NEXT:    v_mul_lo_u32 v10, -1, v5
 ; CGP-NEXT:    v_mul_lo_u32 v11, s6, v9
-; CGP-NEXT:    v_mul_hi_u32 v13, s6, v4
-; CGP-NEXT:    v_mul_lo_u32 v12, s6, v4
+; CGP-NEXT:    v_mul_hi_u32 v13, s6, v5
+; CGP-NEXT:    v_mul_lo_u32 v12, s6, v5
 ; CGP-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v8
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
 ; CGP-NEXT:    v_mul_lo_u32 v11, v9, v12
-; CGP-NEXT:    v_mul_lo_u32 v13, v4, v10
-; CGP-NEXT:    v_mul_hi_u32 v8, v4, v12
+; CGP-NEXT:    v_mul_lo_u32 v13, v5, v10
+; CGP-NEXT:    v_mul_hi_u32 v8, v5, v12
 ; CGP-NEXT:    v_mul_hi_u32 v12, v9, v12
 ; CGP-NEXT:    v_xor_b32_e32 v2, v2, v6
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
@@ -1674,7 +1674,7 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
 ; CGP-NEXT:    v_mul_lo_u32 v11, v9, v10
 ; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v13, v8
-; CGP-NEXT:    v_mul_hi_u32 v13, v4, v10
+; CGP-NEXT:    v_mul_hi_u32 v13, v5, v10
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
@@ -1686,30 +1686,30 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v12, v11
 ; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
 ; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v9, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; CGP-NEXT:    v_xor_b32_e32 v3, v3, v6
 ; CGP-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; CGP-NEXT:    v_xor_b32_e32 v1, v1, v5
-; CGP-NEXT:    v_mul_lo_u32 v8, v3, v4
+; CGP-NEXT:    v_xor_b32_e32 v1, v1, v4
+; CGP-NEXT:    v_mul_lo_u32 v8, v3, v5
 ; CGP-NEXT:    v_mul_lo_u32 v9, v2, v7
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
-; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
-; CGP-NEXT:    v_mul_hi_u32 v5, v2, v4
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
+; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
+; CGP-NEXT:    v_mul_hi_u32 v4, v2, v5
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
+; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v8, v3, v7
-; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
+; CGP-NEXT:    v_mul_hi_u32 v5, v3, v5
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
 ; CGP-NEXT:    v_mul_hi_u32 v9, v2, v7
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
 ; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; CGP-NEXT:    v_mul_hi_u32 v7, v3, v7
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
@@ -1758,32 +1758,32 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) {
 ; CHECK-LABEL: v_srem_i64_oddk_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, 0x12d8fb
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v3, 0x12d8fb
 ; CHECK-NEXT:    v_cvt_f32_ubyte0_e32 v4, 0
 ; CHECK-NEXT:    s_mov_b32 s6, 0xffed2705
-; CHECK-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
-; CHECK-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v4
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
-; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; CHECK-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
-; CHECK-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v2
+; CHECK-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; CHECK-NEXT:    v_mac_f32_e32 v3, 0x4f800000, v4
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
+; CHECK-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
+; CHECK-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
 ; CHECK-NEXT:    v_trunc_f32_e32 v4, v4
-; CHECK-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v4
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; CHECK-NEXT:    v_mac_f32_e32 v3, 0xcf800000, v4
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v3
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v3
-; CHECK-NEXT:    v_mul_lo_u32 v5, -1, v2
+; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v2
+; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v2
+; CHECK-NEXT:    v_mul_lo_u32 v5, -1, v3
 ; CHECK-NEXT:    v_mul_lo_u32 v6, s6, v4
-; CHECK-NEXT:    v_mul_hi_u32 v8, s6, v2
-; CHECK-NEXT:    v_mul_lo_u32 v7, s6, v2
+; CHECK-NEXT:    v_mul_hi_u32 v8, s6, v3
+; CHECK-NEXT:    v_mul_lo_u32 v7, s6, v3
 ; CHECK-NEXT:    s_bfe_i32 s7, -1, 0x10000
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; CHECK-NEXT:    v_mul_lo_u32 v6, v4, v7
-; CHECK-NEXT:    v_mul_lo_u32 v8, v2, v5
-; CHECK-NEXT:    v_mul_hi_u32 v9, v2, v7
+; CHECK-NEXT:    v_mul_lo_u32 v8, v3, v5
+; CHECK-NEXT:    v_mul_hi_u32 v9, v3, v7
 ; CHECK-NEXT:    v_mul_hi_u32 v7, v4, v7
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
@@ -1791,7 +1791,7 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v9, v4, v5
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; CHECK-NEXT:    v_mul_hi_u32 v8, v2, v5
+; CHECK-NEXT:    v_mul_hi_u32 v8, v3, v5
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
@@ -1802,18 +1802,18 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
 ; CHECK-NEXT:    v_addc_u32_e64 v6, s[4:5], v4, v5, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v7, -1, v2
+; CHECK-NEXT:    v_mul_lo_u32 v7, -1, v3
 ; CHECK-NEXT:    v_mul_lo_u32 v8, s6, v6
-; CHECK-NEXT:    v_mul_hi_u32 v10, s6, v2
-; CHECK-NEXT:    v_mul_lo_u32 v9, s6, v2
+; CHECK-NEXT:    v_mul_hi_u32 v10, s6, v3
+; CHECK-NEXT:    v_mul_lo_u32 v9, s6, v3
 ; CHECK-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v5
 ; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v8
 ; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v10
 ; CHECK-NEXT:    v_mul_lo_u32 v8, v6, v9
-; CHECK-NEXT:    v_mul_lo_u32 v10, v2, v7
-; CHECK-NEXT:    v_mul_hi_u32 v5, v2, v9
+; CHECK-NEXT:    v_mul_lo_u32 v10, v3, v7
+; CHECK-NEXT:    v_mul_hi_u32 v5, v3, v9
 ; CHECK-NEXT:    v_mul_hi_u32 v9, v6, v9
 ; CHECK-NEXT:    s_mov_b32 s6, 0x12d8fb
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v10
@@ -1822,7 +1822,7 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_mul_lo_u32 v8, v6, v7
 ; CHECK-NEXT:    v_add_i32_e64 v5, s[4:5], v10, v5
-; CHECK-NEXT:    v_mul_hi_u32 v10, v2, v7
+; CHECK-NEXT:    v_mul_hi_u32 v10, v3, v7
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v10
@@ -1834,12 +1834,12 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v9, v8
 ; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v7
 ; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
 ; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v5, v1, v2
+; CHECK-NEXT:    v_mul_lo_u32 v5, v1, v3
 ; CHECK-NEXT:    v_mul_lo_u32 v6, v0, v4
-; CHECK-NEXT:    v_mul_hi_u32 v7, v0, v2
-; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
+; CHECK-NEXT:    v_mul_hi_u32 v7, v0, v3
+; CHECK-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
@@ -1847,31 +1847,31 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_mul_lo_u32 v7, v1, v4
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; CHECK-NEXT:    v_mul_hi_u32 v6, v0, v4
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v7, v3
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; CHECK-NEXT:    v_mul_hi_u32 v4, v1, v4
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
-; CHECK-NEXT:    v_mul_lo_u32 v5, 0, v2
+; CHECK-NEXT:    v_mul_lo_u32 v5, 0, v3
 ; CHECK-NEXT:    v_mul_lo_u32 v4, s6, v4
-; CHECK-NEXT:    v_mul_lo_u32 v6, s6, v2
-; CHECK-NEXT:    v_mul_hi_u32 v2, s6, v2
+; CHECK-NEXT:    v_mul_lo_u32 v6, s6, v3
+; CHECK-NEXT:    v_mul_hi_u32 v3, s6, v3
 ; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
-; CHECK-NEXT:    v_subb_u32_e64 v4, s[4:5], v1, v2, vcc
-; CHECK-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v2
+; CHECK-NEXT:    v_subb_u32_e64 v4, s[4:5], v1, v3, vcc
+; CHECK-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v3
 ; CHECK-NEXT:    v_cmp_le_u32_e64 s[4:5], s6, v0
-; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[4:5]
+; CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[4:5]
 ; CHECK-NEXT:    v_mov_b32_e32 v5, s7
 ; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
 ; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[4:5]
+; CHECK-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[4:5]
 ; CHECK-NEXT:    v_subrev_i32_e32 v5, vcc, s6, v0
 ; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
 ; CHECK-NEXT:    s_bfe_i32 s4, -1, 0x10000
@@ -1885,13 +1885,13 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
 ; CHECK-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v3
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v3
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
-; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v2
+; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v2
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
+; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = srem i64 %num, 1235195
   ret i64 %result
@@ -1910,73 +1910,73 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    s_mov_b32 s7, s6
 ; GISEL-NEXT:    s_addc_u32 s5, 0, 0
 ; GISEL-NEXT:    s_xor_b64 s[8:9], s[4:5], s[6:7]
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, s8
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s9
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s8
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, s9
 ; GISEL-NEXT:    s_sub_u32 s11, 0, s8
 ; GISEL-NEXT:    s_cselect_b32 s4, 1, 0
 ; GISEL-NEXT:    s_and_b32 s4, s4, 1
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
 ; GISEL-NEXT:    s_cmp_lg_u32 s4, 0
 ; GISEL-NEXT:    s_subb_u32 s12, 0, s9
-; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
-; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT:    v_trunc_f32_e32 v5, v5
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
+; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
+; GISEL-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
+; GISEL-NEXT:    v_trunc_f32_e32 v6, v6
+; GISEL-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v6
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT:    v_mul_lo_u32 v7, s12, v4
-; GISEL-NEXT:    v_mul_lo_u32 v8, s11, v5
-; GISEL-NEXT:    v_mul_hi_u32 v10, s11, v4
-; GISEL-NEXT:    v_mul_lo_u32 v9, s11, v4
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT:    v_mul_lo_u32 v7, s12, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, s11, v6
+; GISEL-NEXT:    v_mul_hi_u32 v10, s11, v5
+; GISEL-NEXT:    v_mul_lo_u32 v9, s11, v5
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
-; GISEL-NEXT:    v_mul_lo_u32 v8, v5, v9
-; GISEL-NEXT:    v_mul_lo_u32 v10, v4, v7
-; GISEL-NEXT:    v_mul_hi_u32 v11, v4, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v5, v9
-; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v6
+; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v9
+; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v7
+; GISEL-NEXT:    v_mul_hi_u32 v11, v5, v9
+; GISEL-NEXT:    v_mul_hi_u32 v9, v6, v9
+; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v7
+; GISEL-NEXT:    v_mul_lo_u32 v11, v6, v7
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
-; GISEL-NEXT:    v_mul_hi_u32 v10, v4, v7
+; GISEL-NEXT:    v_mul_hi_u32 v10, v5, v7
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v7
+; GISEL-NEXT:    v_mul_hi_u32 v7, v6, v7
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT:    v_addc_u32_e64 v8, s[4:5], v5, v7, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v9, s12, v4
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT:    v_addc_u32_e64 v8, s[4:5], v6, v7, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v9, s12, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v10, s11, v8
-; GISEL-NEXT:    v_mul_hi_u32 v12, s11, v4
-; GISEL-NEXT:    v_mul_lo_u32 v11, s11, v4
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v7
+; GISEL-NEXT:    v_mul_hi_u32 v12, s11, v5
+; GISEL-NEXT:    v_mul_lo_u32 v11, s11, v5
+; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v7
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v11
-; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v9
-; GISEL-NEXT:    v_mul_hi_u32 v7, v4, v11
+; GISEL-NEXT:    v_mul_lo_u32 v12, v5, v9
+; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v11
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v8, v11
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v4
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v9
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v12, v7
-; GISEL-NEXT:    v_mul_hi_u32 v12, v4, v9
+; GISEL-NEXT:    v_mul_hi_u32 v12, v5, v9
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
@@ -1987,47 +1987,47 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v10
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v8, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v7, v1, v4
-; GISEL-NEXT:    v_mul_lo_u32 v8, v0, v5
-; GISEL-NEXT:    v_mul_hi_u32 v10, v0, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, v1, v4
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v8, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, v1, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, v0, v6
+; GISEL-NEXT:    v_mul_hi_u32 v10, v0, v5
+; GISEL-NEXT:    v_mul_hi_u32 v5, v1, v5
 ; GISEL-NEXT:    v_mov_b32_e32 v9, s9
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, v1, v5
+; GISEL-NEXT:    v_mul_lo_u32 v10, v1, v6
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT:    v_mul_hi_u32 v8, v0, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
+; GISEL-NEXT:    v_mul_hi_u32 v8, v0, v6
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
-; GISEL-NEXT:    v_mul_hi_u32 v5, v1, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
+; GISEL-NEXT:    v_mul_hi_u32 v6, v1, v6
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; GISEL-NEXT:    v_mul_lo_u32 v7, s9, v4
-; GISEL-NEXT:    v_mul_lo_u32 v5, s8, v5
-; GISEL-NEXT:    v_mul_lo_u32 v8, s8, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, s8, v4
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT:    v_mul_lo_u32 v7, s9, v5
+; GISEL-NEXT:    v_mul_lo_u32 v6, s8, v6
+; GISEL-NEXT:    v_mul_lo_u32 v8, s8, v5
+; GISEL-NEXT:    v_mul_hi_u32 v5, s8, v5
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
-; GISEL-NEXT:    v_subb_u32_e64 v5, s[4:5], v1, v4, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v4
-; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s9, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[4:5]
+; GISEL-NEXT:    v_subb_u32_e64 v6, s[4:5], v1, v5, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v5
+; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s9, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s8, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], s9, v5
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], s9, v6
 ; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v9, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, v7, s[4:5]
 ; GISEL-NEXT:    v_subrev_i32_e32 v7, vcc, s8, v0
 ; GISEL-NEXT:    v_subbrev_u32_e64 v8, s[4:5], 0, v1, vcc
 ; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s9, v8
@@ -2047,81 +2047,81 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    s_addc_u32 s5, 0, 0
 ; GISEL-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; GISEL-NEXT:    s_xor_b64 s[6:7], s[4:5], s[6:7]
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, s6
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s7
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s6
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, s7
 ; GISEL-NEXT:    s_sub_u32 s8, 0, s6
 ; GISEL-NEXT:    s_cselect_b32 s4, 1, 0
 ; GISEL-NEXT:    s_and_b32 s4, s4, 1
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
 ; GISEL-NEXT:    s_cmp_lg_u32 s4, 0
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
 ; GISEL-NEXT:    s_subb_u32 s9, 0, s7
-; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT:    v_trunc_f32_e32 v5, v5
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
+; GISEL-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
+; GISEL-NEXT:    v_trunc_f32_e32 v6, v6
+; GISEL-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v6
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v6
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
-; GISEL-NEXT:    v_mul_lo_u32 v7, s9, v4
-; GISEL-NEXT:    v_mul_lo_u32 v8, s8, v5
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT:    v_mul_hi_u32 v10, s8, v4
-; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v6, vcc
-; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v3
-; GISEL-NEXT:    v_mul_lo_u32 v9, s8, v4
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
-; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v6, vcc
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v4
+; GISEL-NEXT:    v_mul_lo_u32 v7, s9, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, s8, v6
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT:    v_mul_hi_u32 v10, s8, v5
+; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
+; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; GISEL-NEXT:    v_mul_lo_u32 v9, s8, v5
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
-; GISEL-NEXT:    v_mul_lo_u32 v8, v5, v9
-; GISEL-NEXT:    v_mul_lo_u32 v10, v4, v7
-; GISEL-NEXT:    v_mul_hi_u32 v11, v4, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v5, v9
-; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v6
+; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v9
+; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v7
+; GISEL-NEXT:    v_mul_hi_u32 v11, v5, v9
+; GISEL-NEXT:    v_mul_hi_u32 v9, v6, v9
+; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v4
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v7
+; GISEL-NEXT:    v_mul_lo_u32 v11, v6, v7
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
-; GISEL-NEXT:    v_mul_hi_u32 v10, v4, v7
+; GISEL-NEXT:    v_mul_hi_u32 v10, v5, v7
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v7
+; GISEL-NEXT:    v_mul_hi_u32 v7, v6, v7
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT:    v_addc_u32_e64 v8, s[4:5], v5, v7, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v9, s9, v4
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT:    v_addc_u32_e64 v8, s[4:5], v6, v7, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v9, s9, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v10, s8, v8
-; GISEL-NEXT:    v_mul_hi_u32 v12, s8, v4
-; GISEL-NEXT:    v_mul_lo_u32 v11, s8, v4
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v7
+; GISEL-NEXT:    v_mul_hi_u32 v12, s8, v5
+; GISEL-NEXT:    v_mul_lo_u32 v11, s8, v5
+; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v7
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v11
-; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v9
-; GISEL-NEXT:    v_mul_hi_u32 v7, v4, v11
+; GISEL-NEXT:    v_mul_lo_u32 v12, v5, v9
+; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v11
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v8, v11
-; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v6
+; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v4
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v9
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v12, v7
-; GISEL-NEXT:    v_mul_hi_u32 v12, v4, v9
+; GISEL-NEXT:    v_mul_hi_u32 v12, v5, v9
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
@@ -2132,47 +2132,47 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v10
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v8, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v7, v3, v4
-; GISEL-NEXT:    v_mul_lo_u32 v8, v2, v5
-; GISEL-NEXT:    v_mul_hi_u32 v10, v2, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, v3, v4
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v8, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, v3, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, v2, v6
+; GISEL-NEXT:    v_mul_hi_u32 v10, v2, v5
+; GISEL-NEXT:    v_mul_hi_u32 v5, v3, v5
 ; GISEL-NEXT:    v_mov_b32_e32 v9, s7
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, v3, v5
+; GISEL-NEXT:    v_mul_lo_u32 v10, v3, v6
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT:    v_mul_hi_u32 v8, v2, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
+; GISEL-NEXT:    v_mul_hi_u32 v8, v2, v6
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
-; GISEL-NEXT:    v_mul_hi_u32 v5, v3, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
+; GISEL-NEXT:    v_mul_hi_u32 v6, v3, v6
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; GISEL-NEXT:    v_mul_lo_u32 v7, s7, v4
-; GISEL-NEXT:    v_mul_lo_u32 v5, s6, v5
-; GISEL-NEXT:    v_mul_lo_u32 v8, s6, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, s6, v4
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT:    v_mul_lo_u32 v7, s7, v5
+; GISEL-NEXT:    v_mul_lo_u32 v6, s6, v6
+; GISEL-NEXT:    v_mul_lo_u32 v8, s6, v5
+; GISEL-NEXT:    v_mul_hi_u32 v5, s6, v5
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v8
-; GISEL-NEXT:    v_subb_u32_e64 v5, s[4:5], v3, v4, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v4
-; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s7, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[4:5]
+; GISEL-NEXT:    v_subb_u32_e64 v6, s[4:5], v3, v5, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v5
+; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s7, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s6, v2
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], s7, v5
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], s7, v6
 ; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v9, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, v7, s[4:5]
 ; GISEL-NEXT:    v_subrev_i32_e32 v7, vcc, s6, v2
 ; GISEL-NEXT:    v_subbrev_u32_e64 v8, s[4:5], 0, v3, vcc
 ; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s7, v8
@@ -2187,35 +2187,35 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
 ; GISEL-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v6
-; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v6
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v6, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v4
+; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v4
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v4
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_srem_v2i64_oddk_denom:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    v_cvt_f32_u32_e32 v4, 0x12d8fb
+; CGP-NEXT:    v_cvt_f32_u32_e32 v5, 0x12d8fb
 ; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v6, 0
 ; CGP-NEXT:    s_mov_b32 s6, 0xffed2705
-; CGP-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
-; CGP-NEXT:    v_mov_b32_e32 v7, v4
+; CGP-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
+; CGP-NEXT:    v_mov_b32_e32 v7, v5
 ; CGP-NEXT:    v_mac_f32_e32 v7, 0x4f800000, v6
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v7, v7
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v5
-; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v5, vcc
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
 ; CGP-NEXT:    v_mul_f32_e32 v7, 0x5f7ffffc, v7
 ; CGP-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v7
 ; CGP-NEXT:    v_trunc_f32_e32 v8, v8
 ; CGP-NEXT:    v_mac_f32_e32 v7, 0xcf800000, v8
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; CGP-NEXT:    v_xor_b32_e32 v0, v0, v5
-; CGP-NEXT:    v_xor_b32_e32 v1, v1, v5
+; CGP-NEXT:    v_xor_b32_e32 v0, v0, v4
+; CGP-NEXT:    v_xor_b32_e32 v1, v1, v4
 ; CGP-NEXT:    v_mul_lo_u32 v9, -1, v7
 ; CGP-NEXT:    v_mul_lo_u32 v10, s6, v8
 ; CGP-NEXT:    v_mul_hi_u32 v12, s6, v7
@@ -2258,7 +2258,7 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v14, v7, v11
 ; CGP-NEXT:    v_mul_hi_u32 v9, v7, v13
 ; CGP-NEXT:    v_mul_hi_u32 v13, v10, v13
-; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v6
+; CGP-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v12, v9
@@ -2283,7 +2283,7 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v10, v0, v8
 ; CGP-NEXT:    v_mul_hi_u32 v11, v0, v7
 ; CGP-NEXT:    v_mul_hi_u32 v7, v1, v7
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v5, v5
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
@@ -2305,7 +2305,7 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v8, s7, v8
 ; CGP-NEXT:    v_mul_lo_u32 v10, s7, v7
 ; CGP-NEXT:    v_mul_hi_u32 v7, s7, v7
-; CGP-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; CGP-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v10
@@ -2331,34 +2331,34 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v12, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; CGP-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v4
+; CGP-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v5
 ; CGP-NEXT:    v_trunc_f32_e32 v7, v7
-; CGP-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v7
-; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; CGP-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v7
+; CGP-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v8, -1, v4
+; CGP-NEXT:    v_mul_lo_u32 v8, -1, v5
 ; CGP-NEXT:    v_mul_lo_u32 v9, s6, v7
-; CGP-NEXT:    v_mul_hi_u32 v11, s6, v4
+; CGP-NEXT:    v_mul_hi_u32 v11, s6, v5
 ; CGP-NEXT:    v_ashrrev_i32_e32 v6, 31, v3
-; CGP-NEXT:    v_mul_lo_u32 v10, s6, v4
+; CGP-NEXT:    v_mul_lo_u32 v10, s6, v5
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
 ; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v6, vcc
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
 ; CGP-NEXT:    v_mul_lo_u32 v9, v7, v10
-; CGP-NEXT:    v_mul_lo_u32 v11, v4, v8
-; CGP-NEXT:    v_mul_hi_u32 v12, v4, v10
+; CGP-NEXT:    v_mul_lo_u32 v11, v5, v8
+; CGP-NEXT:    v_mul_hi_u32 v12, v5, v10
 ; CGP-NEXT:    v_mul_hi_u32 v10, v7, v10
-; CGP-NEXT:    v_xor_b32_e32 v0, v0, v5
+; CGP-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v12, v7, v8
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
-; CGP-NEXT:    v_mul_hi_u32 v11, v4, v8
+; CGP-NEXT:    v_mul_hi_u32 v11, v5, v8
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
@@ -2369,18 +2369,18 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
 ; CGP-NEXT:    v_addc_u32_e64 v9, s[4:5], v7, v8, vcc
-; CGP-NEXT:    v_mul_lo_u32 v10, -1, v4
+; CGP-NEXT:    v_mul_lo_u32 v10, -1, v5
 ; CGP-NEXT:    v_mul_lo_u32 v11, s6, v9
-; CGP-NEXT:    v_mul_hi_u32 v13, s6, v4
-; CGP-NEXT:    v_mul_lo_u32 v12, s6, v4
+; CGP-NEXT:    v_mul_hi_u32 v13, s6, v5
+; CGP-NEXT:    v_mul_lo_u32 v12, s6, v5
 ; CGP-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v8
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
 ; CGP-NEXT:    v_mul_lo_u32 v11, v9, v12
-; CGP-NEXT:    v_mul_lo_u32 v13, v4, v10
-; CGP-NEXT:    v_mul_hi_u32 v8, v4, v12
+; CGP-NEXT:    v_mul_lo_u32 v13, v5, v10
+; CGP-NEXT:    v_mul_hi_u32 v8, v5, v12
 ; CGP-NEXT:    v_mul_hi_u32 v12, v9, v12
 ; CGP-NEXT:    v_xor_b32_e32 v2, v2, v6
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
@@ -2389,7 +2389,7 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
 ; CGP-NEXT:    v_mul_lo_u32 v11, v9, v10
 ; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v13, v8
-; CGP-NEXT:    v_mul_hi_u32 v13, v4, v10
+; CGP-NEXT:    v_mul_hi_u32 v13, v5, v10
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
@@ -2401,30 +2401,30 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v12, v11
 ; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
 ; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v9, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; CGP-NEXT:    v_xor_b32_e32 v3, v3, v6
 ; CGP-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; CGP-NEXT:    v_xor_b32_e32 v1, v1, v5
-; CGP-NEXT:    v_mul_lo_u32 v8, v3, v4
+; CGP-NEXT:    v_xor_b32_e32 v1, v1, v4
+; CGP-NEXT:    v_mul_lo_u32 v8, v3, v5
 ; CGP-NEXT:    v_mul_lo_u32 v9, v2, v7
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
-; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
-; CGP-NEXT:    v_mul_hi_u32 v5, v2, v4
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
+; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
+; CGP-NEXT:    v_mul_hi_u32 v4, v2, v5
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
+; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v8, v3, v7
-; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
+; CGP-NEXT:    v_mul_hi_u32 v5, v3, v5
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
 ; CGP-NEXT:    v_mul_hi_u32 v9, v2, v7
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
 ; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; CGP-NEXT:    v_mul_hi_u32 v7, v3, v7
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
@@ -2488,40 +2488,40 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v0, 31, v6
 ; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v5, v0
 ; CHECK-NEXT:    v_addc_u32_e32 v2, vcc, v6, v0, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v0
+; CHECK-NEXT:    v_xor_b32_e32 v5, v1, v0
 ; CHECK-NEXT:    v_xor_b32_e32 v0, v2, v0
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, v1
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v5, v0
-; CHECK-NEXT:    v_ashrrev_i32_e32 v6, 31, v4
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; CHECK-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v5
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, v5
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v6, v0
+; CHECK-NEXT:    v_ashrrev_i32_e32 v1, 31, v4
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v1
+; CHECK-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v6
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
-; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, 0, v1
+; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v1, vcc
+; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, 0, v5
 ; CHECK-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
-; CHECK-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v2
-; CHECK-NEXT:    v_trunc_f32_e32 v5, v5
-; CHECK-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v5
+; CHECK-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v2
+; CHECK-NEXT:    v_trunc_f32_e32 v6, v6
+; CHECK-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v6
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; CHECK-NEXT:    v_subb_u32_e32 v8, vcc, 0, v0, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v9, v8, v2
-; CHECK-NEXT:    v_mul_lo_u32 v10, v7, v5
+; CHECK-NEXT:    v_mul_lo_u32 v10, v7, v6
 ; CHECK-NEXT:    v_mul_hi_u32 v12, v7, v2
 ; CHECK-NEXT:    v_mul_lo_u32 v11, v7, v2
-; CHECK-NEXT:    v_xor_b32_e32 v3, v3, v6
+; CHECK-NEXT:    v_xor_b32_e32 v3, v3, v1
 ; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
-; CHECK-NEXT:    v_mul_lo_u32 v10, v5, v11
+; CHECK-NEXT:    v_mul_lo_u32 v10, v6, v11
 ; CHECK-NEXT:    v_mul_lo_u32 v12, v2, v9
 ; CHECK-NEXT:    v_mul_hi_u32 v13, v2, v11
-; CHECK-NEXT:    v_mul_hi_u32 v11, v5, v11
-; CHECK-NEXT:    v_xor_b32_e32 v4, v4, v6
+; CHECK-NEXT:    v_mul_hi_u32 v11, v6, v11
+; CHECK-NEXT:    v_xor_b32_e32 v4, v4, v1
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
 ; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v13, v5, v9
+; CHECK-NEXT:    v_mul_lo_u32 v13, v6, v9
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
 ; CHECK-NEXT:    v_mul_hi_u32 v12, v2, v9
 ; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
@@ -2529,18 +2529,18 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; CHECK-NEXT:    v_mul_hi_u32 v9, v5, v9
+; CHECK-NEXT:    v_mul_hi_u32 v9, v6, v9
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
 ; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v10
-; CHECK-NEXT:    v_addc_u32_e64 v10, s[4:5], v5, v9, vcc
+; CHECK-NEXT:    v_addc_u32_e64 v10, s[4:5], v6, v9, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v8, v8, v2
 ; CHECK-NEXT:    v_mul_lo_u32 v11, v7, v10
 ; CHECK-NEXT:    v_mul_lo_u32 v12, v7, v2
 ; CHECK-NEXT:    v_mul_hi_u32 v7, v7, v2
-; CHECK-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v9
+; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v9
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
 ; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v8, v7
 ; CHECK-NEXT:    v_mul_lo_u32 v8, v10, v12
@@ -2564,67 +2564,67 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v9
 ; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v9
-; CHECK-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; CHECK-NEXT:    v_addc_u32_e32 v6, vcc, v6, v7, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
-; CHECK-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; CHECK-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v7, v4, v2
-; CHECK-NEXT:    v_mul_lo_u32 v8, v3, v5
+; CHECK-NEXT:    v_mul_lo_u32 v8, v3, v6
 ; CHECK-NEXT:    v_mul_hi_u32 v9, v3, v2
 ; CHECK-NEXT:    v_mul_hi_u32 v2, v4, v2
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v9, v4, v5
+; CHECK-NEXT:    v_mul_lo_u32 v9, v4, v6
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; CHECK-NEXT:    v_mul_hi_u32 v8, v3, v5
+; CHECK-NEXT:    v_mul_hi_u32 v8, v3, v6
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v9, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; CHECK-NEXT:    v_mul_hi_u32 v5, v4, v5
+; CHECK-NEXT:    v_mul_hi_u32 v6, v4, v6
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
 ; CHECK-NEXT:    v_mul_lo_u32 v7, v0, v2
-; CHECK-NEXT:    v_mul_lo_u32 v5, v1, v5
-; CHECK-NEXT:    v_mul_lo_u32 v8, v1, v2
-; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
+; CHECK-NEXT:    v_mul_lo_u32 v6, v5, v6
+; CHECK-NEXT:    v_mul_lo_u32 v8, v5, v2
+; CHECK-NEXT:    v_mul_hi_u32 v2, v5, v2
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
 ; CHECK-NEXT:    v_sub_i32_e32 v3, vcc, v3, v8
-; CHECK-NEXT:    v_subb_u32_e64 v5, s[4:5], v4, v2, vcc
+; CHECK-NEXT:    v_subb_u32_e64 v6, s[4:5], v4, v2, vcc
 ; CHECK-NEXT:    v_sub_i32_e64 v2, s[4:5], v4, v2
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v0
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v3, v1
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v3, v5
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], v5, v0
+; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], v6, v0
 ; CHECK-NEXT:    v_subb_u32_e32 v2, vcc, v2, v0, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[4:5]
-; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, v3, v1
+; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, v3, v5
 ; CHECK-NEXT:    v_subbrev_u32_e64 v8, s[4:5], 0, v2, vcc
 ; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v1
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v5
 ; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
 ; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], v8, v0
 ; CHECK-NEXT:    v_subb_u32_e32 v0, vcc, v2, v0, vcc
-; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v7, v1
+; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, v7, v5
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, v9, v10, s[4:5]
 ; CHECK-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v6
-; CHECK-NEXT:    v_xor_b32_e32 v2, v0, v6
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v1, v6
-; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v2, v6, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v2, v2, v1
+; CHECK-NEXT:    v_xor_b32_e32 v3, v0, v1
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v2, v1
+; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
 ; CHECK-NEXT:    ; implicit-def: $vgpr5_vgpr6
 ; CHECK-NEXT:    ; implicit-def: $vgpr3
 ; CHECK-NEXT:  BB7_2: ; %Flow
@@ -2664,32 +2664,32 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_mov_b64 s[6:7], 0x1000
 ; GISEL-NEXT:    v_lshl_b64 v[4:5], s[6:7], v4
-; GISEL-NEXT:    v_ashrrev_i32_e32 v9, 31, v1
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v7, 31, v5
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
 ; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v4, v4, v7
+; GISEL-NEXT:    v_xor_b32_e32 v8, v4, v7
 ; GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v7, v4
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v8, v5
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v9
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v9, vcc
-; GISEL-NEXT:    v_mac_f32_e32 v7, 0x4f800000, v8
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v7, v8
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v9, v5
+; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT:    v_mac_f32_e32 v7, 0x4f800000, v9
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v7, v7
-; GISEL-NEXT:    v_xor_b32_e32 v8, v0, v9
-; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, 0, v4
+; GISEL-NEXT:    v_xor_b32_e32 v9, v0, v4
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
 ; GISEL-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v7
 ; GISEL-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v0
 ; GISEL-NEXT:    v_trunc_f32_e32 v7, v7
 ; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v7
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, 0, v8
 ; GISEL-NEXT:    v_subb_u32_e32 v11, vcc, 0, v5, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v12, v11, v0
 ; GISEL-NEXT:    v_mul_lo_u32 v13, v10, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v15, v10, v0
 ; GISEL-NEXT:    v_mul_lo_u32 v14, v10, v0
-; GISEL-NEXT:    v_xor_b32_e32 v16, v1, v9
+; GISEL-NEXT:    v_xor_b32_e32 v16, v1, v4
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
 ; GISEL-NEXT:    v_mul_lo_u32 v13, v7, v14
@@ -2747,9 +2747,9 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v0, v11
 ; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, 0, v1, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v16, v7
-; GISEL-NEXT:    v_mul_lo_u32 v12, v8, v10
+; GISEL-NEXT:    v_mul_lo_u32 v12, v9, v10
 ; GISEL-NEXT:    v_lshl_b64 v[0:1], s[6:7], v6
-; GISEL-NEXT:    v_mul_hi_u32 v6, v8, v7
+; GISEL-NEXT:    v_mul_hi_u32 v6, v9, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v7, v16, v7
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
@@ -2757,7 +2757,7 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v16, v10
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v12, v6
-; GISEL-NEXT:    v_mul_hi_u32 v12, v8, v10
+; GISEL-NEXT:    v_mul_hi_u32 v12, v9, v10
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v11, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v12
@@ -2769,66 +2769,66 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v11, v7
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v6
-; GISEL-NEXT:    v_mul_lo_u32 v7, v4, v7
-; GISEL-NEXT:    v_mul_lo_u32 v11, v4, v6
-; GISEL-NEXT:    v_mul_hi_u32 v6, v4, v6
+; GISEL-NEXT:    v_mul_lo_u32 v7, v8, v7
+; GISEL-NEXT:    v_mul_lo_u32 v11, v8, v6
+; GISEL-NEXT:    v_mul_hi_u32 v6, v8, v6
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, v8, v11
-; GISEL-NEXT:    v_subb_u32_e64 v8, s[4:5], v16, v6, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, v9, v11
+; GISEL-NEXT:    v_subb_u32_e64 v9, s[4:5], v16, v6, vcc
 ; GISEL-NEXT:    v_sub_i32_e64 v6, s[4:5], v16, v6
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v5
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v4
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v8, v5
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v9, v5
 ; GISEL-NEXT:    v_subb_u32_e32 v6, vcc, v6, v5, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, v7, v4
+; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, v7, v8
 ; GISEL-NEXT:    v_subbrev_u32_e64 v12, s[4:5], 0, v6, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v12, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v4
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v12, v5
 ; GISEL-NEXT:    v_subb_u32_e32 v5, vcc, v6, v5, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v11, v4
+; GISEL-NEXT:    v_sub_i32_e32 v6, vcc, v11, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, v13, v14, s[4:5]
 ; GISEL-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, v11, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v6, v11, v6, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v5, v12, v5, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, v7, v4, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v7, v0, v6
-; GISEL-NEXT:    v_xor_b32_e32 v6, v1, v6
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v0, v7
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v1, v6
-; GISEL-NEXT:    v_ashrrev_i32_e32 v8, 31, v3
-; GISEL-NEXT:    v_xor_b32_e32 v4, v4, v9
-; GISEL-NEXT:    v_xor_b32_e32 v5, v5, v9
+; GISEL-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
+; GISEL-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v7, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v8, v0, v7
+; GISEL-NEXT:    v_xor_b32_e32 v7, v1, v7
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v0, v8
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v1, v7
+; GISEL-NEXT:    v_ashrrev_i32_e32 v9, 31, v3
+; GISEL-NEXT:    v_xor_b32_e32 v6, v6, v4
+; GISEL-NEXT:    v_xor_b32_e32 v5, v5, v4
 ; GISEL-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v2, v8
-; GISEL-NEXT:    v_addc_u32_e32 v2, vcc, v3, v8, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v2, v9
+; GISEL-NEXT:    v_addc_u32_e32 v2, vcc, v3, v9, vcc
 ; GISEL-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GISEL-NEXT:    v_xor_b32_e32 v3, v1, v8
+; GISEL-NEXT:    v_xor_b32_e32 v3, v1, v9
 ; GISEL-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GISEL-NEXT:    v_trunc_f32_e32 v1, v1
 ; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, 0, v7
-; GISEL-NEXT:    v_subb_u32_e32 v11, vcc, 0, v6, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, 0, v8
+; GISEL-NEXT:    v_subb_u32_e32 v11, vcc, 0, v7, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v12, v11, v0
 ; GISEL-NEXT:    v_mul_lo_u32 v13, v10, v1
 ; GISEL-NEXT:    v_mul_hi_u32 v15, v10, v0
 ; GISEL-NEXT:    v_mul_lo_u32 v14, v10, v0
-; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v8
+; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v9
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
 ; GISEL-NEXT:    v_mul_lo_u32 v13, v1, v14
@@ -2887,64 +2887,64 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_addc_u32_e32 v11, vcc, 0, v1, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v12, v2, v10
 ; GISEL-NEXT:    v_mul_lo_u32 v13, v3, v11
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v4, v9
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v6, v4
+; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v5, v4, vcc
 ; GISEL-NEXT:    v_mul_hi_u32 v4, v3, v10
-; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v5, v9, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v12, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v5, v2, v11
 ; GISEL-NEXT:    v_mul_hi_u32 v10, v2, v10
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
-; GISEL-NEXT:    v_mul_hi_u32 v9, v3, v11
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; GISEL-NEXT:    v_mul_hi_u32 v6, v3, v11
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
 ; GISEL-NEXT:    v_mul_hi_u32 v10, v2, v11
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
-; GISEL-NEXT:    v_mul_lo_u32 v9, v6, v4
-; GISEL-NEXT:    v_mul_lo_u32 v5, v7, v5
-; GISEL-NEXT:    v_mul_lo_u32 v10, v7, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, v7, v4
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
+; GISEL-NEXT:    v_mul_lo_u32 v6, v7, v4
+; GISEL-NEXT:    v_mul_lo_u32 v5, v8, v5
+; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v4
+; GISEL-NEXT:    v_mul_hi_u32 v4, v8, v4
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, v3, v10
 ; GISEL-NEXT:    v_subb_u32_e64 v5, s[4:5], v2, v4, vcc
 ; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v4
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v6
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v3, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v5, v6
-; GISEL-NEXT:    v_subb_u32_e32 v2, vcc, v2, v6, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, v9, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e32 v9, vcc, v3, v7
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v3, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v5, v7
+; GISEL-NEXT:    v_subb_u32_e32 v2, vcc, v2, v7, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[4:5]
+; GISEL-NEXT:    v_sub_i32_e32 v6, vcc, v3, v8
 ; GISEL-NEXT:    v_subbrev_u32_e64 v10, s[4:5], 0, v2, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v6
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v7
-; GISEL-NEXT:    v_subb_u32_e32 v2, vcc, v2, v6, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v8
+; GISEL-NEXT:    v_subb_u32_e32 v2, vcc, v2, v7, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v10, v6
-; GISEL-NEXT:    v_sub_i32_e32 v6, vcc, v9, v7
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v10, v7
+; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, v6, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[4:5]
 ; GISEL-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
-; GISEL-NEXT:    v_cndmask_b32_e32 v6, v9, v6, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v8
-; GISEL-NEXT:    v_xor_b32_e32 v4, v2, v8
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v3, v8
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v4, v8, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v9
+; GISEL-NEXT:    v_xor_b32_e32 v4, v2, v9
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v3, v9
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v4, v9, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_srem_v2i64_pow2_shl_denom:
@@ -2968,35 +2968,35 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
 ; CGP-NEXT:    v_add_i32_e32 v1, vcc, v2, v0
 ; CGP-NEXT:    v_addc_u32_e32 v2, vcc, v3, v0, vcc
-; CGP-NEXT:    v_xor_b32_e32 v1, v1, v0
+; CGP-NEXT:    v_xor_b32_e32 v3, v1, v0
 ; CGP-NEXT:    v_xor_b32_e32 v0, v2, v0
-; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v1
-; CGP-NEXT:    v_cvt_f32_u32_e32 v3, v0
-; CGP-NEXT:    v_ashrrev_i32_e32 v4, 31, v9
-; CGP-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
+; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v3
+; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v0
+; CGP-NEXT:    v_ashrrev_i32_e32 v1, 31, v9
+; CGP-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v4
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v8, v4
-; CGP-NEXT:    v_addc_u32_e32 v6, vcc, v9, v4, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v8, v1
+; CGP-NEXT:    v_addc_u32_e32 v6, vcc, v9, v1, vcc
 ; CGP-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; CGP-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v2
 ; CGP-NEXT:    v_trunc_f32_e32 v8, v8
 ; CGP-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v8
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; CGP-NEXT:    v_sub_i32_e32 v9, vcc, 0, v1
+; CGP-NEXT:    v_sub_i32_e32 v9, vcc, 0, v3
 ; CGP-NEXT:    v_subb_u32_e32 v12, vcc, 0, v0, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v13, v12, v2
 ; CGP-NEXT:    v_mul_lo_u32 v14, v9, v8
 ; CGP-NEXT:    v_mul_hi_u32 v16, v9, v2
 ; CGP-NEXT:    v_mul_lo_u32 v15, v9, v2
-; CGP-NEXT:    v_xor_b32_e32 v3, v3, v4
+; CGP-NEXT:    v_xor_b32_e32 v4, v4, v1
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
 ; CGP-NEXT:    v_mul_lo_u32 v14, v8, v15
 ; CGP-NEXT:    v_mul_lo_u32 v16, v2, v13
 ; CGP-NEXT:    v_mul_hi_u32 v17, v2, v15
 ; CGP-NEXT:    v_mul_hi_u32 v15, v8, v15
-; CGP-NEXT:    v_xor_b32_e32 v6, v6, v4
+; CGP-NEXT:    v_xor_b32_e32 v6, v6, v1
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
@@ -3048,8 +3048,8 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v12
 ; CGP-NEXT:    v_addc_u32_e32 v8, vcc, 0, v8, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v9, v6, v2
-; CGP-NEXT:    v_mul_lo_u32 v12, v3, v8
-; CGP-NEXT:    v_mul_hi_u32 v13, v3, v2
+; CGP-NEXT:    v_mul_lo_u32 v12, v4, v8
+; CGP-NEXT:    v_mul_hi_u32 v13, v4, v2
 ; CGP-NEXT:    v_mul_hi_u32 v2, v6, v2
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
@@ -3057,7 +3057,7 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v13, v6, v8
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v12, v9
-; CGP-NEXT:    v_mul_hi_u32 v12, v3, v8
+; CGP-NEXT:    v_mul_hi_u32 v12, v4, v8
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v13, v2
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v12
@@ -3069,42 +3069,42 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v12, v9
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; CGP-NEXT:    v_mul_lo_u32 v9, v0, v2
-; CGP-NEXT:    v_mul_lo_u32 v8, v1, v8
-; CGP-NEXT:    v_mul_lo_u32 v12, v1, v2
-; CGP-NEXT:    v_mul_hi_u32 v2, v1, v2
+; CGP-NEXT:    v_mul_lo_u32 v8, v3, v8
+; CGP-NEXT:    v_mul_lo_u32 v12, v3, v2
+; CGP-NEXT:    v_mul_hi_u32 v2, v3, v2
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v8, v2
-; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v3, v12
+; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v4, v12
 ; CGP-NEXT:    v_subb_u32_e64 v8, s[4:5], v6, v2, vcc
 ; CGP-NEXT:    v_sub_i32_e64 v2, s[4:5], v6, v2
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v0
 ; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v3, v1
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v3
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
 ; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v8, v0
 ; CGP-NEXT:    v_subb_u32_e32 v2, vcc, v2, v0, vcc
 ; CGP-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[4:5]
-; CGP-NEXT:    v_sub_i32_e32 v9, vcc, v3, v1
+; CGP-NEXT:    v_sub_i32_e32 v9, vcc, v4, v3
 ; CGP-NEXT:    v_subbrev_u32_e64 v12, s[4:5], 0, v2, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v12, v0
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v1
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v3
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[4:5]
 ; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v12, v0
 ; CGP-NEXT:    v_subb_u32_e32 v0, vcc, v2, v0, vcc
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v9, v1
+; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v9, v3
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, v13, v14, s[4:5]
 ; CGP-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v12, v0, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
-; CGP-NEXT:    v_xor_b32_e32 v1, v1, v4
-; CGP-NEXT:    v_xor_b32_e32 v2, v0, v4
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v1, v4
-; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v2, v4, vcc
+; CGP-NEXT:    v_xor_b32_e32 v2, v2, v1
+; CGP-NEXT:    v_xor_b32_e32 v3, v0, v1
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v2, v1
+; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
 ; CGP-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; CGP-NEXT:    ; implicit-def: $vgpr8
 ; CGP-NEXT:  BB8_2: ; %Flow2
@@ -3143,40 +3143,40 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_ashrrev_i32_e32 v2, 31, v11
 ; CGP-NEXT:    v_add_i32_e32 v3, vcc, v10, v2
 ; CGP-NEXT:    v_addc_u32_e32 v4, vcc, v11, v2, vcc
-; CGP-NEXT:    v_xor_b32_e32 v3, v3, v2
+; CGP-NEXT:    v_xor_b32_e32 v6, v3, v2
 ; CGP-NEXT:    v_xor_b32_e32 v2, v4, v2
-; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v3
-; CGP-NEXT:    v_cvt_f32_u32_e32 v6, v2
-; CGP-NEXT:    v_ashrrev_i32_e32 v8, 31, v7
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v6
+; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v6
+; CGP-NEXT:    v_cvt_f32_u32_e32 v8, v2
+; CGP-NEXT:    v_ashrrev_i32_e32 v3, 31, v7
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v3
+; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v8
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; CGP-NEXT:    v_addc_u32_e32 v6, vcc, v7, v8, vcc
-; CGP-NEXT:    v_sub_i32_e32 v9, vcc, 0, v3
+; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v3, vcc
+; CGP-NEXT:    v_sub_i32_e32 v9, vcc, 0, v6
 ; CGP-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; CGP-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v4
-; CGP-NEXT:    v_trunc_f32_e32 v7, v7
-; CGP-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v7
+; CGP-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v4
+; CGP-NEXT:    v_trunc_f32_e32 v8, v8
+; CGP-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v8
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v8
 ; CGP-NEXT:    v_subb_u32_e32 v10, vcc, 0, v2, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v11, v10, v4
-; CGP-NEXT:    v_mul_lo_u32 v12, v9, v7
+; CGP-NEXT:    v_mul_lo_u32 v12, v9, v8
 ; CGP-NEXT:    v_mul_hi_u32 v14, v9, v4
 ; CGP-NEXT:    v_mul_lo_u32 v13, v9, v4
-; CGP-NEXT:    v_xor_b32_e32 v5, v5, v8
+; CGP-NEXT:    v_xor_b32_e32 v5, v5, v3
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
-; CGP-NEXT:    v_mul_lo_u32 v12, v7, v13
+; CGP-NEXT:    v_mul_lo_u32 v12, v8, v13
 ; CGP-NEXT:    v_mul_lo_u32 v14, v4, v11
 ; CGP-NEXT:    v_mul_hi_u32 v15, v4, v13
-; CGP-NEXT:    v_mul_hi_u32 v13, v7, v13
-; CGP-NEXT:    v_xor_b32_e32 v6, v6, v8
+; CGP-NEXT:    v_mul_hi_u32 v13, v8, v13
+; CGP-NEXT:    v_xor_b32_e32 v7, v7, v3
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v15, v7, v11
+; CGP-NEXT:    v_mul_lo_u32 v15, v8, v11
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
 ; CGP-NEXT:    v_mul_hi_u32 v14, v4, v11
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
@@ -3184,18 +3184,18 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT:    v_mul_hi_u32 v11, v7, v11
+; CGP-NEXT:    v_mul_hi_u32 v11, v8, v11
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
-; CGP-NEXT:    v_addc_u32_e64 v12, s[4:5], v7, v11, vcc
+; CGP-NEXT:    v_addc_u32_e64 v12, s[4:5], v8, v11, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v10, v10, v4
 ; CGP-NEXT:    v_mul_lo_u32 v13, v9, v12
 ; CGP-NEXT:    v_mul_lo_u32 v14, v9, v4
 ; CGP-NEXT:    v_mul_hi_u32 v9, v9, v4
-; CGP-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v11
+; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
 ; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v10, v9
 ; CGP-NEXT:    v_mul_lo_u32 v10, v12, v14
@@ -3219,67 +3219,67 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v13, v11
 ; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v11
-; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v9, vcc
+; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v9, vcc
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
-; CGP-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; CGP-NEXT:    v_mul_lo_u32 v9, v6, v4
-; CGP-NEXT:    v_mul_lo_u32 v10, v5, v7
+; CGP-NEXT:    v_addc_u32_e32 v8, vcc, 0, v8, vcc
+; CGP-NEXT:    v_mul_lo_u32 v9, v7, v4
+; CGP-NEXT:    v_mul_lo_u32 v10, v5, v8
 ; CGP-NEXT:    v_mul_hi_u32 v11, v5, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v6, v4
+; CGP-NEXT:    v_mul_hi_u32 v4, v7, v4
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v11, v6, v7
+; CGP-NEXT:    v_mul_lo_u32 v11, v7, v8
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; CGP-NEXT:    v_mul_hi_u32 v10, v5, v7
+; CGP-NEXT:    v_mul_hi_u32 v10, v5, v8
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT:    v_mul_hi_u32 v7, v6, v7
+; CGP-NEXT:    v_mul_hi_u32 v8, v7, v8
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; CGP-NEXT:    v_mul_lo_u32 v9, v2, v4
-; CGP-NEXT:    v_mul_lo_u32 v7, v3, v7
-; CGP-NEXT:    v_mul_lo_u32 v10, v3, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; CGP-NEXT:    v_mul_lo_u32 v8, v6, v8
+; CGP-NEXT:    v_mul_lo_u32 v10, v6, v4
+; CGP-NEXT:    v_mul_hi_u32 v4, v6, v4
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
 ; CGP-NEXT:    v_sub_i32_e32 v5, vcc, v5, v10
-; CGP-NEXT:    v_subb_u32_e64 v7, s[4:5], v6, v4, vcc
-; CGP-NEXT:    v_sub_i32_e64 v4, s[4:5], v6, v4
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v2
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v3
+; CGP-NEXT:    v_subb_u32_e64 v8, s[4:5], v7, v4, vcc
+; CGP-NEXT:    v_sub_i32_e64 v4, s[4:5], v7, v4
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v2
+; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v6
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v2
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v8, v2
 ; CGP-NEXT:    v_subb_u32_e32 v4, vcc, v4, v2, vcc
-; CGP-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[4:5]
-; CGP-NEXT:    v_sub_i32_e32 v9, vcc, v5, v3
+; CGP-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[4:5]
+; CGP-NEXT:    v_sub_i32_e32 v9, vcc, v5, v6
 ; CGP-NEXT:    v_subbrev_u32_e64 v10, s[4:5], 0, v4, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v2
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v3
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v6
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
 ; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v10, v2
 ; CGP-NEXT:    v_subb_u32_e32 v2, vcc, v4, v2, vcc
-; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v9, v3
+; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v9, v6
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[4:5]
 ; CGP-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
-; CGP-NEXT:    v_xor_b32_e32 v3, v3, v8
-; CGP-NEXT:    v_xor_b32_e32 v4, v2, v8
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v3, v8
-; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v4, v8, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
+; CGP-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
+; CGP-NEXT:    v_xor_b32_e32 v4, v4, v3
+; CGP-NEXT:    v_xor_b32_e32 v5, v2, v3
+; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v4, v3
+; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v5, v3, vcc
 ; CGP-NEXT:    ; implicit-def: $vgpr10_vgpr11
 ; CGP-NEXT:    ; implicit-def: $vgpr5
 ; CGP-NEXT:  BB8_6: ; %Flow
@@ -3381,56 +3381,56 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_subb_u32_e32 v8, vcc, 0, v3, vcc
 ; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; GISEL-NEXT:    v_and_b32_e32 v0, s6, v0
+; GISEL-NEXT:    v_and_b32_e32 v5, s6, v0
+; GISEL-NEXT:    v_and_b32_e32 v0, s6, v2
 ; GISEL-NEXT:    v_and_b32_e32 v6, s6, v6
-; GISEL-NEXT:    v_and_b32_e32 v2, s6, v2
-; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT:    v_trunc_f32_e32 v5, v5
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
+; GISEL-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v4
+; GISEL-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v2
+; GISEL-NEXT:    v_trunc_f32_e32 v4, v4
+; GISEL-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v4
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GISEL-NEXT:    v_mul_lo_u32 v9, v8, v4
-; GISEL-NEXT:    v_mul_lo_u32 v10, v7, v5
-; GISEL-NEXT:    v_mul_hi_u32 v12, v7, v4
-; GISEL-NEXT:    v_mul_lo_u32 v11, v7, v4
+; GISEL-NEXT:    v_mul_lo_u32 v9, v8, v2
+; GISEL-NEXT:    v_mul_lo_u32 v10, v7, v4
+; GISEL-NEXT:    v_mul_hi_u32 v12, v7, v2
+; GISEL-NEXT:    v_mul_lo_u32 v11, v7, v2
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
-; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v11
-; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v9
-; GISEL-NEXT:    v_mul_hi_u32 v14, v4, v11
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, 0, v0
+; GISEL-NEXT:    v_mul_lo_u32 v10, v4, v11
+; GISEL-NEXT:    v_mul_lo_u32 v12, v2, v9
+; GISEL-NEXT:    v_mul_hi_u32 v14, v2, v11
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, 0, v5
 ; GISEL-NEXT:    v_addc_u32_e64 v13, s[4:5], 0, 0, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v14, v5, v9
-; GISEL-NEXT:    v_mul_hi_u32 v11, v5, v11
+; GISEL-NEXT:    v_mul_lo_u32 v14, v4, v9
+; GISEL-NEXT:    v_mul_hi_u32 v11, v4, v11
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
-; GISEL-NEXT:    v_mul_hi_u32 v12, v4, v9
+; GISEL-NEXT:    v_mul_hi_u32 v12, v2, v9
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v14, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
-; GISEL-NEXT:    v_mul_hi_u32 v9, v5, v9
+; GISEL-NEXT:    v_mul_hi_u32 v9, v4, v9
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
-; GISEL-NEXT:    v_addc_u32_e64 v10, s[4:5], v5, v9, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, v8, v4
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v10
+; GISEL-NEXT:    v_addc_u32_e64 v10, s[4:5], v4, v9, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v8, v8, v2
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v7, v10
-; GISEL-NEXT:    v_mul_lo_u32 v12, v7, v4
-; GISEL-NEXT:    v_mul_hi_u32 v7, v7, v4
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v9
+; GISEL-NEXT:    v_mul_lo_u32 v12, v7, v2
+; GISEL-NEXT:    v_mul_hi_u32 v7, v7, v2
+; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v9
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v8, v7
 ; GISEL-NEXT:    v_mul_lo_u32 v8, v10, v12
-; GISEL-NEXT:    v_mul_lo_u32 v11, v4, v7
-; GISEL-NEXT:    v_mul_hi_u32 v9, v4, v12
+; GISEL-NEXT:    v_mul_lo_u32 v11, v2, v7
+; GISEL-NEXT:    v_mul_hi_u32 v9, v2, v12
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v10, v12
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
@@ -3438,7 +3438,7 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_mul_lo_u32 v9, v10, v7
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v11, v8
-; GISEL-NEXT:    v_mul_hi_u32 v11, v4, v7
+; GISEL-NEXT:    v_mul_hi_u32 v11, v2, v7
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v11
@@ -3449,197 +3449,197 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v9
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v9
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v7, v13, v4
-; GISEL-NEXT:    v_mul_lo_u32 v8, v0, v5
-; GISEL-NEXT:    v_mul_hi_u32 v9, v0, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, v13, v4
+; GISEL-NEXT:    v_addc_u32_e32 v4, vcc, v4, v7, vcc
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
+; GISEL-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, v13, v2
+; GISEL-NEXT:    v_mul_lo_u32 v8, v5, v4
+; GISEL-NEXT:    v_mul_hi_u32 v9, v5, v2
+; GISEL-NEXT:    v_mul_hi_u32 v2, v13, v2
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v9, v13, v5
+; GISEL-NEXT:    v_mul_lo_u32 v9, v13, v4
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT:    v_mul_hi_u32 v8, v0, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
+; GISEL-NEXT:    v_mul_hi_u32 v8, v5, v4
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v9, v2
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT:    v_mul_hi_u32 v5, v13, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
+; GISEL-NEXT:    v_mul_hi_u32 v4, v13, v4
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; GISEL-NEXT:    v_mul_lo_u32 v7, v3, v4
-; GISEL-NEXT:    v_mul_lo_u32 v5, v1, v5
-; GISEL-NEXT:    v_mul_lo_u32 v8, v1, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
-; GISEL-NEXT:    v_subb_u32_e64 v5, s[4:5], v13, v4, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v4, s[4:5], v13, v4
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
+; GISEL-NEXT:    v_mul_lo_u32 v7, v3, v2
+; GISEL-NEXT:    v_mul_lo_u32 v4, v1, v4
+; GISEL-NEXT:    v_mul_lo_u32 v8, v1, v2
+; GISEL-NEXT:    v_mul_hi_u32 v2, v1, v2
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v5, v8
+; GISEL-NEXT:    v_subb_u32_e64 v5, s[4:5], v13, v2, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v13, v2
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v1
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v5, v3
-; GISEL-NEXT:    v_subb_u32_e32 v4, vcc, v4, v3, vcc
+; GISEL-NEXT:    v_subb_u32_e32 v2, vcc, v2, v3, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, v0, v1
-; GISEL-NEXT:    v_subbrev_u32_e64 v9, s[4:5], 0, v4, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, v4, v1
+; GISEL-NEXT:    v_subbrev_u32_e64 v9, s[4:5], 0, v2, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v3
+; GISEL-NEXT:    v_subb_u32_e32 v2, vcc, v2, v3, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v1
+; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v8, v1
+; GISEL-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v9, v3
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v4, v3, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v8, v1
-; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, 0, v6
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s[4:5]
 ; GISEL-NEXT:    v_addc_u32_e64 v6, s[4:5], 0, 0, vcc
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v11, v4
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v11, v3
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v12, v6
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
 ; GISEL-NEXT:    v_mac_f32_e32 v11, 0x4f800000, v12
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v8, v11
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
-; GISEL-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v8
-; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v8
+; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
 ; GISEL-NEXT:    v_trunc_f32_e32 v5, v5
-; GISEL-NEXT:    v_mac_f32_e32 v3, 0xcf800000, v5
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, 0, v4
+; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, 0, v3
 ; GISEL-NEXT:    v_subb_u32_e32 v8, vcc, 0, v6, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v9, v8, v3
+; GISEL-NEXT:    v_mul_lo_u32 v9, v8, v4
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v7, v5
-; GISEL-NEXT:    v_mul_hi_u32 v12, v7, v3
-; GISEL-NEXT:    v_mul_lo_u32 v11, v7, v3
+; GISEL-NEXT:    v_mul_hi_u32 v12, v7, v4
+; GISEL-NEXT:    v_mul_lo_u32 v11, v7, v4
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v11
-; GISEL-NEXT:    v_mul_lo_u32 v12, v3, v9
-; GISEL-NEXT:    v_mul_hi_u32 v14, v3, v11
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0, v2
-; GISEL-NEXT:    v_addc_u32_e64 v13, s[4:5], 0, 0, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v9
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, 0, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v4, v11
+; GISEL-NEXT:    v_addc_u32_e64 v14, s[4:5], 0, 0, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v14, v5, v9
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v10, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v5, v11
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
-; GISEL-NEXT:    v_mul_hi_u32 v12, v3, v9
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v14, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v12, v0
+; GISEL-NEXT:    v_mul_hi_u32 v12, v4, v9
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; GISEL-NEXT:    v_mul_hi_u32 v9, v5, v9
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v10, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v10
-; GISEL-NEXT:    v_addc_u32_e64 v10, s[4:5], v5, v9, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, v8, v3
-; GISEL-NEXT:    v_mul_lo_u32 v11, v7, v10
-; GISEL-NEXT:    v_mul_lo_u32 v12, v7, v3
-; GISEL-NEXT:    v_mul_hi_u32 v7, v7, v3
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
+; GISEL-NEXT:    v_addc_u32_e64 v4, s[4:5], v5, v9, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v8, v8, v0
+; GISEL-NEXT:    v_mul_lo_u32 v10, v7, v4
+; GISEL-NEXT:    v_mul_lo_u32 v11, v7, v0
+; GISEL-NEXT:    v_mul_hi_u32 v7, v7, v0
 ; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v9
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
+; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v10
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v8, v7
-; GISEL-NEXT:    v_mul_lo_u32 v8, v10, v12
-; GISEL-NEXT:    v_mul_lo_u32 v11, v3, v7
-; GISEL-NEXT:    v_mul_hi_u32 v9, v3, v12
-; GISEL-NEXT:    v_mul_hi_u32 v12, v10, v12
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
+; GISEL-NEXT:    v_mul_lo_u32 v8, v4, v11
+; GISEL-NEXT:    v_mul_lo_u32 v10, v0, v7
+; GISEL-NEXT:    v_mul_hi_u32 v9, v0, v11
+; GISEL-NEXT:    v_mul_hi_u32 v11, v4, v11
+; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v9, v10, v7
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v11, v8
-; GISEL-NEXT:    v_mul_hi_u32 v11, v3, v7
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; GISEL-NEXT:    v_mul_lo_u32 v9, v4, v7
+; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v10, v8
+; GISEL-NEXT:    v_mul_hi_u32 v10, v0, v7
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
-; GISEL-NEXT:    v_mul_hi_u32 v7, v10, v7
+; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
+; GISEL-NEXT:    v_mul_hi_u32 v4, v4, v7
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v9
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v9
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v8
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v7, v13, v3
-; GISEL-NEXT:    v_mul_lo_u32 v8, v2, v5
-; GISEL-NEXT:    v_mul_hi_u32 v9, v2, v3
-; GISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 0, v0
-; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
+; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v9
+; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v7
+; GISEL-NEXT:    v_addc_u32_e32 v4, vcc, v5, v4, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v0, v8
+; GISEL-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, v14, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, v13, v4
+; GISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 0, v1
+; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v2, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v2, v13, v5
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, v14, v4
+; GISEL-NEXT:    v_mul_hi_u32 v5, v14, v5
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v8, v2
+; GISEL-NEXT:    v_mul_hi_u32 v8, v13, v4
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v9, v13, v5
-; GISEL-NEXT:    v_mul_hi_u32 v3, v13, v3
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT:    v_mul_hi_u32 v8, v2, v5
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v9, v3
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v8
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT:    v_mul_hi_u32 v5, v13, v5
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; GISEL-NEXT:    v_mul_lo_u32 v7, v6, v3
-; GISEL-NEXT:    v_mul_lo_u32 v5, v4, v5
-; GISEL-NEXT:    v_mul_lo_u32 v8, v4, v3
-; GISEL-NEXT:    v_mul_hi_u32 v3, v4, v3
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; GISEL-NEXT:    v_mul_hi_u32 v4, v14, v4
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v8
-; GISEL-NEXT:    v_subb_u32_e64 v5, s[4:5], v13, v3, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v13, v3
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; GISEL-NEXT:    v_mul_lo_u32 v5, v6, v2
+; GISEL-NEXT:    v_mul_lo_u32 v4, v3, v4
+; GISEL-NEXT:    v_mul_lo_u32 v7, v3, v2
+; GISEL-NEXT:    v_mul_hi_u32 v2, v3, v2
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v13, v7
+; GISEL-NEXT:    v_subb_u32_e64 v5, s[4:5], v14, v2, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v14, v2
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v4
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v5, v6
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v6, vcc
+; GISEL-NEXT:    v_subb_u32_e32 v2, vcc, v2, v6, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, v2, v4
-; GISEL-NEXT:    v_subbrev_u32_e64 v9, s[4:5], 0, v3, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, v4, v3
+; GISEL-NEXT:    v_subbrev_u32_e64 v9, s[4:5], 0, v2, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v4
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v6, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v3
+; GISEL-NEXT:    v_subb_u32_e32 v2, vcc, v2, v6, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v9, v6
-; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v8, v4
+; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, v8, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s[4:5]
-; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
+; GISEL-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, 0, v2
-; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, v5, v2, vcc
+; GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, 0, v3
+; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v4, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_srem_v2i64_24bit:

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
index a7efed6808256..2c95c717e34b6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -2470,29 +2470,29 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_mov_b64 s[4:5], 0x1000
-; GISEL-NEXT:    v_lshl_b64 v[4:5], s[4:5], v4
-; GISEL-NEXT:    v_lshl_b64 v[6:7], s[4:5], v6
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v8, v4
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v9, v5
-; GISEL-NEXT:    v_mac_f32_e32 v8, 0x4f800000, v9
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v8, v8
-; GISEL-NEXT:    v_mul_f32_e32 v8, 0x5f7ffffc, v8
-; GISEL-NEXT:    v_mul_f32_e32 v9, 0x2f800000, v8
+; GISEL-NEXT:    v_lshl_b64 v[7:8], s[4:5], v4
+; GISEL-NEXT:    v_lshl_b64 v[4:5], s[4:5], v6
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, v7
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v9, v8
+; GISEL-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v9
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v6
+; GISEL-NEXT:    v_mul_f32_e32 v6, 0x5f7ffffc, v6
+; GISEL-NEXT:    v_mul_f32_e32 v9, 0x2f800000, v6
 ; GISEL-NEXT:    v_trunc_f32_e32 v9, v9
-; GISEL-NEXT:    v_mac_f32_e32 v8, 0xcf800000, v9
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v8, v8
+; GISEL-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v9
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v9, v9
-; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, 0, v4
-; GISEL-NEXT:    v_subb_u32_e32 v11, vcc, 0, v5, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v12, v10, v8
-; GISEL-NEXT:    v_mul_lo_u32 v13, v11, v8
+; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, 0, v7
+; GISEL-NEXT:    v_subb_u32_e32 v11, vcc, 0, v8, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v12, v10, v6
+; GISEL-NEXT:    v_mul_lo_u32 v13, v11, v6
 ; GISEL-NEXT:    v_mul_lo_u32 v14, v10, v9
-; GISEL-NEXT:    v_mul_hi_u32 v15, v10, v8
+; GISEL-NEXT:    v_mul_hi_u32 v15, v10, v6
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
 ; GISEL-NEXT:    v_mul_lo_u32 v14, v9, v12
-; GISEL-NEXT:    v_mul_lo_u32 v15, v8, v13
-; GISEL-NEXT:    v_mul_hi_u32 v16, v8, v12
+; GISEL-NEXT:    v_mul_lo_u32 v15, v6, v13
+; GISEL-NEXT:    v_mul_hi_u32 v16, v6, v12
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
@@ -2500,7 +2500,7 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
 ; GISEL-NEXT:    v_mul_lo_u32 v15, v9, v13
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v9, v12
-; GISEL-NEXT:    v_mul_hi_u32 v16, v8, v13
+; GISEL-NEXT:    v_mul_hi_u32 v16, v6, v13
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v15, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v16
@@ -2511,18 +2511,18 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
 ; GISEL-NEXT:    v_mul_hi_u32 v13, v9, v13
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v12
 ; GISEL-NEXT:    v_addc_u32_e64 v12, s[4:5], v9, v13, vcc
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v13
-; GISEL-NEXT:    v_mul_lo_u32 v13, v10, v8
-; GISEL-NEXT:    v_mul_lo_u32 v11, v11, v8
+; GISEL-NEXT:    v_mul_lo_u32 v13, v10, v6
+; GISEL-NEXT:    v_mul_lo_u32 v11, v11, v6
 ; GISEL-NEXT:    v_mul_lo_u32 v14, v10, v12
-; GISEL-NEXT:    v_mul_hi_u32 v10, v10, v8
+; GISEL-NEXT:    v_mul_hi_u32 v10, v10, v6
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v12, v13
-; GISEL-NEXT:    v_mul_lo_u32 v14, v8, v10
-; GISEL-NEXT:    v_mul_hi_u32 v15, v8, v13
+; GISEL-NEXT:    v_mul_lo_u32 v14, v6, v10
+; GISEL-NEXT:    v_mul_hi_u32 v15, v6, v13
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v15
@@ -2530,7 +2530,7 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v14, v11
 ; GISEL-NEXT:    v_mul_lo_u32 v14, v12, v10
 ; GISEL-NEXT:    v_mul_hi_u32 v13, v12, v13
-; GISEL-NEXT:    v_mul_hi_u32 v15, v8, v10
+; GISEL-NEXT:    v_mul_hi_u32 v15, v6, v10
 ; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v15
@@ -2541,93 +2541,93 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v13
 ; GISEL-NEXT:    v_mul_hi_u32 v10, v12, v10
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
+; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v11
 ; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v10, vcc
 ; GISEL-NEXT:    v_addc_u32_e64 v9, vcc, 0, v9, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v10, v1, v8
+; GISEL-NEXT:    v_mul_lo_u32 v10, v1, v6
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v0, v9
-; GISEL-NEXT:    v_mul_hi_u32 v12, v0, v8
+; GISEL-NEXT:    v_mul_hi_u32 v12, v0, v6
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v1, v9
-; GISEL-NEXT:    v_mul_hi_u32 v8, v1, v8
+; GISEL-NEXT:    v_mul_hi_u32 v6, v1, v6
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v0, v9
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v11, v8
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v11, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; GISEL-NEXT:    v_mul_hi_u32 v9, v1, v9
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; GISEL-NEXT:    v_mul_lo_u32 v10, v4, v8
-; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v8
-; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v9
-; GISEL-NEXT:    v_mul_hi_u32 v13, v4, v8
+; GISEL-NEXT:    v_mul_lo_u32 v10, v7, v6
+; GISEL-NEXT:    v_mul_lo_u32 v11, v8, v6
+; GISEL-NEXT:    v_mul_lo_u32 v12, v7, v9
+; GISEL-NEXT:    v_mul_hi_u32 v13, v7, v6
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v10
 ; GISEL-NEXT:    v_subb_u32_e64 v10, s[4:5], v1, v11, vcc
 ; GISEL-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v11
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v5
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v4
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v10, v5
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v10, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, v11, v12, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v0, v4
-; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v0, v7
+; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v8, vcc
 ; GISEL-NEXT:    v_subbrev_u32_e64 v1, vcc, 0, v1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, 1, v8
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, 1, v6
 ; GISEL-NEXT:    v_addc_u32_e32 v12, vcc, 0, v9, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v5
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v8
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v13, v0, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v11
-; GISEL-NEXT:    v_addc_u32_e32 v4, vcc, 0, v12, vcc
+; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, 0, v12, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v11, v1, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v12, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v12, v7, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v6
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, v7
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT:    v_trunc_f32_e32 v5, v5
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, 0, v6
-; GISEL-NEXT:    v_subb_u32_e32 v9, vcc, 0, v7, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v4
-; GISEL-NEXT:    v_mul_lo_u32 v11, v9, v4
-; GISEL-NEXT:    v_mul_lo_u32 v12, v8, v5
-; GISEL-NEXT:    v_mul_hi_u32 v13, v8, v4
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, v4
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v7, v5
+; GISEL-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v7
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v6
+; GISEL-NEXT:    v_mul_f32_e32 v6, 0x5f7ffffc, v6
+; GISEL-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v6
+; GISEL-NEXT:    v_trunc_f32_e32 v7, v7
+; GISEL-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v7
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, 0, v4
+; GISEL-NEXT:    v_subb_u32_e32 v9, vcc, 0, v5, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v6
+; GISEL-NEXT:    v_mul_lo_u32 v11, v9, v6
+; GISEL-NEXT:    v_mul_lo_u32 v12, v8, v7
+; GISEL-NEXT:    v_mul_hi_u32 v13, v8, v6
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT:    v_mul_lo_u32 v12, v5, v10
-; GISEL-NEXT:    v_mul_lo_u32 v13, v4, v11
-; GISEL-NEXT:    v_mul_hi_u32 v14, v4, v10
+; GISEL-NEXT:    v_mul_lo_u32 v12, v7, v10
+; GISEL-NEXT:    v_mul_lo_u32 v13, v6, v11
+; GISEL-NEXT:    v_mul_hi_u32 v14, v6, v10
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT:    v_mul_lo_u32 v13, v5, v11
-; GISEL-NEXT:    v_mul_hi_u32 v10, v5, v10
-; GISEL-NEXT:    v_mul_hi_u32 v14, v4, v11
+; GISEL-NEXT:    v_mul_lo_u32 v13, v7, v11
+; GISEL-NEXT:    v_mul_hi_u32 v10, v7, v10
+; GISEL-NEXT:    v_mul_hi_u32 v14, v6, v11
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v13, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v14
@@ -2636,20 +2636,20 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT:    v_mul_hi_u32 v11, v5, v11
+; GISEL-NEXT:    v_mul_hi_u32 v11, v7, v11
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
-; GISEL-NEXT:    v_addc_u32_e64 v10, s[4:5], v5, v11, vcc
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v11
-; GISEL-NEXT:    v_mul_lo_u32 v11, v8, v4
-; GISEL-NEXT:    v_mul_lo_u32 v9, v9, v4
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
+; GISEL-NEXT:    v_addc_u32_e64 v10, s[4:5], v7, v11, vcc
+; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v11
+; GISEL-NEXT:    v_mul_lo_u32 v11, v8, v6
+; GISEL-NEXT:    v_mul_lo_u32 v9, v9, v6
 ; GISEL-NEXT:    v_mul_lo_u32 v12, v8, v10
-; GISEL-NEXT:    v_mul_hi_u32 v8, v8, v4
+; GISEL-NEXT:    v_mul_hi_u32 v8, v8, v6
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
 ; GISEL-NEXT:    v_mul_lo_u32 v9, v10, v11
-; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v8
-; GISEL-NEXT:    v_mul_hi_u32 v13, v4, v11
+; GISEL-NEXT:    v_mul_lo_u32 v12, v6, v8
+; GISEL-NEXT:    v_mul_hi_u32 v13, v6, v11
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v13
@@ -2657,7 +2657,7 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v12, v9
 ; GISEL-NEXT:    v_mul_lo_u32 v12, v10, v8
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v10, v11
-; GISEL-NEXT:    v_mul_hi_u32 v13, v4, v8
+; GISEL-NEXT:    v_mul_hi_u32 v13, v6, v8
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
@@ -2668,64 +2668,64 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
 ; GISEL-NEXT:    v_mul_hi_u32 v8, v10, v8
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
-; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v9
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v8, vcc
-; GISEL-NEXT:    v_addc_u32_e64 v5, vcc, 0, v5, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v8, v3, v4
-; GISEL-NEXT:    v_mul_lo_u32 v9, v2, v5
-; GISEL-NEXT:    v_mul_hi_u32 v10, v2, v4
+; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v9
+; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, v7, v8, vcc
+; GISEL-NEXT:    v_addc_u32_e64 v7, vcc, 0, v7, s[4:5]
+; GISEL-NEXT:    v_mul_lo_u32 v8, v3, v6
+; GISEL-NEXT:    v_mul_lo_u32 v9, v2, v7
+; GISEL-NEXT:    v_mul_hi_u32 v10, v2, v6
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT:    v_mul_lo_u32 v9, v3, v5
-; GISEL-NEXT:    v_mul_hi_u32 v4, v3, v4
-; GISEL-NEXT:    v_mul_hi_u32 v10, v2, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
+; GISEL-NEXT:    v_mul_lo_u32 v9, v3, v7
+; GISEL-NEXT:    v_mul_hi_u32 v6, v3, v6
+; GISEL-NEXT:    v_mul_hi_u32 v10, v2, v7
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT:    v_mul_hi_u32 v5, v3, v5
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v4
-; GISEL-NEXT:    v_mul_lo_u32 v9, v7, v4
-; GISEL-NEXT:    v_mul_lo_u32 v10, v6, v5
-; GISEL-NEXT:    v_mul_hi_u32 v11, v6, v4
+; GISEL-NEXT:    v_mul_hi_u32 v7, v3, v7
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; GISEL-NEXT:    v_mul_lo_u32 v8, v4, v6
+; GISEL-NEXT:    v_mul_lo_u32 v9, v5, v6
+; GISEL-NEXT:    v_mul_lo_u32 v10, v4, v7
+; GISEL-NEXT:    v_mul_hi_u32 v11, v4, v6
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
 ; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v8
 ; GISEL-NEXT:    v_subb_u32_e64 v8, s[4:5], v3, v9, vcc
 ; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v9
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v7
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v6
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v8, v7
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v8, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, v9, v10, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v6
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v7, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v4
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v5, vcc
 ; GISEL-NEXT:    v_subbrev_u32_e64 v3, vcc, 0, v3, s[4:5]
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, 1, v4
-; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, 0, v5, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v7
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, 1, v6
+; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, 0, v7, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v6
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v9
-; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, 0, v10, vcc
+; GISEL-NEXT:    v_addc_u32_e32 v4, vcc, 0, v10, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v9, v3, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v10, v6, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v10, v4, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_udiv_v2i64_pow2_shl_denom:
@@ -3131,61 +3131,61 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_mov_b32 s6, 0xffffff
-; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v1, 0
-; GISEL-NEXT:    v_and_b32_e32 v3, s6, v4
-; GISEL-NEXT:    v_and_b32_e32 v4, s6, v6
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, v3
-; GISEL-NEXT:    v_sub_i32_e32 v6, vcc, 0, v3
-; GISEL-NEXT:    v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v8, v4
-; GISEL-NEXT:    v_sub_i32_e32 v9, vcc, 0, v4
+; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v7, 0
+; GISEL-NEXT:    v_and_b32_e32 v1, s6, v4
+; GISEL-NEXT:    v_and_b32_e32 v3, s6, v6
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, v1
+; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, 0, v1
+; GISEL-NEXT:    v_subb_u32_e64 v5, s[4:5], 0, 0, vcc
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v8, v3
+; GISEL-NEXT:    v_sub_i32_e32 v9, vcc, 0, v3
 ; GISEL-NEXT:    v_subb_u32_e64 v10, s[4:5], 0, 0, vcc
-; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v1
-; GISEL-NEXT:    v_mac_f32_e32 v8, 0x4f800000, v1
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v1, v5
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v8
-; GISEL-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
-; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
-; GISEL-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v1
-; GISEL-NEXT:    v_mul_f32_e32 v11, 0x2f800000, v5
+; GISEL-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v7
+; GISEL-NEXT:    v_mac_f32_e32 v8, 0x4f800000, v7
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v6
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v7, v8
+; GISEL-NEXT:    v_mul_f32_e32 v6, 0x5f7ffffc, v6
+; GISEL-NEXT:    v_mul_f32_e32 v7, 0x5f7ffffc, v7
+; GISEL-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v6
+; GISEL-NEXT:    v_mul_f32_e32 v11, 0x2f800000, v7
 ; GISEL-NEXT:    v_trunc_f32_e32 v8, v8
 ; GISEL-NEXT:    v_trunc_f32_e32 v11, v11
-; GISEL-NEXT:    v_mac_f32_e32 v1, 0xcf800000, v8
+; GISEL-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v8
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; GISEL-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v11
+; GISEL-NEXT:    v_mac_f32_e32 v7, 0xcf800000, v11
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v11, v11
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GISEL-NEXT:    v_mul_lo_u32 v12, v6, v8
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v8
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; GISEL-NEXT:    v_mul_lo_u32 v13, v9, v11
-; GISEL-NEXT:    v_mul_lo_u32 v14, v6, v1
-; GISEL-NEXT:    v_mul_lo_u32 v15, v7, v1
-; GISEL-NEXT:    v_mul_hi_u32 v16, v6, v1
-; GISEL-NEXT:    v_mul_lo_u32 v17, v9, v5
-; GISEL-NEXT:    v_mul_lo_u32 v18, v10, v5
-; GISEL-NEXT:    v_mul_hi_u32 v19, v9, v5
+; GISEL-NEXT:    v_mul_lo_u32 v14, v4, v6
+; GISEL-NEXT:    v_mul_lo_u32 v15, v5, v6
+; GISEL-NEXT:    v_mul_hi_u32 v16, v4, v6
+; GISEL-NEXT:    v_mul_lo_u32 v17, v9, v7
+; GISEL-NEXT:    v_mul_lo_u32 v18, v10, v7
+; GISEL-NEXT:    v_mul_hi_u32 v19, v9, v7
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v15, v12
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v18, v13
 ; GISEL-NEXT:    v_mul_lo_u32 v15, v11, v17
-; GISEL-NEXT:    v_mul_hi_u32 v18, v5, v17
+; GISEL-NEXT:    v_mul_hi_u32 v18, v7, v17
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v19
-; GISEL-NEXT:    v_mul_lo_u32 v19, v5, v13
+; GISEL-NEXT:    v_mul_lo_u32 v19, v7, v13
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v19
 ; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v18
 ; GISEL-NEXT:    v_mul_lo_u32 v15, v8, v14
-; GISEL-NEXT:    v_mul_hi_u32 v18, v1, v14
+; GISEL-NEXT:    v_mul_hi_u32 v18, v6, v14
 ; GISEL-NEXT:    v_mul_hi_u32 v14, v8, v14
 ; GISEL-NEXT:    v_mul_hi_u32 v17, v11, v17
 ; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v16
-; GISEL-NEXT:    v_mul_lo_u32 v16, v1, v12
+; GISEL-NEXT:    v_mul_lo_u32 v16, v6, v12
 ; GISEL-NEXT:    v_add_i32_e64 v15, s[4:5], v15, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v15, s[4:5], v15, v18
 ; GISEL-NEXT:    v_mul_lo_u32 v15, v8, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v16, s[4:5], v16, v18
-; GISEL-NEXT:    v_mul_hi_u32 v18, v1, v12
+; GISEL-NEXT:    v_mul_hi_u32 v18, v6, v12
 ; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v15, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v18
@@ -3195,7 +3195,7 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_add_i32_e32 v18, vcc, v19, v18
 ; GISEL-NEXT:    v_mul_lo_u32 v19, v11, v13
 ; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v19, v17
-; GISEL-NEXT:    v_mul_hi_u32 v19, v5, v13
+; GISEL-NEXT:    v_mul_hi_u32 v19, v7, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v17, v19
 ; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
@@ -3216,35 +3216,35 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v19, v18
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v14
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v14
 ; GISEL-NEXT:    v_addc_u32_e64 v14, s[4:5], v8, v12, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v15, v6, v1
-; GISEL-NEXT:    v_mul_lo_u32 v7, v7, v1
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v17
+; GISEL-NEXT:    v_mul_lo_u32 v15, v4, v6
+; GISEL-NEXT:    v_mul_lo_u32 v5, v5, v6
+; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v17
 ; GISEL-NEXT:    v_addc_u32_e64 v16, s[6:7], v11, v13, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v17, v9, v5
-; GISEL-NEXT:    v_mul_lo_u32 v10, v10, v5
-; GISEL-NEXT:    v_mul_hi_u32 v18, v9, v5
+; GISEL-NEXT:    v_mul_lo_u32 v17, v9, v7
+; GISEL-NEXT:    v_mul_lo_u32 v10, v10, v7
+; GISEL-NEXT:    v_mul_hi_u32 v18, v9, v7
 ; GISEL-NEXT:    v_mul_lo_u32 v9, v9, v16
 ; GISEL-NEXT:    v_mul_lo_u32 v19, v16, v17
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[6:7], v10, v9
-; GISEL-NEXT:    v_mul_hi_u32 v10, v5, v17
+; GISEL-NEXT:    v_mul_hi_u32 v10, v7, v17
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[6:7], v9, v18
-; GISEL-NEXT:    v_mul_lo_u32 v18, v5, v9
+; GISEL-NEXT:    v_mul_lo_u32 v18, v7, v9
 ; GISEL-NEXT:    v_add_i32_e64 v18, s[6:7], v19, v18
 ; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[6:7]
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[6:7], v18, v10
-; GISEL-NEXT:    v_mul_hi_u32 v10, v6, v1
-; GISEL-NEXT:    v_mul_lo_u32 v6, v6, v14
+; GISEL-NEXT:    v_mul_hi_u32 v10, v4, v6
+; GISEL-NEXT:    v_mul_lo_u32 v4, v4, v14
 ; GISEL-NEXT:    v_mul_lo_u32 v18, v14, v15
-; GISEL-NEXT:    v_add_i32_e64 v6, s[8:9], v7, v6
-; GISEL-NEXT:    v_mul_hi_u32 v7, v1, v15
-; GISEL-NEXT:    v_add_i32_e64 v6, s[8:9], v6, v10
-; GISEL-NEXT:    v_mul_lo_u32 v10, v1, v6
+; GISEL-NEXT:    v_add_i32_e64 v4, s[8:9], v5, v4
+; GISEL-NEXT:    v_mul_hi_u32 v5, v6, v15
+; GISEL-NEXT:    v_add_i32_e64 v4, s[8:9], v4, v10
+; GISEL-NEXT:    v_mul_lo_u32 v10, v6, v4
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[8:9], v18, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[8:9]
-; GISEL-NEXT:    v_add_i32_e64 v7, s[8:9], v10, v7
-; GISEL-NEXT:    v_mov_b32_e32 v7, s10
+; GISEL-NEXT:    v_add_i32_e64 v5, s[8:9], v10, v5
+; GISEL-NEXT:    v_mov_b32_e32 v5, s10
 ; GISEL-NEXT:    v_mov_b32_e32 v10, s11
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[10:11], v8, v12
 ; GISEL-NEXT:    v_mov_b32_e32 v12, s12
@@ -3253,26 +3253,26 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_hi_u32 v15, v16, v17
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[8:9]
 ; GISEL-NEXT:    v_add_i32_e64 v17, s[8:9], v18, v17
-; GISEL-NEXT:    v_mul_lo_u32 v18, v14, v6
-; GISEL-NEXT:    v_mul_hi_u32 v14, v14, v6
-; GISEL-NEXT:    v_mul_hi_u32 v6, v1, v6
+; GISEL-NEXT:    v_mul_lo_u32 v18, v14, v4
+; GISEL-NEXT:    v_mul_hi_u32 v14, v14, v4
+; GISEL-NEXT:    v_mul_hi_u32 v4, v6, v4
 ; GISEL-NEXT:    v_add_i32_e64 v13, s[8:9], v18, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[8:9]
-; GISEL-NEXT:    v_add_i32_e64 v6, s[8:9], v13, v6
+; GISEL-NEXT:    v_add_i32_e64 v4, s[8:9], v13, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[8:9]
 ; GISEL-NEXT:    v_add_i32_e64 v13, s[8:9], v18, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[6:7]
 ; GISEL-NEXT:    v_add_i32_e64 v18, s[6:7], v19, v18
 ; GISEL-NEXT:    v_mul_lo_u32 v19, v16, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v16, v16, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v5, v9
+; GISEL-NEXT:    v_mul_hi_u32 v9, v7, v9
 ; GISEL-NEXT:    v_add_i32_e64 v15, s[6:7], v19, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[6:7]
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[6:7], v15, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[6:7]
 ; GISEL-NEXT:    v_add_i32_e64 v15, s[6:7], v19, v15
 ; GISEL-NEXT:    v_mov_b32_e32 v19, s13
-; GISEL-NEXT:    v_add_i32_e64 v6, s[6:7], v6, v17
+; GISEL-NEXT:    v_add_i32_e64 v4, s[6:7], v4, v17
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[6:7]
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[6:7], v9, v18
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[6:7]
@@ -3282,16 +3282,16 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_add_i32_e64 v14, s[6:7], v16, v15
 ; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v8, v13, vcc
 ; GISEL-NEXT:    v_addc_u32_e64 v11, vcc, v11, v14, s[4:5]
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v6
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
 ; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, 0, v8, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, 0, v1
-; GISEL-NEXT:    v_mul_hi_u32 v13, v0, v1
-; GISEL-NEXT:    v_mul_hi_u32 v1, 0, v1
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT:    v_mul_lo_u32 v8, 0, v4
+; GISEL-NEXT:    v_mul_hi_u32 v13, v0, v4
+; GISEL-NEXT:    v_mul_hi_u32 v4, 0, v4
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v11, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v11, 0, v5
-; GISEL-NEXT:    v_mul_hi_u32 v14, v2, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, 0, v5
+; GISEL-NEXT:    v_mul_lo_u32 v11, 0, v7
+; GISEL-NEXT:    v_mul_hi_u32 v14, v2, v7
+; GISEL-NEXT:    v_mul_hi_u32 v7, 0, v7
 ; GISEL-NEXT:    v_mul_lo_u32 v15, v0, v6
 ; GISEL-NEXT:    v_mul_lo_u32 v16, 0, v6
 ; GISEL-NEXT:    v_mul_hi_u32 v17, v0, v6
@@ -3305,54 +3305,54 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_hi_u32 v9, 0, v9
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v1, s[4:5], v16, v1
+; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v16, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v11, v5
+; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v11, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v1, s[4:5], v1, v17
+; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v17
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v14
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v15, v8
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v16, v13
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v18, v17
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v8
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v15
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v13, v8
-; GISEL-NEXT:    v_mul_lo_u32 v13, v3, v1
-; GISEL-NEXT:    v_mul_lo_u32 v15, 0, v1
-; GISEL-NEXT:    v_mul_hi_u32 v16, v3, v1
+; GISEL-NEXT:    v_mul_lo_u32 v13, v1, v4
+; GISEL-NEXT:    v_mul_lo_u32 v15, 0, v4
+; GISEL-NEXT:    v_mul_hi_u32 v16, v1, v4
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
-; GISEL-NEXT:    v_mul_lo_u32 v14, v4, v5
-; GISEL-NEXT:    v_mul_lo_u32 v17, 0, v5
-; GISEL-NEXT:    v_mul_hi_u32 v18, v4, v5
+; GISEL-NEXT:    v_mul_lo_u32 v14, v3, v7
+; GISEL-NEXT:    v_mul_lo_u32 v17, 0, v7
+; GISEL-NEXT:    v_mul_hi_u32 v18, v3, v7
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v11
-; GISEL-NEXT:    v_mul_lo_u32 v9, v3, v6
-; GISEL-NEXT:    v_mul_lo_u32 v11, v4, v8
+; GISEL-NEXT:    v_mul_lo_u32 v9, v1, v6
+; GISEL-NEXT:    v_mul_lo_u32 v11, v3, v8
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v15, v9
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v17, v11
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, 1, v1
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, 1, v4
 ; GISEL-NEXT:    v_addc_u32_e32 v17, vcc, 0, v6, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v16
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v18
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v13
 ; GISEL-NEXT:    v_subb_u32_e64 v13, s[4:5], 0, v9, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v3
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v13
-; GISEL-NEXT:    v_add_i32_e64 v13, s[6:7], 1, v5
+; GISEL-NEXT:    v_add_i32_e64 v13, s[6:7], 1, v7
 ; GISEL-NEXT:    v_addc_u32_e64 v18, s[6:7], 0, v8, s[6:7]
 ; GISEL-NEXT:    v_sub_i32_e64 v2, s[6:7], v2, v14
 ; GISEL-NEXT:    v_subb_u32_e64 v14, s[8:9], 0, v11, s[6:7]
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, v7, v16, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, v16, s[4:5]
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, v12, v16, s[4:5]
@@ -3360,34 +3360,34 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_addc_u32_e64 v16, s[4:5], 0, v17, s[4:5]
 ; GISEL-NEXT:    v_sub_i32_e64 v11, s[4:5], 0, v11
 ; GISEL-NEXT:    v_subbrev_u32_e64 v11, s[4:5], 0, v11, s[6:7]
-; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v4
+; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v3
 ; GISEL-NEXT:    v_subbrev_u32_e64 v11, s[4:5], 0, v11, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v4
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v3
 ; GISEL-NEXT:    v_add_i32_e64 v2, s[6:7], 1, v13
-; GISEL-NEXT:    v_addc_u32_e64 v4, s[6:7], 0, v18, s[6:7]
+; GISEL-NEXT:    v_addc_u32_e64 v3, s[6:7], 0, v18, s[6:7]
 ; GISEL-NEXT:    v_sub_i32_e64 v9, s[6:7], 0, v9
 ; GISEL-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v9, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GISEL-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v9, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v3
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v10, v0, vcc
 ; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v11
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v19, v1, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v15, v14, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v3
-; GISEL-NEXT:    v_cndmask_b32_e64 v2, v13, v2, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v17, v16, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, v18, v4, s[4:5]
+; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, v13, v2, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, v17, v16, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v18, v3, s[4:5]
 ; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v3, v8, v4, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, v7, v1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v6, v9, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v8, v3, s[4:5]
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_udiv_v2i64_24bit:

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index cddd490c32f89..f1d76a90b74a4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -1827,29 +1827,29 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_mov_b64 s[4:5], 0x1000
-; GISEL-NEXT:    v_lshl_b64 v[4:5], s[4:5], v4
-; GISEL-NEXT:    v_lshl_b64 v[6:7], s[4:5], v6
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v8, v4
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v9, v5
-; GISEL-NEXT:    v_mac_f32_e32 v8, 0x4f800000, v9
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v8, v8
-; GISEL-NEXT:    v_mul_f32_e32 v8, 0x5f7ffffc, v8
-; GISEL-NEXT:    v_mul_f32_e32 v9, 0x2f800000, v8
+; GISEL-NEXT:    v_lshl_b64 v[7:8], s[4:5], v4
+; GISEL-NEXT:    v_lshl_b64 v[4:5], s[4:5], v6
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, v7
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v9, v8
+; GISEL-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v9
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v6
+; GISEL-NEXT:    v_mul_f32_e32 v6, 0x5f7ffffc, v6
+; GISEL-NEXT:    v_mul_f32_e32 v9, 0x2f800000, v6
 ; GISEL-NEXT:    v_trunc_f32_e32 v9, v9
-; GISEL-NEXT:    v_mac_f32_e32 v8, 0xcf800000, v9
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v8, v8
+; GISEL-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v9
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v9, v9
-; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, 0, v4
-; GISEL-NEXT:    v_subb_u32_e32 v11, vcc, 0, v5, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v12, v10, v8
-; GISEL-NEXT:    v_mul_lo_u32 v13, v11, v8
+; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, 0, v7
+; GISEL-NEXT:    v_subb_u32_e32 v11, vcc, 0, v8, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v12, v10, v6
+; GISEL-NEXT:    v_mul_lo_u32 v13, v11, v6
 ; GISEL-NEXT:    v_mul_lo_u32 v14, v10, v9
-; GISEL-NEXT:    v_mul_hi_u32 v15, v10, v8
+; GISEL-NEXT:    v_mul_hi_u32 v15, v10, v6
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
 ; GISEL-NEXT:    v_mul_lo_u32 v14, v9, v12
-; GISEL-NEXT:    v_mul_lo_u32 v15, v8, v13
-; GISEL-NEXT:    v_mul_hi_u32 v16, v8, v12
+; GISEL-NEXT:    v_mul_lo_u32 v15, v6, v13
+; GISEL-NEXT:    v_mul_hi_u32 v16, v6, v12
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
@@ -1857,7 +1857,7 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
 ; GISEL-NEXT:    v_mul_lo_u32 v15, v9, v13
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v9, v12
-; GISEL-NEXT:    v_mul_hi_u32 v16, v8, v13
+; GISEL-NEXT:    v_mul_hi_u32 v16, v6, v13
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v15, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v16
@@ -1868,18 +1868,18 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
 ; GISEL-NEXT:    v_mul_hi_u32 v13, v9, v13
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v12
 ; GISEL-NEXT:    v_addc_u32_e64 v12, s[4:5], v9, v13, vcc
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v13
-; GISEL-NEXT:    v_mul_lo_u32 v13, v10, v8
-; GISEL-NEXT:    v_mul_lo_u32 v11, v11, v8
+; GISEL-NEXT:    v_mul_lo_u32 v13, v10, v6
+; GISEL-NEXT:    v_mul_lo_u32 v11, v11, v6
 ; GISEL-NEXT:    v_mul_lo_u32 v14, v10, v12
-; GISEL-NEXT:    v_mul_hi_u32 v10, v10, v8
+; GISEL-NEXT:    v_mul_hi_u32 v10, v10, v6
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v12, v13
-; GISEL-NEXT:    v_mul_lo_u32 v14, v8, v10
-; GISEL-NEXT:    v_mul_hi_u32 v15, v8, v13
+; GISEL-NEXT:    v_mul_lo_u32 v14, v6, v10
+; GISEL-NEXT:    v_mul_hi_u32 v15, v6, v13
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v15
@@ -1887,7 +1887,7 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v14, v11
 ; GISEL-NEXT:    v_mul_lo_u32 v14, v12, v10
 ; GISEL-NEXT:    v_mul_hi_u32 v13, v12, v13
-; GISEL-NEXT:    v_mul_hi_u32 v15, v8, v10
+; GISEL-NEXT:    v_mul_hi_u32 v15, v6, v10
 ; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v15
@@ -1898,92 +1898,92 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v13
 ; GISEL-NEXT:    v_mul_hi_u32 v10, v12, v10
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
+; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v11
 ; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v10, vcc
 ; GISEL-NEXT:    v_addc_u32_e64 v9, vcc, 0, v9, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v10, v1, v8
+; GISEL-NEXT:    v_mul_lo_u32 v10, v1, v6
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v0, v9
-; GISEL-NEXT:    v_mul_hi_u32 v12, v0, v8
+; GISEL-NEXT:    v_mul_hi_u32 v12, v0, v6
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v1, v9
-; GISEL-NEXT:    v_mul_hi_u32 v8, v1, v8
+; GISEL-NEXT:    v_mul_hi_u32 v6, v1, v6
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v0, v9
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v11, v8
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v11, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; GISEL-NEXT:    v_mul_hi_u32 v9, v1, v9
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; GISEL-NEXT:    v_mul_lo_u32 v10, v4, v8
-; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v8
-; GISEL-NEXT:    v_mul_lo_u32 v9, v4, v9
-; GISEL-NEXT:    v_mul_hi_u32 v8, v4, v8
+; GISEL-NEXT:    v_mul_lo_u32 v10, v7, v6
+; GISEL-NEXT:    v_mul_lo_u32 v11, v8, v6
+; GISEL-NEXT:    v_mul_lo_u32 v9, v7, v9
+; GISEL-NEXT:    v_mul_hi_u32 v6, v7, v6
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT:    v_subb_u32_e64 v9, s[4:5], v1, v8, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v8
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v4
+; GISEL-NEXT:    v_subb_u32_e64 v9, s[4:5], v1, v6, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v6
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v9, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, v8, v10, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e64 v10, s[4:5], v0, v4
-; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v9, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, v10, s[4:5]
+; GISEL-NEXT:    v_sub_i32_e64 v10, s[4:5], v0, v7
+; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v8, vcc
 ; GISEL-NEXT:    v_subbrev_u32_e64 v11, vcc, 0, v1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v11, v5
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v11, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v10, v4
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v10, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v11, v5
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v11, v8
 ; GISEL-NEXT:    v_cndmask_b32_e32 v12, v12, v13, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v10, v4
-; GISEL-NEXT:    v_subb_u32_e64 v1, s[4:5], v1, v5, s[4:5]
+; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, v10, v7
+; GISEL-NEXT:    v_subb_u32_e64 v1, s[4:5], v1, v8, s[4:5]
 ; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v12
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, v10, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v7, v10, v7, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v11, v1, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v6
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, v7
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT:    v_trunc_f32_e32 v5, v5
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, 0, v6
-; GISEL-NEXT:    v_subb_u32_e32 v9, vcc, 0, v7, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v4
-; GISEL-NEXT:    v_mul_lo_u32 v11, v9, v4
-; GISEL-NEXT:    v_mul_lo_u32 v12, v8, v5
-; GISEL-NEXT:    v_mul_hi_u32 v13, v8, v4
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, v4
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v7, v5
+; GISEL-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v7
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v6
+; GISEL-NEXT:    v_mul_f32_e32 v6, 0x5f7ffffc, v6
+; GISEL-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v6
+; GISEL-NEXT:    v_trunc_f32_e32 v7, v7
+; GISEL-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v7
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, 0, v4
+; GISEL-NEXT:    v_subb_u32_e32 v9, vcc, 0, v5, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v6
+; GISEL-NEXT:    v_mul_lo_u32 v11, v9, v6
+; GISEL-NEXT:    v_mul_lo_u32 v12, v8, v7
+; GISEL-NEXT:    v_mul_hi_u32 v13, v8, v6
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT:    v_mul_lo_u32 v12, v5, v10
-; GISEL-NEXT:    v_mul_lo_u32 v13, v4, v11
-; GISEL-NEXT:    v_mul_hi_u32 v14, v4, v10
+; GISEL-NEXT:    v_mul_lo_u32 v12, v7, v10
+; GISEL-NEXT:    v_mul_lo_u32 v13, v6, v11
+; GISEL-NEXT:    v_mul_hi_u32 v14, v6, v10
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT:    v_mul_lo_u32 v13, v5, v11
-; GISEL-NEXT:    v_mul_hi_u32 v10, v5, v10
-; GISEL-NEXT:    v_mul_hi_u32 v14, v4, v11
+; GISEL-NEXT:    v_mul_lo_u32 v13, v7, v11
+; GISEL-NEXT:    v_mul_hi_u32 v10, v7, v10
+; GISEL-NEXT:    v_mul_hi_u32 v14, v6, v11
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v13, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v14
@@ -1992,20 +1992,20 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT:    v_mul_hi_u32 v11, v5, v11
+; GISEL-NEXT:    v_mul_hi_u32 v11, v7, v11
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
-; GISEL-NEXT:    v_addc_u32_e64 v10, s[4:5], v5, v11, vcc
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v11
-; GISEL-NEXT:    v_mul_lo_u32 v11, v8, v4
-; GISEL-NEXT:    v_mul_lo_u32 v9, v9, v4
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
+; GISEL-NEXT:    v_addc_u32_e64 v10, s[4:5], v7, v11, vcc
+; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v11
+; GISEL-NEXT:    v_mul_lo_u32 v11, v8, v6
+; GISEL-NEXT:    v_mul_lo_u32 v9, v9, v6
 ; GISEL-NEXT:    v_mul_lo_u32 v12, v8, v10
-; GISEL-NEXT:    v_mul_hi_u32 v8, v8, v4
+; GISEL-NEXT:    v_mul_hi_u32 v8, v8, v6
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
 ; GISEL-NEXT:    v_mul_lo_u32 v9, v10, v11
-; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v8
-; GISEL-NEXT:    v_mul_hi_u32 v13, v4, v11
+; GISEL-NEXT:    v_mul_lo_u32 v12, v6, v8
+; GISEL-NEXT:    v_mul_hi_u32 v13, v6, v11
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v13
@@ -2013,7 +2013,7 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v12, v9
 ; GISEL-NEXT:    v_mul_lo_u32 v12, v10, v8
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v10, v11
-; GISEL-NEXT:    v_mul_hi_u32 v13, v4, v8
+; GISEL-NEXT:    v_mul_hi_u32 v13, v6, v8
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
@@ -2024,63 +2024,63 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
 ; GISEL-NEXT:    v_mul_hi_u32 v8, v10, v8
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
-; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v9
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v8, vcc
-; GISEL-NEXT:    v_addc_u32_e64 v5, vcc, 0, v5, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v8, v3, v4
-; GISEL-NEXT:    v_mul_lo_u32 v9, v2, v5
-; GISEL-NEXT:    v_mul_hi_u32 v10, v2, v4
+; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v9
+; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, v7, v8, vcc
+; GISEL-NEXT:    v_addc_u32_e64 v7, vcc, 0, v7, s[4:5]
+; GISEL-NEXT:    v_mul_lo_u32 v8, v3, v6
+; GISEL-NEXT:    v_mul_lo_u32 v9, v2, v7
+; GISEL-NEXT:    v_mul_hi_u32 v10, v2, v6
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT:    v_mul_lo_u32 v9, v3, v5
-; GISEL-NEXT:    v_mul_hi_u32 v4, v3, v4
-; GISEL-NEXT:    v_mul_hi_u32 v10, v2, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
+; GISEL-NEXT:    v_mul_lo_u32 v9, v3, v7
+; GISEL-NEXT:    v_mul_hi_u32 v6, v3, v6
+; GISEL-NEXT:    v_mul_hi_u32 v10, v2, v7
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT:    v_mul_hi_u32 v5, v3, v5
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v4
-; GISEL-NEXT:    v_mul_lo_u32 v9, v7, v4
-; GISEL-NEXT:    v_mul_lo_u32 v5, v6, v5
-; GISEL-NEXT:    v_mul_hi_u32 v4, v6, v4
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GISEL-NEXT:    v_mul_hi_u32 v7, v3, v7
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; GISEL-NEXT:    v_mul_lo_u32 v8, v4, v6
+; GISEL-NEXT:    v_mul_lo_u32 v9, v5, v6
+; GISEL-NEXT:    v_mul_lo_u32 v7, v4, v7
+; GISEL-NEXT:    v_mul_hi_u32 v6, v4, v6
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v8
-; GISEL-NEXT:    v_subb_u32_e64 v5, s[4:5], v3, v4, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v4
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v6
+; GISEL-NEXT:    v_subb_u32_e64 v7, s[4:5], v3, v6, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v6
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v5, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, v8, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e64 v8, s[4:5], v2, v6
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v7, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, v8, s[4:5]
+; GISEL-NEXT:    v_sub_i32_e64 v8, s[4:5], v2, v4
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v5, vcc
 ; GISEL-NEXT:    v_subbrev_u32_e64 v9, vcc, 0, v3, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v9, v7
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v9, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v8, v6
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v8, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v9, v7
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v9, v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v10, v10, v11, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v6, vcc, v8, v6
-; GISEL-NEXT:    v_subb_u32_e64 v3, s[4:5], v3, v7, s[4:5]
+; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v8, v4
+; GISEL-NEXT:    v_subb_u32_e64 v3, s[4:5], v3, v5, s[4:5]
 ; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GISEL-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_urem_v2i64_pow2_shl_denom:
@@ -2480,61 +2480,61 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_mov_b32 s6, 0xffffff
-; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v1, 0
+; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v7, 0
 ; GISEL-NEXT:    v_and_b32_e32 v3, s6, v4
-; GISEL-NEXT:    v_and_b32_e32 v4, s6, v6
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, v3
-; GISEL-NEXT:    v_sub_i32_e32 v6, vcc, 0, v3
-; GISEL-NEXT:    v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v8, v4
-; GISEL-NEXT:    v_sub_i32_e32 v9, vcc, 0, v4
+; GISEL-NEXT:    v_and_b32_e32 v1, s6, v6
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, v3
+; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, 0, v3
+; GISEL-NEXT:    v_subb_u32_e64 v5, s[4:5], 0, 0, vcc
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v8, v1
+; GISEL-NEXT:    v_sub_i32_e32 v9, vcc, 0, v1
 ; GISEL-NEXT:    v_subb_u32_e64 v10, s[4:5], 0, 0, vcc
-; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v1
-; GISEL-NEXT:    v_mac_f32_e32 v8, 0x4f800000, v1
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v1, v5
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v8
-; GISEL-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
-; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
-; GISEL-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v1
-; GISEL-NEXT:    v_mul_f32_e32 v11, 0x2f800000, v5
+; GISEL-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v7
+; GISEL-NEXT:    v_mac_f32_e32 v8, 0x4f800000, v7
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v6
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v7, v8
+; GISEL-NEXT:    v_mul_f32_e32 v6, 0x5f7ffffc, v6
+; GISEL-NEXT:    v_mul_f32_e32 v7, 0x5f7ffffc, v7
+; GISEL-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v6
+; GISEL-NEXT:    v_mul_f32_e32 v11, 0x2f800000, v7
 ; GISEL-NEXT:    v_trunc_f32_e32 v8, v8
 ; GISEL-NEXT:    v_trunc_f32_e32 v11, v11
-; GISEL-NEXT:    v_mac_f32_e32 v1, 0xcf800000, v8
+; GISEL-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v8
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; GISEL-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v11
+; GISEL-NEXT:    v_mac_f32_e32 v7, 0xcf800000, v11
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v11, v11
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GISEL-NEXT:    v_mul_lo_u32 v12, v6, v8
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v8
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; GISEL-NEXT:    v_mul_lo_u32 v13, v9, v11
-; GISEL-NEXT:    v_mul_lo_u32 v14, v6, v1
-; GISEL-NEXT:    v_mul_lo_u32 v15, v7, v1
-; GISEL-NEXT:    v_mul_hi_u32 v16, v6, v1
-; GISEL-NEXT:    v_mul_lo_u32 v17, v9, v5
-; GISEL-NEXT:    v_mul_lo_u32 v18, v10, v5
-; GISEL-NEXT:    v_mul_hi_u32 v19, v9, v5
+; GISEL-NEXT:    v_mul_lo_u32 v14, v4, v6
+; GISEL-NEXT:    v_mul_lo_u32 v15, v5, v6
+; GISEL-NEXT:    v_mul_hi_u32 v16, v4, v6
+; GISEL-NEXT:    v_mul_lo_u32 v17, v9, v7
+; GISEL-NEXT:    v_mul_lo_u32 v18, v10, v7
+; GISEL-NEXT:    v_mul_hi_u32 v19, v9, v7
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v15, v12
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v18, v13
 ; GISEL-NEXT:    v_mul_lo_u32 v15, v11, v17
-; GISEL-NEXT:    v_mul_hi_u32 v18, v5, v17
+; GISEL-NEXT:    v_mul_hi_u32 v18, v7, v17
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v19
-; GISEL-NEXT:    v_mul_lo_u32 v19, v5, v13
+; GISEL-NEXT:    v_mul_lo_u32 v19, v7, v13
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v19
 ; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v18
 ; GISEL-NEXT:    v_mul_lo_u32 v15, v8, v14
-; GISEL-NEXT:    v_mul_hi_u32 v18, v1, v14
+; GISEL-NEXT:    v_mul_hi_u32 v18, v6, v14
 ; GISEL-NEXT:    v_mul_hi_u32 v14, v8, v14
 ; GISEL-NEXT:    v_mul_hi_u32 v17, v11, v17
 ; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v16
-; GISEL-NEXT:    v_mul_lo_u32 v16, v1, v12
+; GISEL-NEXT:    v_mul_lo_u32 v16, v6, v12
 ; GISEL-NEXT:    v_add_i32_e64 v15, s[4:5], v15, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v15, s[4:5], v15, v18
 ; GISEL-NEXT:    v_mul_lo_u32 v15, v8, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v16, s[4:5], v16, v18
-; GISEL-NEXT:    v_mul_hi_u32 v18, v1, v12
+; GISEL-NEXT:    v_mul_hi_u32 v18, v6, v12
 ; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v15, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v18
@@ -2544,7 +2544,7 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_add_i32_e32 v18, vcc, v19, v18
 ; GISEL-NEXT:    v_mul_lo_u32 v19, v11, v13
 ; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v19, v17
-; GISEL-NEXT:    v_mul_hi_u32 v19, v5, v13
+; GISEL-NEXT:    v_mul_hi_u32 v19, v7, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v17, v19
 ; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
@@ -2565,35 +2565,35 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v19, v18
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v14
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v14
 ; GISEL-NEXT:    v_addc_u32_e64 v14, s[4:5], v8, v12, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v15, v6, v1
-; GISEL-NEXT:    v_mul_lo_u32 v7, v7, v1
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v17
+; GISEL-NEXT:    v_mul_lo_u32 v15, v4, v6
+; GISEL-NEXT:    v_mul_lo_u32 v5, v5, v6
+; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v17
 ; GISEL-NEXT:    v_addc_u32_e64 v16, s[6:7], v11, v13, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v17, v9, v5
-; GISEL-NEXT:    v_mul_lo_u32 v10, v10, v5
-; GISEL-NEXT:    v_mul_hi_u32 v18, v9, v5
+; GISEL-NEXT:    v_mul_lo_u32 v17, v9, v7
+; GISEL-NEXT:    v_mul_lo_u32 v10, v10, v7
+; GISEL-NEXT:    v_mul_hi_u32 v18, v9, v7
 ; GISEL-NEXT:    v_mul_lo_u32 v9, v9, v16
 ; GISEL-NEXT:    v_mul_lo_u32 v19, v16, v17
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[6:7], v10, v9
-; GISEL-NEXT:    v_mul_hi_u32 v10, v5, v17
+; GISEL-NEXT:    v_mul_hi_u32 v10, v7, v17
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[6:7], v9, v18
-; GISEL-NEXT:    v_mul_lo_u32 v18, v5, v9
+; GISEL-NEXT:    v_mul_lo_u32 v18, v7, v9
 ; GISEL-NEXT:    v_add_i32_e64 v18, s[6:7], v19, v18
 ; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[6:7]
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[6:7], v18, v10
-; GISEL-NEXT:    v_mul_hi_u32 v10, v6, v1
-; GISEL-NEXT:    v_mul_lo_u32 v6, v6, v14
+; GISEL-NEXT:    v_mul_hi_u32 v10, v4, v6
+; GISEL-NEXT:    v_mul_lo_u32 v4, v4, v14
 ; GISEL-NEXT:    v_mul_lo_u32 v18, v14, v15
-; GISEL-NEXT:    v_add_i32_e64 v6, s[8:9], v7, v6
-; GISEL-NEXT:    v_mul_hi_u32 v7, v1, v15
-; GISEL-NEXT:    v_add_i32_e64 v6, s[8:9], v6, v10
-; GISEL-NEXT:    v_mul_lo_u32 v10, v1, v6
+; GISEL-NEXT:    v_add_i32_e64 v4, s[8:9], v5, v4
+; GISEL-NEXT:    v_mul_hi_u32 v5, v6, v15
+; GISEL-NEXT:    v_add_i32_e64 v4, s[8:9], v4, v10
+; GISEL-NEXT:    v_mul_lo_u32 v10, v6, v4
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[8:9], v18, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[8:9]
-; GISEL-NEXT:    v_add_i32_e64 v7, s[8:9], v10, v7
-; GISEL-NEXT:    v_mov_b32_e32 v7, s10
+; GISEL-NEXT:    v_add_i32_e64 v5, s[8:9], v10, v5
+; GISEL-NEXT:    v_mov_b32_e32 v5, s10
 ; GISEL-NEXT:    v_mov_b32_e32 v10, s11
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[10:11], v8, v12
 ; GISEL-NEXT:    v_mov_b32_e32 v12, s12
@@ -2602,26 +2602,26 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_hi_u32 v15, v16, v17
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[8:9]
 ; GISEL-NEXT:    v_add_i32_e64 v17, s[8:9], v18, v17
-; GISEL-NEXT:    v_mul_lo_u32 v18, v14, v6
-; GISEL-NEXT:    v_mul_hi_u32 v14, v14, v6
-; GISEL-NEXT:    v_mul_hi_u32 v6, v1, v6
+; GISEL-NEXT:    v_mul_lo_u32 v18, v14, v4
+; GISEL-NEXT:    v_mul_hi_u32 v14, v14, v4
+; GISEL-NEXT:    v_mul_hi_u32 v4, v6, v4
 ; GISEL-NEXT:    v_add_i32_e64 v13, s[8:9], v18, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[8:9]
-; GISEL-NEXT:    v_add_i32_e64 v6, s[8:9], v13, v6
+; GISEL-NEXT:    v_add_i32_e64 v4, s[8:9], v13, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[8:9]
 ; GISEL-NEXT:    v_add_i32_e64 v13, s[8:9], v18, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[6:7]
 ; GISEL-NEXT:    v_add_i32_e64 v18, s[6:7], v19, v18
 ; GISEL-NEXT:    v_mul_lo_u32 v19, v16, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v16, v16, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v5, v9
+; GISEL-NEXT:    v_mul_hi_u32 v9, v7, v9
 ; GISEL-NEXT:    v_add_i32_e64 v15, s[6:7], v19, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[6:7]
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[6:7], v15, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[6:7]
 ; GISEL-NEXT:    v_add_i32_e64 v15, s[6:7], v19, v15
 ; GISEL-NEXT:    v_mov_b32_e32 v19, s13
-; GISEL-NEXT:    v_add_i32_e64 v6, s[6:7], v6, v17
+; GISEL-NEXT:    v_add_i32_e64 v4, s[6:7], v4, v17
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[6:7]
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[6:7], v9, v18
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[6:7]
@@ -2631,16 +2631,16 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_add_i32_e64 v14, s[6:7], v16, v15
 ; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v8, v13, vcc
 ; GISEL-NEXT:    v_addc_u32_e64 v11, vcc, v11, v14, s[4:5]
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v6
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
 ; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, 0, v8, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, 0, v1
-; GISEL-NEXT:    v_mul_hi_u32 v13, v0, v1
-; GISEL-NEXT:    v_mul_hi_u32 v1, 0, v1
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT:    v_mul_lo_u32 v8, 0, v4
+; GISEL-NEXT:    v_mul_hi_u32 v13, v0, v4
+; GISEL-NEXT:    v_mul_hi_u32 v4, 0, v4
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v11, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v11, 0, v5
-; GISEL-NEXT:    v_mul_hi_u32 v14, v2, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, 0, v5
+; GISEL-NEXT:    v_mul_lo_u32 v11, 0, v7
+; GISEL-NEXT:    v_mul_hi_u32 v14, v2, v7
+; GISEL-NEXT:    v_mul_hi_u32 v7, 0, v7
 ; GISEL-NEXT:    v_mul_lo_u32 v15, v0, v6
 ; GISEL-NEXT:    v_mul_lo_u32 v16, 0, v6
 ; GISEL-NEXT:    v_mul_hi_u32 v17, v0, v6
@@ -2654,84 +2654,84 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_hi_u32 v9, 0, v9
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v1, s[4:5], v16, v1
+; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v16, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v11, v5
+; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v11, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v1, s[4:5], v1, v17
+; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v17
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v14
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v15, v8
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v16, v13
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v18, v17
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v8
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v15
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v13, v8
-; GISEL-NEXT:    v_mul_lo_u32 v13, v3, v1
-; GISEL-NEXT:    v_mul_lo_u32 v15, 0, v1
-; GISEL-NEXT:    v_mul_hi_u32 v1, v3, v1
+; GISEL-NEXT:    v_mul_lo_u32 v13, v3, v4
+; GISEL-NEXT:    v_mul_lo_u32 v15, 0, v4
+; GISEL-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
-; GISEL-NEXT:    v_mul_lo_u32 v14, v4, v5
-; GISEL-NEXT:    v_mul_lo_u32 v16, 0, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v4, v5
+; GISEL-NEXT:    v_mul_lo_u32 v14, v1, v7
+; GISEL-NEXT:    v_mul_lo_u32 v16, 0, v7
+; GISEL-NEXT:    v_mul_hi_u32 v7, v1, v7
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v11
 ; GISEL-NEXT:    v_mul_lo_u32 v6, v3, v6
-; GISEL-NEXT:    v_mul_lo_u32 v8, v4, v8
+; GISEL-NEXT:    v_mul_lo_u32 v8, v1, v8
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v15, v6
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v16, v8
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v6, v1
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v8, v7
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v13
-; GISEL-NEXT:    v_subb_u32_e64 v6, s[4:5], 0, v1, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v1, s[4:5], 0, v1
+; GISEL-NEXT:    v_subb_u32_e64 v7, s[4:5], 0, v4, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v4, s[4:5], 0, v4
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v14
-; GISEL-NEXT:    v_subb_u32_e64 v9, s[6:7], 0, v5, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e64 v5, s[6:7], 0, v5
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[6:7], v2, v4
+; GISEL-NEXT:    v_subb_u32_e64 v9, s[6:7], 0, v6, s[4:5]
+; GISEL-NEXT:    v_sub_i32_e64 v6, s[6:7], 0, v6
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[6:7], v2, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[6:7]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[6:7]
-; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, v8, s[6:7]
+; GISEL-NEXT:    v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
 ; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
 ; GISEL-NEXT:    v_cndmask_b32_e32 v8, v12, v11, vcc
-; GISEL-NEXT:    v_subbrev_u32_e64 v5, vcc, 0, v5, s[4:5]
+; GISEL-NEXT:    v_subbrev_u32_e64 v6, vcc, 0, v6, s[4:5]
 ; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, v0, v3
-; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
+; GISEL-NEXT:    v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v11, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, -1, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v13, vcc, v2, v4
-; GISEL-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v13, v4
+; GISEL-NEXT:    v_sub_i32_e32 v13, vcc, v2, v1
+; GISEL-NEXT:    v_subbrev_u32_e32 v6, vcc, 0, v6, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v13, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
 ; GISEL-NEXT:    v_cndmask_b32_e32 v10, v10, v12, vcc
 ; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, v11, v3
-; GISEL-NEXT:    v_subbrev_u32_e32 v12, vcc, 0, v1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
+; GISEL-NEXT:    v_subbrev_u32_e32 v12, vcc, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
 ; GISEL-NEXT:    v_cndmask_b32_e32 v14, v19, v14, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v13, v4
-; GISEL-NEXT:    v_subbrev_u32_e32 v15, vcc, 0, v5, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v13, v1
+; GISEL-NEXT:    v_subbrev_u32_e32 v15, vcc, 0, v6, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
 ; GISEL-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, v13, v4, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v12, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, v13, v1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v3, v5, v15, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v6, v15, s[4:5]
 ; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v7, v4, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s[4:5]
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index bfc1717e8275c..9578bc409ff13 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -11241,16 +11241,16 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
-; GFX6-NEXT:    s_ashr_i32 s12, s3, 31
-; GFX6-NEXT:    s_add_u32 s2, s2, s12
-; GFX6-NEXT:    s_mov_b32 s13, s12
-; GFX6-NEXT:    s_addc_u32 s3, s3, s12
-; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[12:13]
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s3
-; GFX6-NEXT:    s_sub_u32 s4, 0, s2
-; GFX6-NEXT:    s_subb_u32 s5, 0, s3
+; GFX6-NEXT:    s_lshl_b64 s[4:5], s[2:3], s4
+; GFX6-NEXT:    s_ashr_i32 s2, s5, 31
+; GFX6-NEXT:    s_add_u32 s4, s4, s2
+; GFX6-NEXT:    s_mov_b32 s3, s2
+; GFX6-NEXT:    s_addc_u32 s5, s5, s2
+; GFX6-NEXT:    s_xor_b64 s[12:13], s[4:5], s[2:3]
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s12
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s13
+; GFX6-NEXT:    s_sub_u32 s4, 0, s12
+; GFX6-NEXT:    s_subb_u32 s5, 0, s13
 ; GFX6-NEXT:    s_ashr_i32 s14, s11, 31
 ; GFX6-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
@@ -11328,23 +11328,23 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
-; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
-; GFX6-NEXT:    v_mul_lo_u32 v4, s3, v0
-; GFX6-NEXT:    v_mov_b32_e32 v5, s3
+; GFX6-NEXT:    v_mul_lo_u32 v2, s12, v1
+; GFX6-NEXT:    v_mul_hi_u32 v3, s12, v0
+; GFX6-NEXT:    v_mul_lo_u32 v4, s13, v0
+; GFX6-NEXT:    v_mov_b32_e32 v5, s13
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_mul_lo_u32 v3, s2, v0
+; GFX6-NEXT:    v_mul_lo_u32 v3, s12, v0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s11, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s10, v3
 ; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
-; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s2, v3
+; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s12, v3
 ; GFX6-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v4
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v4
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v5
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v5
 ; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v4
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v4
 ; GFX6-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
 ; GFX6-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
 ; GFX6-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
@@ -11354,16 +11354,16 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX6-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[0:1]
 ; GFX6-NEXT:    v_mov_b32_e32 v6, s11
 ; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s13, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v3
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s12, v3
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s3, v2
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
 ; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX6-NEXT:    s_xor_b64 s[0:1], s[14:15], s[12:13]
+; GFX6-NEXT:    s_xor_b64 s[0:1], s[14:15], s[2:3]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX6-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    v_xor_b32_e32 v1, s1, v1
@@ -11513,16 +11513,16 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX90A-NEXT:    s_mov_b64 s[2:3], 0x1000
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
-; GFX90A-NEXT:    s_ashr_i32 s8, s3, 31
-; GFX90A-NEXT:    s_add_u32 s2, s2, s8
-; GFX90A-NEXT:    s_mov_b32 s9, s8
-; GFX90A-NEXT:    s_addc_u32 s3, s3, s8
-; GFX90A-NEXT:    s_xor_b64 s[2:3], s[2:3], s[8:9]
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s3
-; GFX90A-NEXT:    s_sub_u32 s10, 0, s2
-; GFX90A-NEXT:    s_subb_u32 s11, 0, s3
+; GFX90A-NEXT:    s_lshl_b64 s[4:5], s[2:3], s4
+; GFX90A-NEXT:    s_ashr_i32 s2, s5, 31
+; GFX90A-NEXT:    s_add_u32 s4, s4, s2
+; GFX90A-NEXT:    s_mov_b32 s3, s2
+; GFX90A-NEXT:    s_addc_u32 s5, s5, s2
+; GFX90A-NEXT:    s_xor_b64 s[8:9], s[4:5], s[2:3]
+; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s8
+; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s9
+; GFX90A-NEXT:    s_sub_u32 s10, 0, s8
+; GFX90A-NEXT:    s_subb_u32 s11, 0, s9
 ; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX90A-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GFX90A-NEXT:    v_rcp_f32_e32 v0, v0
@@ -11599,39 +11599,39 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX90A-NEXT:    v_mul_lo_u32 v1, s7, v1
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v6, v3, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s2, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v4, s2, v0
+; GFX90A-NEXT:    v_mul_lo_u32 v3, s8, v1
+; GFX90A-NEXT:    v_mul_hi_u32 v4, s8, v0
 ; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v4, s3, v0
+; GFX90A-NEXT:    v_mul_lo_u32 v4, s9, v0
 ; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v4
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s2, v0
+; GFX90A-NEXT:    v_mul_lo_u32 v5, s8, v0
 ; GFX90A-NEXT:    v_sub_u32_e32 v4, s7, v3
-; GFX90A-NEXT:    v_mov_b32_e32 v6, s3
+; GFX90A-NEXT:    v_mov_b32_e32 v6, s9
 ; GFX90A-NEXT:    v_sub_co_u32_e32 v5, vcc, s6, v5
 ; GFX90A-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v4, v6, vcc
-; GFX90A-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s2, v5
+; GFX90A-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s8, v5
 ; GFX90A-NEXT:    v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1]
-; GFX90A-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v4
+; GFX90A-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v4
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; GFX90A-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v6
+; GFX90A-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v6
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GFX90A-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v4
+; GFX90A-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v4
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v4, v7, v6, s[0:1]
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, s7
 ; GFX90A-NEXT:    v_subb_co_u32_e32 v3, vcc, v7, v3, vcc
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
+; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
 ; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s2, v5
+; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s8, v5
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v4, 1, 2, s[0:1]
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s3, v3
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v3
 ; GFX90A-NEXT:    v_add_co_u32_e64 v4, s[0:1], v0, v4
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v7, v5, vcc
 ; GFX90A-NEXT:    v_addc_co_u32_e64 v6, s[0:1], 0, v1, s[0:1]
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX90A-NEXT:    s_xor_b64 s[0:1], s[10:11], s[8:9]
+; GFX90A-NEXT:    s_xor_b64 s[0:1], s[10:11], s[2:3]
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX90A-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; GFX90A-NEXT:    v_xor_b32_e32 v1, s1, v1
@@ -12160,244 +12160,244 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_mul_f32_e32 v1, s20, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mac_f32_e32 v0, s21, v1
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_lo_u32 v2, s6, v1
-; GFX6-NEXT:    v_mul_hi_u32 v3, s6, v0
-; GFX6-NEXT:    v_mul_lo_u32 v4, s7, v0
-; GFX6-NEXT:    v_mul_lo_u32 v5, s6, v0
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; GFX6-NEXT:    v_mul_lo_u32 v3, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v4, v0, v5
-; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
-; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v1
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v0
+; GFX6-NEXT:    v_mul_lo_u32 v0, s6, v2
+; GFX6-NEXT:    v_mul_hi_u32 v1, s6, v3
+; GFX6-NEXT:    v_mul_lo_u32 v4, s7, v3
+; GFX6-NEXT:    v_mul_lo_u32 v5, s6, v3
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v0, v4
+; GFX6-NEXT:    v_mul_lo_u32 v0, v3, v1
+; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v5
+; GFX6-NEXT:    v_mul_hi_u32 v6, v3, v1
+; GFX6-NEXT:    v_mul_hi_u32 v7, v2, v1
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v6, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v5
-; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v5
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
-; GFX6-NEXT:    v_mov_b32_e32 v4, 0
-; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
-; GFX6-NEXT:    v_mov_b32_e32 v6, 0
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
-; GFX6-NEXT:    v_add_i32_e64 v0, s[2:3], v0, v2
-; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[2:3]
-; GFX6-NEXT:    v_mul_lo_u32 v5, s6, v2
-; GFX6-NEXT:    v_mul_hi_u32 v7, s6, v0
-; GFX6-NEXT:    v_mul_lo_u32 v8, s7, v0
+; GFX6-NEXT:    v_mul_lo_u32 v6, v2, v5
+; GFX6-NEXT:    v_mul_hi_u32 v5, v2, v5
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v4, v5, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v5, v2, v1
+; GFX6-NEXT:    v_mov_b32_e32 v0, 0
+; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, v7, v0, vcc
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v1, v6, vcc
+; GFX6-NEXT:    v_add_i32_e64 v3, s[2:3], v3, v4
+; GFX6-NEXT:    v_addc_u32_e64 v4, vcc, v2, v5, s[2:3]
+; GFX6-NEXT:    v_mul_lo_u32 v6, s6, v4
+; GFX6-NEXT:    v_mul_hi_u32 v7, s6, v3
+; GFX6-NEXT:    v_mul_lo_u32 v8, s7, v3
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GFX6-NEXT:    v_mul_lo_u32 v7, s6, v0
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; GFX6-NEXT:    v_mul_lo_u32 v10, v0, v5
-; GFX6-NEXT:    v_mul_hi_u32 v11, v0, v7
-; GFX6-NEXT:    v_mul_hi_u32 v12, v0, v5
-; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v7
-; GFX6-NEXT:    v_mul_lo_u32 v7, v2, v7
-; GFX6-NEXT:    v_mul_hi_u32 v8, v2, v5
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GFX6-NEXT:    v_mul_lo_u32 v7, s6, v3
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
+; GFX6-NEXT:    v_mul_lo_u32 v10, v3, v6
+; GFX6-NEXT:    v_mul_hi_u32 v11, v3, v7
+; GFX6-NEXT:    v_mul_hi_u32 v12, v3, v6
+; GFX6-NEXT:    v_mul_hi_u32 v9, v4, v7
+; GFX6-NEXT:    v_mul_lo_u32 v7, v4, v7
+; GFX6-NEXT:    v_mul_hi_u32 v8, v4, v6
 ; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; GFX6-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v5
+; GFX6-NEXT:    v_mul_lo_u32 v4, v4, v6
 ; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
 ; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
-; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[2:3]
+; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, v8, v0, vcc
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, v1, v6, vcc
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v2, v6, s[2:3]
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_ashr_i32 s2, s9, 31
 ; GFX6-NEXT:    s_add_u32 s0, s8, s2
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; GFX6-NEXT:    s_mov_b32 s3, s2
 ; GFX6-NEXT:    s_addc_u32 s1, s9, s2
-; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
 ; GFX6-NEXT:    s_xor_b64 s[8:9], s[0:1], s[2:3]
-; GFX6-NEXT:    v_mul_lo_u32 v2, s8, v1
-; GFX6-NEXT:    v_mul_hi_u32 v3, s8, v0
-; GFX6-NEXT:    v_mul_hi_u32 v5, s8, v1
-; GFX6-NEXT:    v_mul_hi_u32 v7, s9, v1
-; GFX6-NEXT:    v_mul_lo_u32 v1, s9, v1
+; GFX6-NEXT:    v_mul_lo_u32 v4, s8, v2
+; GFX6-NEXT:    v_mul_hi_u32 v5, s8, v3
+; GFX6-NEXT:    v_mul_hi_u32 v6, s8, v2
+; GFX6-NEXT:    v_mul_hi_u32 v7, s9, v2
+; GFX6-NEXT:    v_mul_lo_u32 v2, s9, v2
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v6, s9, v3
+; GFX6-NEXT:    v_mul_hi_u32 v3, s9, v3
+; GFX6-NEXT:    s_xor_b64 s[16:17], s[2:3], s[16:17]
+; GFX6-NEXT:    s_ashr_i32 s2, s13, 31
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v7, v0, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v5, s9, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, s9, v0
-; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[16:17]
-; GFX6-NEXT:    s_mov_b32 s6, -1
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
-; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, s14, v1
-; GFX6-NEXT:    v_mul_hi_u32 v3, s14, v0
-; GFX6-NEXT:    v_mul_lo_u32 v5, s15, v0
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v1, v4, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v4, s14, v3
+; GFX6-NEXT:    v_mul_hi_u32 v5, s14, v2
+; GFX6-NEXT:    v_mul_lo_u32 v6, s15, v2
 ; GFX6-NEXT:    v_mov_b32_e32 v7, s15
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_mul_lo_u32 v3, s14, v0
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s9, v2
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s8, v3
-; GFX6-NEXT:    v_subb_u32_e64 v5, s[0:1], v5, v7, vcc
-; GFX6-NEXT:    v_subrev_i32_e64 v7, s[0:1], s14, v3
-; GFX6-NEXT:    v_subbrev_u32_e64 v5, s[0:1], 0, v5, s[0:1]
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v5
+; GFX6-NEXT:    s_mov_b32 s3, s2
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GFX6-NEXT:    v_mul_lo_u32 v5, s14, v2
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
+; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s9, v4
+; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s8, v5
+; GFX6-NEXT:    v_subb_u32_e64 v6, s[0:1], v6, v7, vcc
+; GFX6-NEXT:    v_subrev_i32_e64 v7, s[0:1], s14, v5
+; GFX6-NEXT:    v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1]
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v6
 ; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
 ; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v7
 ; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s15, v5
-; GFX6-NEXT:    v_cndmask_b32_e64 v5, v8, v7, s[0:1]
-; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 2, v0
-; GFX6-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
-; GFX6-NEXT:    v_add_i32_e64 v9, s[0:1], 1, v0
-; GFX6-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v1, s[0:1]
-; GFX6-NEXT:    s_ashr_i32 s8, s13, 31
-; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
-; GFX6-NEXT:    s_add_u32 s12, s12, s8
-; GFX6-NEXT:    v_cndmask_b32_e64 v5, v10, v8, s[0:1]
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s15, v6
+; GFX6-NEXT:    v_cndmask_b32_e64 v6, v8, v7, s[0:1]
+; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 2, v2
+; GFX6-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1]
+; GFX6-NEXT:    v_add_i32_e64 v9, s[0:1], 1, v2
+; GFX6-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1]
+; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
+; GFX6-NEXT:    s_add_u32 s8, s12, s2
+; GFX6-NEXT:    v_cndmask_b32_e64 v6, v10, v8, s[0:1]
 ; GFX6-NEXT:    v_mov_b32_e32 v8, s9
-; GFX6-NEXT:    s_mov_b32 s9, s8
-; GFX6-NEXT:    s_addc_u32 s13, s13, s8
-; GFX6-NEXT:    s_xor_b64 s[12:13], s[12:13], s[8:9]
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v10, s12
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v11, s13
-; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v8, v2, vcc
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s15, v2
+; GFX6-NEXT:    s_addc_u32 s9, s13, s2
+; GFX6-NEXT:    s_xor_b64 s[8:9], s[8:9], s[2:3]
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v10, s8
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v11, s9
+; GFX6-NEXT:    v_subb_u32_e32 v4, vcc, v8, v4, vcc
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s15, v4
 ; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s14, v3
-; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s15, v2
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s14, v5
+; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s15, v4
 ; GFX6-NEXT:    v_mac_f32_e32 v10, s18, v11
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v8, v3, vcc
-; GFX6-NEXT:    v_rcp_f32_e32 v3, v10
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v9, v7, s[0:1]
-; GFX6-NEXT:    v_mul_f32_e32 v3, s19, v3
-; GFX6-NEXT:    v_mul_f32_e32 v5, s20, v3
-; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX6-NEXT:    v_mac_f32_e32 v3, s21, v5
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, v8, v5, vcc
+; GFX6-NEXT:    v_rcp_f32_e32 v5, v10
+; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, v9, v7, s[0:1]
+; GFX6-NEXT:    v_mul_f32_e32 v5, s19, v5
+; GFX6-NEXT:    v_mul_f32_e32 v6, s20, v5
+; GFX6-NEXT:    v_trunc_f32_e32 v6, v6
+; GFX6-NEXT:    v_mac_f32_e32 v5, s21, v6
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GFX6-NEXT:    s_sub_u32 s14, 0, s12
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX6-NEXT:    v_mul_hi_u32 v2, s14, v3
-; GFX6-NEXT:    v_mul_lo_u32 v7, s14, v5
-; GFX6-NEXT:    s_subb_u32 s15, 0, s13
-; GFX6-NEXT:    v_mul_lo_u32 v8, s15, v3
-; GFX6-NEXT:    v_xor_b32_e32 v0, s2, v0
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
-; GFX6-NEXT:    v_mul_lo_u32 v7, s14, v3
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
-; GFX6-NEXT:    v_mul_lo_u32 v8, v3, v2
-; GFX6-NEXT:    v_mul_hi_u32 v9, v3, v7
-; GFX6-NEXT:    v_mul_hi_u32 v10, v3, v2
-; GFX6-NEXT:    v_mul_hi_u32 v11, v5, v2
-; GFX6-NEXT:    v_mul_lo_u32 v2, v5, v2
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GFX6-NEXT:    s_sub_u32 s12, 0, s8
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX6-NEXT:    v_mul_hi_u32 v4, s12, v5
+; GFX6-NEXT:    v_mul_lo_u32 v7, s12, v6
+; GFX6-NEXT:    s_subb_u32 s13, 0, s9
+; GFX6-NEXT:    v_mul_lo_u32 v8, s13, v5
+; GFX6-NEXT:    v_xor_b32_e32 v2, s16, v2
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
+; GFX6-NEXT:    v_mul_lo_u32 v7, s12, v5
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; GFX6-NEXT:    v_mul_lo_u32 v8, v5, v4
+; GFX6-NEXT:    v_mul_hi_u32 v9, v5, v7
+; GFX6-NEXT:    v_mul_hi_u32 v10, v5, v4
+; GFX6-NEXT:    v_mul_hi_u32 v11, v6, v4
+; GFX6-NEXT:    v_mul_lo_u32 v4, v6, v4
 ; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v10, v5, v7
-; GFX6-NEXT:    v_mul_hi_u32 v7, v5, v7
-; GFX6-NEXT:    v_xor_b32_e32 v1, s3, v1
+; GFX6-NEXT:    v_mul_lo_u32 v10, v6, v7
+; GFX6-NEXT:    v_mul_hi_u32 v7, v6, v7
+; GFX6-NEXT:    v_xor_b32_e32 v3, s17, v3
+; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v9, v7, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v11, v4, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
-; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v6, v8, vcc
-; GFX6-NEXT:    v_add_i32_e64 v2, s[0:1], v3, v2
-; GFX6-NEXT:    v_addc_u32_e64 v3, vcc, v5, v7, s[0:1]
-; GFX6-NEXT:    v_mul_lo_u32 v8, s14, v3
-; GFX6-NEXT:    v_mul_hi_u32 v9, s14, v2
-; GFX6-NEXT:    v_mul_lo_u32 v10, s15, v2
+; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v11, v0, vcc
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v1, v8, vcc
+; GFX6-NEXT:    v_add_i32_e64 v4, s[0:1], v5, v4
+; GFX6-NEXT:    v_addc_u32_e64 v5, vcc, v6, v7, s[0:1]
+; GFX6-NEXT:    v_mul_lo_u32 v8, s12, v5
+; GFX6-NEXT:    v_mul_hi_u32 v9, s12, v4
+; GFX6-NEXT:    v_mul_lo_u32 v10, s13, v4
 ; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GFX6-NEXT:    v_mul_lo_u32 v9, s14, v2
+; GFX6-NEXT:    v_mul_lo_u32 v9, s12, v4
 ; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
-; GFX6-NEXT:    v_mul_lo_u32 v12, v2, v8
-; GFX6-NEXT:    v_mul_hi_u32 v13, v2, v9
-; GFX6-NEXT:    v_mul_hi_u32 v14, v2, v8
-; GFX6-NEXT:    v_mul_hi_u32 v11, v3, v9
-; GFX6-NEXT:    v_mul_lo_u32 v9, v3, v9
-; GFX6-NEXT:    v_mul_hi_u32 v10, v3, v8
+; GFX6-NEXT:    v_mul_lo_u32 v12, v4, v8
+; GFX6-NEXT:    v_mul_hi_u32 v13, v4, v9
+; GFX6-NEXT:    v_mul_hi_u32 v14, v4, v8
+; GFX6-NEXT:    v_mul_hi_u32 v11, v5, v9
+; GFX6-NEXT:    v_mul_lo_u32 v9, v5, v9
+; GFX6-NEXT:    v_mul_hi_u32 v10, v5, v8
 ; GFX6-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
 ; GFX6-NEXT:    v_addc_u32_e32 v13, vcc, 0, v14, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v3, v3, v8
+; GFX6-NEXT:    v_mul_lo_u32 v5, v5, v8
 ; GFX6-NEXT:    v_add_i32_e32 v9, vcc, v12, v9
 ; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v13, v11, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v10, v4, vcc
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v9, v3
-; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v6, v8, vcc
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; GFX6-NEXT:    s_ashr_i32 s14, s11, 31
-; GFX6-NEXT:    v_addc_u32_e64 v5, vcc, v5, v8, s[0:1]
-; GFX6-NEXT:    s_add_u32 s0, s10, s14
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
-; GFX6-NEXT:    s_mov_b32 s15, s14
-; GFX6-NEXT:    s_addc_u32 s1, s11, s14
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
-; GFX6-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
-; GFX6-NEXT:    v_mul_lo_u32 v5, s10, v3
-; GFX6-NEXT:    v_mul_hi_u32 v7, s10, v2
-; GFX6-NEXT:    v_mul_hi_u32 v9, s10, v3
-; GFX6-NEXT:    v_mul_hi_u32 v10, s11, v3
-; GFX6-NEXT:    v_mul_lo_u32 v3, s11, v3
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
+; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v10, v0, vcc
+; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
+; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v1, v8, vcc
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; GFX6-NEXT:    s_ashr_i32 s12, s11, 31
+; GFX6-NEXT:    v_addc_u32_e64 v6, vcc, v6, v8, s[0:1]
+; GFX6-NEXT:    s_add_u32 s0, s10, s12
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; GFX6-NEXT:    s_mov_b32 s13, s12
+; GFX6-NEXT:    s_addc_u32 s1, s11, s12
+; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
+; GFX6-NEXT:    s_xor_b64 s[10:11], s[0:1], s[12:13]
+; GFX6-NEXT:    v_mul_lo_u32 v6, s10, v5
+; GFX6-NEXT:    v_mul_hi_u32 v7, s10, v4
+; GFX6-NEXT:    v_mul_hi_u32 v9, s10, v5
+; GFX6-NEXT:    v_mul_hi_u32 v10, s11, v5
+; GFX6-NEXT:    v_mul_lo_u32 v5, s11, v5
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v9, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v9, s11, v2
-; GFX6-NEXT:    v_mul_hi_u32 v2, s11, v2
-; GFX6-NEXT:    v_mov_b32_e32 v8, s3
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
-; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v7, v2, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v10, v4, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v4, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v4, s12, v3
-; GFX6-NEXT:    v_mul_hi_u32 v5, s12, v2
-; GFX6-NEXT:    v_mul_lo_u32 v6, s13, v2
-; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
-; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v8, vcc
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GFX6-NEXT:    v_mul_lo_u32 v5, s12, v2
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s11, v4
-; GFX6-NEXT:    v_mov_b32_e32 v7, s13
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s10, v5
+; GFX6-NEXT:    v_mul_lo_u32 v9, s11, v4
+; GFX6-NEXT:    v_mul_hi_u32 v4, s11, v4
+; GFX6-NEXT:    v_mov_b32_e32 v8, s17
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v7, v4, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v10, v0, vcc
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v1, v0, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v6, s8, v5
+; GFX6-NEXT:    v_mul_hi_u32 v7, s8, v4
+; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s16, v2
+; GFX6-NEXT:    v_mul_lo_u32 v2, s9, v4
+; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v3, v8, vcc
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v7, v6
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_mul_lo_u32 v3, s8, v4
+; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s11, v2
+; GFX6-NEXT:    v_mov_b32_e32 v7, s9
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s10, v3
 ; GFX6-NEXT:    v_subb_u32_e64 v6, s[0:1], v6, v7, vcc
-; GFX6-NEXT:    v_subrev_i32_e64 v7, s[0:1], s12, v5
+; GFX6-NEXT:    v_subrev_i32_e64 v7, s[0:1], s8, v3
 ; GFX6-NEXT:    v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1]
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v6
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v6
 ; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v7
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v7
 ; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v6
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v6
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, v8, v7, s[0:1]
-; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 2, v2
-; GFX6-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1]
-; GFX6-NEXT:    v_add_i32_e64 v9, s[0:1], 1, v2
-; GFX6-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1]
+; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 2, v4
+; GFX6-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v5, s[0:1]
+; GFX6-NEXT:    v_add_i32_e64 v9, s[0:1], 1, v4
+; GFX6-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v5, s[0:1]
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, v10, v8, s[0:1]
 ; GFX6-NEXT:    v_mov_b32_e32 v8, s11
-; GFX6-NEXT:    v_subb_u32_e32 v4, vcc, v8, v4, vcc
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s13, v4
+; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v8, v2, vcc
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s12, v5
-; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v4
-; GFX6-NEXT:    v_cndmask_b32_e32 v4, v8, v5, vcc
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX6-NEXT:    v_cndmask_b32_e64 v4, v9, v7, s[0:1]
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX6-NEXT:    s_xor_b64 s[0:1], s[14:15], s[8:9]
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
-; GFX6-NEXT:    v_xor_b32_e32 v2, s0, v2
-; GFX6-NEXT:    v_xor_b32_e32 v3, s1, v3
-; GFX6-NEXT:    v_mov_b32_e32 v4, s1
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s0, v2
-; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
+; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v2
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v8, v3, vcc
+; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX6-NEXT:    v_cndmask_b32_e64 v3, v9, v7, s[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX6-NEXT:    s_xor_b64 s[0:1], s[12:13], s[2:3]
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX6-NEXT:    v_xor_b32_e32 v3, s0, v3
+; GFX6-NEXT:    v_xor_b32_e32 v4, s1, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, s1
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s0, v3
+; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v4, v5, vcc
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -12423,214 +12423,214 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    s_subb_u32 s4, 0, s11
 ; GFX9-NEXT:    v_mac_f32_e32 v0, s16, v1
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, s17, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, s18, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mac_f32_e32 v0, s19, v1
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_mul_lo_u32 v2, s14, v1
-; GFX9-NEXT:    v_mul_hi_u32 v3, s14, v0
-; GFX9-NEXT:    v_mul_lo_u32 v5, s4, v0
-; GFX9-NEXT:    v_mul_lo_u32 v4, s14, v0
-; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
-; GFX9-NEXT:    v_mul_lo_u32 v5, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v4
-; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v6, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
-; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
-; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
-; GFX9-NEXT:    v_mul_lo_u32 v4, s14, v2
-; GFX9-NEXT:    v_mul_hi_u32 v7, s14, v0
-; GFX9-NEXT:    v_mul_lo_u32 v8, s4, v0
-; GFX9-NEXT:    v_mul_lo_u32 v9, s14, v0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v1
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v0
+; GFX9-NEXT:    v_mul_lo_u32 v0, s14, v2
+; GFX9-NEXT:    v_mul_hi_u32 v1, s14, v3
+; GFX9-NEXT:    v_mul_lo_u32 v5, s4, v3
+; GFX9-NEXT:    v_mul_lo_u32 v4, s14, v3
+; GFX9-NEXT:    v_add_u32_e32 v0, v1, v0
+; GFX9-NEXT:    v_add_u32_e32 v5, v0, v5
+; GFX9-NEXT:    v_mul_hi_u32 v1, v3, v4
+; GFX9-NEXT:    v_mul_lo_u32 v6, v3, v5
+; GFX9-NEXT:    v_mul_hi_u32 v7, v3, v5
+; GFX9-NEXT:    v_mul_hi_u32 v8, v2, v5
+; GFX9-NEXT:    v_mul_lo_u32 v5, v2, v5
+; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v7, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v7, v2, v4
+; GFX9-NEXT:    v_mul_hi_u32 v4, v2, v4
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v7
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v4, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v8, v0, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v1, v6, vcc
+; GFX9-NEXT:    v_add_co_u32_e64 v3, s[2:3], v3, v4
+; GFX9-NEXT:    v_addc_co_u32_e64 v4, vcc, v2, v5, s[2:3]
+; GFX9-NEXT:    v_mul_lo_u32 v6, s14, v4
+; GFX9-NEXT:    v_mul_hi_u32 v7, s14, v3
+; GFX9-NEXT:    v_mul_lo_u32 v8, s4, v3
+; GFX9-NEXT:    v_mul_lo_u32 v9, s14, v3
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX9-NEXT:    v_add_u32_e32 v4, v7, v4
-; GFX9-NEXT:    v_add_u32_e32 v4, v4, v8
-; GFX9-NEXT:    v_mul_lo_u32 v10, v0, v4
-; GFX9-NEXT:    v_mul_hi_u32 v11, v0, v9
-; GFX9-NEXT:    v_mul_hi_u32 v12, v0, v4
-; GFX9-NEXT:    v_mul_hi_u32 v8, v2, v9
-; GFX9-NEXT:    v_mul_lo_u32 v9, v2, v9
-; GFX9-NEXT:    v_mul_hi_u32 v7, v2, v4
+; GFX9-NEXT:    v_add_u32_e32 v6, v7, v6
+; GFX9-NEXT:    v_add_u32_e32 v6, v6, v8
+; GFX9-NEXT:    v_mul_lo_u32 v10, v3, v6
+; GFX9-NEXT:    v_mul_hi_u32 v11, v3, v9
+; GFX9-NEXT:    v_mul_hi_u32 v12, v3, v6
+; GFX9-NEXT:    v_mul_hi_u32 v8, v4, v9
+; GFX9-NEXT:    v_mul_lo_u32 v9, v4, v9
+; GFX9-NEXT:    v_mul_hi_u32 v7, v4, v6
 ; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
 ; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v12, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v4
+; GFX9-NEXT:    v_mul_lo_u32 v4, v4, v6
 ; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v10, v9
 ; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v8, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v6, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v5, v4, vcc
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v7, v0, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v8, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v1, v6, vcc
+; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_ashr_i32 s14, s5, 31
-; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3]
+; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v2, v6, s[2:3]
 ; GFX9-NEXT:    s_add_u32 s2, s4, s14
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
 ; GFX9-NEXT:    s_mov_b32 s15, s14
 ; GFX9-NEXT:    s_addc_u32 s3, s5, s14
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], s[2:3], s[14:15]
-; GFX9-NEXT:    v_mul_lo_u32 v2, s4, v1
-; GFX9-NEXT:    v_mul_hi_u32 v3, s4, v0
-; GFX9-NEXT:    v_mul_hi_u32 v4, s4, v1
-; GFX9-NEXT:    v_mul_hi_u32 v7, s5, v1
-; GFX9-NEXT:    v_mul_lo_u32 v1, s5, v1
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, s5, v0
-; GFX9-NEXT:    v_mul_hi_u32 v0, s5, v0
+; GFX9-NEXT:    v_mul_lo_u32 v4, s4, v2
+; GFX9-NEXT:    v_mul_hi_u32 v5, s4, v3
+; GFX9-NEXT:    v_mul_hi_u32 v6, s4, v2
+; GFX9-NEXT:    v_mul_hi_u32 v7, s5, v2
+; GFX9-NEXT:    v_mul_lo_u32 v2, s5, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v6, s5, v3
+; GFX9-NEXT:    v_mul_hi_u32 v3, s5, v3
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v7, v6, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v5, v2, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, s10, v1
-; GFX9-NEXT:    v_mul_hi_u32 v3, s10, v0
-; GFX9-NEXT:    v_mul_lo_u32 v4, s11, v0
+; GFX9-NEXT:    s_xor_b64 s[12:13], s[14:15], s[12:13]
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v3, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v0, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v4, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v4, s10, v3
+; GFX9-NEXT:    v_mul_hi_u32 v5, s10, v2
+; GFX9-NEXT:    v_mul_lo_u32 v6, s11, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v7, s11
-; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_mul_lo_u32 v3, s10, v0
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
-; GFX9-NEXT:    v_sub_u32_e32 v4, s5, v2
-; GFX9-NEXT:    v_sub_co_u32_e32 v3, vcc, s4, v3
-; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v4, v7, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e64 v7, s[0:1], s10, v3
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v4
+; GFX9-NEXT:    v_add_u32_e32 v4, v5, v4
+; GFX9-NEXT:    v_mul_lo_u32 v5, s10, v2
+; GFX9-NEXT:    v_add_u32_e32 v4, v4, v6
+; GFX9-NEXT:    v_sub_u32_e32 v6, s5, v4
+; GFX9-NEXT:    v_sub_co_u32_e32 v5, vcc, s4, v5
+; GFX9-NEXT:    v_subb_co_u32_e64 v6, s[0:1], v6, v7, vcc
+; GFX9-NEXT:    v_subrev_co_u32_e64 v7, s[0:1], s10, v5
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v6, s[0:1], 0, v6, s[0:1]
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v6
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
 ; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, v8, v7, s[0:1]
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 1, 2, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v4, s[0:1], v0, v4
-; GFX9-NEXT:    v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v8, v7, s[0:1]
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 1, 2, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v6, s[0:1], v2, v6
+; GFX9-NEXT:    v_addc_co_u32_e64 v7, s[0:1], 0, v3, s[0:1]
+; GFX9-NEXT:    s_ashr_i32 s4, s9, 31
+; GFX9-NEXT:    s_add_u32 s0, s8, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v8, s5
-; GFX9-NEXT:    s_xor_b64 s[4:5], s[14:15], s[12:13]
-; GFX9-NEXT:    s_ashr_i32 s12, s9, 31
-; GFX9-NEXT:    s_add_u32 s0, s8, s12
-; GFX9-NEXT:    s_mov_b32 s13, s12
-; GFX9-NEXT:    s_addc_u32 s1, s9, s12
-; GFX9-NEXT:    s_xor_b64 s[8:9], s[0:1], s[12:13]
-; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v8, v2, vcc
+; GFX9-NEXT:    s_mov_b32 s5, s4
+; GFX9-NEXT:    s_addc_u32 s1, s9, s4
+; GFX9-NEXT:    s_xor_b64 s[8:9], s[0:1], s[4:5]
+; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, v8, v4, vcc
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v8, s8
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v9, s9
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v5
 ; GFX9-NEXT:    v_mac_f32_e32 v8, s16, v9
 ; GFX9-NEXT:    v_rcp_f32_e32 v8, v8
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v3, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX9-NEXT:    v_mul_f32_e32 v2, s17, v8
-; GFX9-NEXT:    v_mul_f32_e32 v3, s18, v2
-; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX9-NEXT:    v_mac_f32_e32 v2, s19, v3
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v10, v5, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_mul_f32_e32 v4, s17, v8
+; GFX9-NEXT:    v_mul_f32_e32 v5, s18, v4
+; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX9-NEXT:    v_mac_f32_e32 v4, s19, v5
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GFX9-NEXT:    s_sub_u32 s10, 0, s8
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
 ; GFX9-NEXT:    s_subb_u32 s11, 0, s9
-; GFX9-NEXT:    v_mul_hi_u32 v4, s10, v2
-; GFX9-NEXT:    v_mul_lo_u32 v8, s10, v3
-; GFX9-NEXT:    v_mul_lo_u32 v9, s11, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v7, s10, v2
-; GFX9-NEXT:    v_add_u32_e32 v4, v4, v8
-; GFX9-NEXT:    v_add_u32_e32 v4, v4, v9
-; GFX9-NEXT:    v_mul_lo_u32 v8, v2, v4
-; GFX9-NEXT:    v_mul_hi_u32 v9, v2, v7
-; GFX9-NEXT:    v_mul_hi_u32 v10, v2, v4
-; GFX9-NEXT:    v_mul_hi_u32 v11, v3, v4
-; GFX9-NEXT:    v_mul_lo_u32 v4, v3, v4
+; GFX9-NEXT:    v_mul_hi_u32 v6, s10, v4
+; GFX9-NEXT:    v_mul_lo_u32 v8, s10, v5
+; GFX9-NEXT:    v_mul_lo_u32 v9, s11, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v7, s10, v4
+; GFX9-NEXT:    v_add_u32_e32 v6, v6, v8
+; GFX9-NEXT:    v_add_u32_e32 v6, v6, v9
+; GFX9-NEXT:    v_mul_lo_u32 v8, v4, v6
+; GFX9-NEXT:    v_mul_hi_u32 v9, v4, v7
+; GFX9-NEXT:    v_mul_hi_u32 v10, v4, v6
+; GFX9-NEXT:    v_mul_hi_u32 v11, v5, v6
+; GFX9-NEXT:    v_mul_lo_u32 v6, v5, v6
 ; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v9, v8
 ; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v10, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v10, v3, v7
-; GFX9-NEXT:    v_mul_hi_u32 v7, v3, v7
-; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
-; GFX9-NEXT:    v_xor_b32_e32 v1, s5, v1
+; GFX9-NEXT:    v_mul_lo_u32 v10, v5, v7
+; GFX9-NEXT:    v_mul_hi_u32 v7, v5, v7
+; GFX9-NEXT:    v_xor_b32_e32 v2, s12, v2
+; GFX9-NEXT:    v_xor_b32_e32 v3, s13, v3
 ; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v10
 ; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v9, v7, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v6, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v7, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v5, v8, vcc
-; GFX9-NEXT:    v_add_co_u32_e64 v2, s[0:1], v2, v4
-; GFX9-NEXT:    v_addc_co_u32_e64 v4, vcc, v3, v7, s[0:1]
-; GFX9-NEXT:    v_mul_lo_u32 v8, s10, v4
-; GFX9-NEXT:    v_mul_hi_u32 v9, s10, v2
-; GFX9-NEXT:    v_mul_lo_u32 v10, s11, v2
-; GFX9-NEXT:    v_mul_lo_u32 v11, s10, v2
-; GFX9-NEXT:    v_add_u32_e32 v3, v3, v7
+; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v0, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v1, v8, vcc
+; GFX9-NEXT:    v_add_co_u32_e64 v4, s[0:1], v4, v6
+; GFX9-NEXT:    v_addc_co_u32_e64 v6, vcc, v5, v7, s[0:1]
+; GFX9-NEXT:    v_mul_lo_u32 v8, s10, v6
+; GFX9-NEXT:    v_mul_hi_u32 v9, s10, v4
+; GFX9-NEXT:    v_mul_lo_u32 v10, s11, v4
+; GFX9-NEXT:    v_mul_lo_u32 v11, s10, v4
+; GFX9-NEXT:    v_add_u32_e32 v5, v5, v7
 ; GFX9-NEXT:    v_add_u32_e32 v8, v9, v8
 ; GFX9-NEXT:    v_add_u32_e32 v8, v8, v10
-; GFX9-NEXT:    v_mul_lo_u32 v12, v2, v8
-; GFX9-NEXT:    v_mul_hi_u32 v13, v2, v11
-; GFX9-NEXT:    v_mul_hi_u32 v14, v2, v8
-; GFX9-NEXT:    v_mul_hi_u32 v10, v4, v11
-; GFX9-NEXT:    v_mul_lo_u32 v11, v4, v11
-; GFX9-NEXT:    v_mul_hi_u32 v9, v4, v8
+; GFX9-NEXT:    v_mul_lo_u32 v12, v4, v8
+; GFX9-NEXT:    v_mul_hi_u32 v13, v4, v11
+; GFX9-NEXT:    v_mul_hi_u32 v14, v4, v8
+; GFX9-NEXT:    v_mul_hi_u32 v10, v6, v11
+; GFX9-NEXT:    v_mul_lo_u32 v11, v6, v11
+; GFX9-NEXT:    v_mul_hi_u32 v9, v6, v8
 ; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v13, v12
 ; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v14, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, v4, v8
+; GFX9-NEXT:    v_mul_lo_u32 v6, v6, v8
 ; GFX9-NEXT:    v_add_co_u32_e32 v11, vcc, v12, v11
 ; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, v13, v10, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v9, v6, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v10, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v5, v8, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v9, v0, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v10, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v1, v8, vcc
 ; GFX9-NEXT:    s_ashr_i32 s10, s7, 31
-; GFX9-NEXT:    v_addc_co_u32_e64 v3, vcc, v3, v8, s[0:1]
+; GFX9-NEXT:    v_addc_co_u32_e64 v5, vcc, v5, v8, s[0:1]
 ; GFX9-NEXT:    s_add_u32 s0, s6, s10
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
 ; GFX9-NEXT:    s_mov_b32 s11, s10
 ; GFX9-NEXT:    s_addc_u32 s1, s7, s10
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
 ; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
-; GFX9-NEXT:    v_mul_lo_u32 v4, s6, v3
-; GFX9-NEXT:    v_mul_hi_u32 v7, s6, v2
-; GFX9-NEXT:    v_mul_hi_u32 v9, s6, v3
-; GFX9-NEXT:    v_mul_hi_u32 v10, s7, v3
-; GFX9-NEXT:    v_mul_lo_u32 v3, s7, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v7, v4
+; GFX9-NEXT:    v_mul_lo_u32 v6, s6, v5
+; GFX9-NEXT:    v_mul_hi_u32 v7, s6, v4
+; GFX9-NEXT:    v_mul_hi_u32 v9, s6, v5
+; GFX9-NEXT:    v_mul_hi_u32 v10, s7, v5
+; GFX9-NEXT:    v_mul_lo_u32 v5, s7, v5
+; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
 ; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v9, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v9, s7, v2
-; GFX9-NEXT:    v_mul_hi_u32 v2, s7, v2
-; GFX9-NEXT:    v_mov_b32_e32 v8, s5
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v9
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v7, v2, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v10, v6, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, s8, v3
-; GFX9-NEXT:    v_mul_hi_u32 v5, s8, v2
-; GFX9-NEXT:    v_mul_lo_u32 v7, s9, v2
-; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s4, v0
-; GFX9-NEXT:    v_add_u32_e32 v4, v5, v4
-; GFX9-NEXT:    v_mul_lo_u32 v5, s8, v2
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v8, vcc
-; GFX9-NEXT:    v_add_u32_e32 v4, v4, v7
-; GFX9-NEXT:    v_sub_u32_e32 v7, s7, v4
+; GFX9-NEXT:    v_mul_lo_u32 v9, s7, v4
+; GFX9-NEXT:    v_mul_hi_u32 v4, s7, v4
+; GFX9-NEXT:    v_mov_b32_e32 v8, s13
+; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v9
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v4, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v10, v0, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v1, v6, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v6, s8, v5
+; GFX9-NEXT:    v_mul_hi_u32 v7, s8, v4
+; GFX9-NEXT:    v_mul_lo_u32 v9, s9, v4
+; GFX9-NEXT:    v_subrev_co_u32_e32 v1, vcc, s12, v2
+; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v3, v8, vcc
+; GFX9-NEXT:    v_add_u32_e32 v3, v7, v6
+; GFX9-NEXT:    v_mul_lo_u32 v6, s8, v4
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v9
+; GFX9-NEXT:    v_sub_u32_e32 v7, s7, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v8, s9
-; GFX9-NEXT:    v_sub_co_u32_e32 v5, vcc, s6, v5
+; GFX9-NEXT:    v_sub_co_u32_e32 v6, vcc, s6, v6
 ; GFX9-NEXT:    v_subb_co_u32_e64 v7, s[0:1], v7, v8, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e64 v8, s[0:1], s8, v5
+; GFX9-NEXT:    v_subrev_co_u32_e64 v8, s[0:1], s8, v6
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v7, s[0:1], 0, v7, s[0:1]
 ; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[0:1]
@@ -12639,28 +12639,28 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v9, v8, s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v9, s7
-; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, v9, v4, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v4
+; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v9, v3, vcc
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v5
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v6
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 1, 2, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v4
-; GFX9-NEXT:    v_add_co_u32_e64 v7, s[0:1], v2, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v9, v5, vcc
-; GFX9-NEXT:    v_addc_co_u32_e64 v8, s[0:1], 0, v3, s[0:1]
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc
-; GFX9-NEXT:    s_xor_b64 s[0:1], s[10:11], s[12:13]
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v2, s0, v2
-; GFX9-NEXT:    v_xor_b32_e32 v3, s1, v3
-; GFX9-NEXT:    v_mov_b32_e32 v4, s1
-; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s0, v2
-; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v3
+; GFX9-NEXT:    v_add_co_u32_e64 v7, s[0:1], v4, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v9, v6, vcc
+; GFX9-NEXT:    v_addc_co_u32_e64 v8, s[0:1], 0, v5, s[0:1]
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v7, vcc
+; GFX9-NEXT:    s_xor_b64 s[0:1], s[10:11], s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v8, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v3, s0, v3
+; GFX9-NEXT:    v_xor_b32_e32 v4, s1, v4
+; GFX9-NEXT:    v_mov_b32_e32 v5, s1
+; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s0, v3
+; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, v4, v5, vcc
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dwordx4 v6, v[0:3], s[2:3]
+; GFX9-NEXT:    global_store_dwordx4 v0, v[1:4], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX90A-LABEL: sdiv_v2i64_pow2_shl_denom:
@@ -12711,183 +12711,183 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v2
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v8, vcc
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
-; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v5, vcc
-; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v2
-; GFX90A-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[0:1]
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s14, v2
+; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v2
+; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v2, v5, vcc
+; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v3
+; GFX90A-NEXT:    v_addc_co_u32_e64 v3, vcc, v1, v5, s[0:1]
+; GFX90A-NEXT:    v_mul_lo_u32 v6, s14, v3
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, s14, v0
-; GFX90A-NEXT:    v_add_u32_e32 v5, v7, v5
+; GFX90A-NEXT:    v_add_u32_e32 v6, v7, v6
 ; GFX90A-NEXT:    v_mul_lo_u32 v7, s15, v0
-; GFX90A-NEXT:    v_add_u32_e32 v5, v5, v7
+; GFX90A-NEXT:    v_add_u32_e32 v6, v6, v7
 ; GFX90A-NEXT:    v_mul_lo_u32 v8, s14, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v9, v2, v8
-; GFX90A-NEXT:    v_mul_lo_u32 v10, v2, v8
-; GFX90A-NEXT:    v_mul_lo_u32 v12, v0, v5
+; GFX90A-NEXT:    v_mul_hi_u32 v9, v3, v8
+; GFX90A-NEXT:    v_mul_lo_u32 v10, v3, v8
+; GFX90A-NEXT:    v_mul_lo_u32 v12, v0, v6
 ; GFX90A-NEXT:    v_mul_hi_u32 v8, v0, v8
-; GFX90A-NEXT:    v_mul_hi_u32 v11, v0, v5
+; GFX90A-NEXT:    v_mul_hi_u32 v11, v0, v6
 ; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v12
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v11, vcc
 ; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v10
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v2, v5
+; GFX90A-NEXT:    v_mul_hi_u32 v7, v3, v6
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v2, v2, v5
-; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v6, v7, vcc
-; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v3
+; GFX90A-NEXT:    v_mul_lo_u32 v3, v3, v6
+; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v8, v3
+; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, v2, v7, vcc
+; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v5
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_ashr_i32 s14, s5, 31
-; GFX90A-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1]
+; GFX90A-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v6, s[0:1]
 ; GFX90A-NEXT:    s_add_u32 s0, s4, s14
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
 ; GFX90A-NEXT:    s_mov_b32 s15, s14
 ; GFX90A-NEXT:    s_addc_u32 s1, s5, s14
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_xor_b64 s[4:5], s[0:1], s[14:15]
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s4, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s4, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v2, s4, v1
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v5, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v5, s4, v1
+; GFX90A-NEXT:    v_mul_hi_u32 v6, s4, v0
+; GFX90A-NEXT:    v_mul_hi_u32 v3, s4, v1
+; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v5
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, s5, v0
 ; GFX90A-NEXT:    v_mul_lo_u32 v0, s5, v0
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v3, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s5, v1
-; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, v2, v7, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, v5, v4, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v5, v0
+; GFX90A-NEXT:    v_mul_hi_u32 v6, s5, v1
+; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v7, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v4, vcc
 ; GFX90A-NEXT:    v_mul_lo_u32 v1, s5, v1
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v6, v2, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v2, s12, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v3, s12, v0
-; GFX90A-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s13, v0
-; GFX90A-NEXT:    v_add_u32_e32 v2, v2, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s12, v0
-; GFX90A-NEXT:    v_sub_u32_e32 v3, s5, v2
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v3, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v3, s12, v1
+; GFX90A-NEXT:    v_mul_hi_u32 v5, s12, v0
+; GFX90A-NEXT:    v_add_u32_e32 v3, v5, v3
+; GFX90A-NEXT:    v_mul_lo_u32 v5, s13, v0
+; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v5
+; GFX90A-NEXT:    v_mul_lo_u32 v6, s12, v0
+; GFX90A-NEXT:    v_sub_u32_e32 v5, s5, v3
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, s13
-; GFX90A-NEXT:    v_sub_co_u32_e32 v5, vcc, s4, v5
-; GFX90A-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v7, vcc
-; GFX90A-NEXT:    v_subrev_co_u32_e64 v7, s[0:1], s12, v5
-; GFX90A-NEXT:    v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1]
-; GFX90A-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v3
+; GFX90A-NEXT:    v_sub_co_u32_e32 v6, vcc, s4, v6
+; GFX90A-NEXT:    v_subb_co_u32_e64 v5, s[0:1], v5, v7, vcc
+; GFX90A-NEXT:    v_subrev_co_u32_e64 v7, s[0:1], s12, v6
+; GFX90A-NEXT:    v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1]
+; GFX90A-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v5
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
 ; GFX90A-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v7
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; GFX90A-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v3
-; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v8, v7, s[0:1]
-; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v3
-; GFX90A-NEXT:    v_cndmask_b32_e64 v3, 1, 2, s[0:1]
+; GFX90A-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v5
+; GFX90A-NEXT:    v_cndmask_b32_e64 v5, v8, v7, s[0:1]
+; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
+; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 1, 2, s[0:1]
 ; GFX90A-NEXT:    v_mov_b32_e32 v8, s5
-; GFX90A-NEXT:    v_add_co_u32_e64 v3, s[0:1], v0, v3
-; GFX90A-NEXT:    v_subb_co_u32_e32 v2, vcc, v8, v2, vcc
+; GFX90A-NEXT:    v_add_co_u32_e64 v5, s[0:1], v0, v5
+; GFX90A-NEXT:    v_subb_co_u32_e32 v3, vcc, v8, v3, vcc
 ; GFX90A-NEXT:    v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1]
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s13, v2
+; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s13, v3
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s12, v5
+; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s12, v6
 ; GFX90A-NEXT:    s_xor_b64 s[0:1], s[14:15], s[10:11]
 ; GFX90A-NEXT:    s_ashr_i32 s4, s9, 31
-; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v2
+; GFX90A-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v3
 ; GFX90A-NEXT:    s_add_u32 s8, s8, s4
-; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v8, v5, vcc
+; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v8, v6, vcc
 ; GFX90A-NEXT:    s_mov_b32 s5, s4
 ; GFX90A-NEXT:    s_addc_u32 s9, s9, s4
-; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
 ; GFX90A-NEXT:    s_xor_b64 s[8:9], s[8:9], s[4:5]
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v2, s8
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v3, s9
+; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GFX90A-NEXT:    v_cvt_f32_u32_e32 v3, s8
+; GFX90A-NEXT:    v_cvt_f32_u32_e32 v5, s9
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
 ; GFX90A-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; GFX90A-NEXT:    s_sub_u32 s10, 0, s8
-; GFX90A-NEXT:    v_mac_f32_e32 v2, s16, v3
-; GFX90A-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX90A-NEXT:    v_mac_f32_e32 v3, s16, v5
+; GFX90A-NEXT:    v_rcp_f32_e32 v3, v3
 ; GFX90A-NEXT:    v_xor_b32_e32 v1, s1, v1
-; GFX90A-NEXT:    v_mov_b32_e32 v5, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v6, s1
 ; GFX90A-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v0
-; GFX90A-NEXT:    v_mul_f32_e32 v2, s17, v2
-; GFX90A-NEXT:    v_mul_f32_e32 v3, s18, v2
-; GFX90A-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX90A-NEXT:    v_mac_f32_e32 v2, s19, v3
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX90A-NEXT:    v_mul_f32_e32 v3, s17, v3
+; GFX90A-NEXT:    v_mul_f32_e32 v5, s18, v3
+; GFX90A-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX90A-NEXT:    v_mac_f32_e32 v3, s19, v5
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX90A-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GFX90A-NEXT:    s_subb_u32 s11, 0, s9
-; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v7, s10, v2
-; GFX90A-NEXT:    v_mul_lo_u32 v8, s10, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s11, v2
+; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v7, s10, v3
+; GFX90A-NEXT:    v_mul_lo_u32 v8, s10, v5
+; GFX90A-NEXT:    v_mul_lo_u32 v6, s11, v3
 ; GFX90A-NEXT:    v_add_u32_e32 v7, v7, v8
-; GFX90A-NEXT:    v_add_u32_e32 v5, v7, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v9, s10, v2
-; GFX90A-NEXT:    v_mul_lo_u32 v8, v2, v5
-; GFX90A-NEXT:    v_mul_hi_u32 v10, v2, v9
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v2, v5
+; GFX90A-NEXT:    v_add_u32_e32 v6, v7, v6
+; GFX90A-NEXT:    v_mul_lo_u32 v9, s10, v3
+; GFX90A-NEXT:    v_mul_lo_u32 v8, v3, v6
+; GFX90A-NEXT:    v_mul_hi_u32 v10, v3, v9
+; GFX90A-NEXT:    v_mul_hi_u32 v7, v3, v6
 ; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v10, v8
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v11, v3, v9
-; GFX90A-NEXT:    v_mul_lo_u32 v9, v3, v9
+; GFX90A-NEXT:    v_mul_hi_u32 v11, v5, v9
+; GFX90A-NEXT:    v_mul_lo_u32 v9, v5, v9
 ; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v9
-; GFX90A-NEXT:    v_mul_hi_u32 v10, v3, v5
+; GFX90A-NEXT:    v_mul_hi_u32 v10, v5, v6
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v11, vcc
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v10, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v3, v5
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v6, v8, vcc
-; GFX90A-NEXT:    v_add_co_u32_e64 v2, s[0:1], v2, v5
-; GFX90A-NEXT:    v_addc_co_u32_e64 v5, vcc, v3, v7, s[0:1]
-; GFX90A-NEXT:    v_mul_lo_u32 v8, s10, v5
-; GFX90A-NEXT:    v_mul_hi_u32 v9, s10, v2
+; GFX90A-NEXT:    v_mul_lo_u32 v6, v5, v6
+; GFX90A-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
+; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v2, v8, vcc
+; GFX90A-NEXT:    v_add_co_u32_e64 v3, s[0:1], v3, v6
+; GFX90A-NEXT:    v_addc_co_u32_e64 v6, vcc, v5, v7, s[0:1]
+; GFX90A-NEXT:    v_mul_lo_u32 v8, s10, v6
+; GFX90A-NEXT:    v_mul_hi_u32 v9, s10, v3
 ; GFX90A-NEXT:    v_add_u32_e32 v8, v9, v8
-; GFX90A-NEXT:    v_mul_lo_u32 v9, s11, v2
+; GFX90A-NEXT:    v_mul_lo_u32 v9, s11, v3
 ; GFX90A-NEXT:    v_add_u32_e32 v8, v8, v9
-; GFX90A-NEXT:    v_mul_lo_u32 v10, s10, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v11, v5, v10
-; GFX90A-NEXT:    v_mul_lo_u32 v12, v5, v10
-; GFX90A-NEXT:    v_mul_lo_u32 v14, v2, v8
-; GFX90A-NEXT:    v_mul_hi_u32 v10, v2, v10
-; GFX90A-NEXT:    v_mul_hi_u32 v13, v2, v8
+; GFX90A-NEXT:    v_mul_lo_u32 v10, s10, v3
+; GFX90A-NEXT:    v_mul_hi_u32 v11, v6, v10
+; GFX90A-NEXT:    v_mul_lo_u32 v12, v6, v10
+; GFX90A-NEXT:    v_mul_lo_u32 v14, v3, v8
+; GFX90A-NEXT:    v_mul_hi_u32 v10, v3, v10
+; GFX90A-NEXT:    v_mul_hi_u32 v13, v3, v8
 ; GFX90A-NEXT:    v_add_co_u32_e32 v10, vcc, v10, v14
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v13, vcc
 ; GFX90A-NEXT:    v_add_co_u32_e32 v10, vcc, v10, v12
-; GFX90A-NEXT:    v_mul_hi_u32 v9, v5, v8
+; GFX90A-NEXT:    v_mul_hi_u32 v9, v6, v8
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v10, vcc, v13, v11, vcc
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v5, v8
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v10, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v6, v9, vcc
-; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v7
+; GFX90A-NEXT:    v_mul_lo_u32 v6, v6, v8
+; GFX90A-NEXT:    v_add_co_u32_e32 v6, vcc, v10, v6
+; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v2, v9, vcc
+; GFX90A-NEXT:    v_add_u32_e32 v5, v5, v7
 ; GFX90A-NEXT:    s_ashr_i32 s10, s7, 31
-; GFX90A-NEXT:    v_addc_co_u32_e64 v3, vcc, v3, v8, s[0:1]
+; GFX90A-NEXT:    v_addc_co_u32_e64 v5, vcc, v5, v8, s[0:1]
 ; GFX90A-NEXT:    s_add_u32 s0, s6, s10
-; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
+; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
 ; GFX90A-NEXT:    s_mov_b32 s11, s10
 ; GFX90A-NEXT:    s_addc_u32 s1, s7, s10
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
 ; GFX90A-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
-; GFX90A-NEXT:    v_mul_lo_u32 v7, s6, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v8, s6, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s6, v3
+; GFX90A-NEXT:    v_mul_lo_u32 v7, s6, v5
+; GFX90A-NEXT:    v_mul_hi_u32 v8, s6, v3
+; GFX90A-NEXT:    v_mul_hi_u32 v6, s6, v5
 ; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v7
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v9, s7, v2
-; GFX90A-NEXT:    v_mul_lo_u32 v2, s7, v2
-; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v7, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v8, s7, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, v5, v9, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v4, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v9, s7, v3
 ; GFX90A-NEXT:    v_mul_lo_u32 v3, s7, v3
-; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v5, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s8, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v6, s8, v2
+; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v7, v3
+; GFX90A-NEXT:    v_mul_hi_u32 v8, s7, v5
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v9, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, v8, v4, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v5, s7, v5
+; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, v2, v6, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v5, s8, v2
+; GFX90A-NEXT:    v_mul_hi_u32 v6, s8, v3
 ; GFX90A-NEXT:    v_add_u32_e32 v5, v6, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v6, s9, v2
+; GFX90A-NEXT:    v_mul_lo_u32 v6, s9, v3
 ; GFX90A-NEXT:    v_add_u32_e32 v5, v5, v6
-; GFX90A-NEXT:    v_mul_lo_u32 v7, s8, v2
+; GFX90A-NEXT:    v_mul_lo_u32 v7, s8, v3
 ; GFX90A-NEXT:    v_sub_u32_e32 v6, s7, v5
 ; GFX90A-NEXT:    v_mov_b32_e32 v8, s9
 ; GFX90A-NEXT:    v_sub_co_u32_e32 v7, vcc, s6, v7
@@ -12909,18 +12909,18 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v6, 1, 2, s[0:1]
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v5
-; GFX90A-NEXT:    v_add_co_u32_e64 v6, s[0:1], v2, v6
+; GFX90A-NEXT:    v_add_co_u32_e64 v6, s[0:1], v3, v6
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v9, v7, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e64 v8, s[0:1], 0, v3, s[0:1]
+; GFX90A-NEXT:    v_addc_co_u32_e64 v8, s[0:1], 0, v2, s[0:1]
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
 ; GFX90A-NEXT:    s_xor_b64 s[0:1], s[10:11], s[4:5]
-; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
-; GFX90A-NEXT:    v_xor_b32_e32 v2, s0, v2
-; GFX90A-NEXT:    v_xor_b32_e32 v3, s1, v3
-; GFX90A-NEXT:    v_mov_b32_e32 v5, s1
-; GFX90A-NEXT:    v_subrev_co_u32_e32 v2, vcc, s0, v2
-; GFX90A-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v5, vcc
+; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
+; GFX90A-NEXT:    v_xor_b32_e32 v3, s0, v3
+; GFX90A-NEXT:    v_xor_b32_e32 v5, s1, v2
+; GFX90A-NEXT:    v_mov_b32_e32 v6, s1
+; GFX90A-NEXT:    v_subrev_co_u32_e32 v2, vcc, s0, v3
+; GFX90A-NEXT:    v_subb_co_u32_e32 v3, vcc, v5, v6, vcc
 ; GFX90A-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
 ; GFX90A-NEXT:    s_endpgm
   %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
@@ -13910,211 +13910,211 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_mul_f32_e32 v1, s20, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mac_f32_e32 v0, s21, v1
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v1
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_ashr_i32 s12, s9, 31
 ; GFX6-NEXT:    s_add_u32 s0, s8, s12
-; GFX6-NEXT:    v_mul_lo_u32 v2, s6, v1
-; GFX6-NEXT:    v_mul_hi_u32 v3, s6, v0
-; GFX6-NEXT:    v_mul_lo_u32 v4, s7, v0
-; GFX6-NEXT:    v_mul_lo_u32 v5, s6, v0
+; GFX6-NEXT:    v_mul_lo_u32 v0, s6, v2
+; GFX6-NEXT:    v_mul_hi_u32 v1, s6, v3
+; GFX6-NEXT:    v_mul_lo_u32 v4, s7, v3
+; GFX6-NEXT:    v_mul_lo_u32 v5, s6, v3
 ; GFX6-NEXT:    s_mov_b32 s13, s12
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; GFX6-NEXT:    v_mul_lo_u32 v3, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v4, v0, v5
-; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
-; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v6, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v5
-; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v5
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v0, v4
+; GFX6-NEXT:    v_mul_lo_u32 v0, v3, v1
+; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v5
+; GFX6-NEXT:    v_mul_hi_u32 v6, v3, v1
+; GFX6-NEXT:    v_mul_hi_u32 v7, v2, v1
 ; GFX6-NEXT:    s_addc_u32 s1, s9, s12
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v6, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v6, v2, v5
+; GFX6-NEXT:    v_mul_hi_u32 v5, v2, v5
 ; GFX6-NEXT:    s_xor_b64 s[8:9], s[0:1], s[12:13]
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
-; GFX6-NEXT:    v_mov_b32_e32 v4, 0
-; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
-; GFX6-NEXT:    v_mov_b32_e32 v6, 0
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
-; GFX6-NEXT:    v_add_i32_e64 v0, s[2:3], v0, v2
-; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[2:3]
-; GFX6-NEXT:    v_mul_lo_u32 v5, s6, v2
-; GFX6-NEXT:    v_mul_hi_u32 v7, s6, v0
-; GFX6-NEXT:    v_mul_lo_u32 v8, s7, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v4, v5, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v5, v2, v1
+; GFX6-NEXT:    v_mov_b32_e32 v0, 0
+; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, v7, v0, vcc
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v1, v6, vcc
+; GFX6-NEXT:    v_add_i32_e64 v3, s[2:3], v3, v4
+; GFX6-NEXT:    v_addc_u32_e64 v4, vcc, v2, v5, s[2:3]
+; GFX6-NEXT:    v_mul_lo_u32 v6, s6, v4
+; GFX6-NEXT:    v_mul_hi_u32 v7, s6, v3
+; GFX6-NEXT:    v_mul_lo_u32 v8, s7, v3
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GFX6-NEXT:    v_mul_lo_u32 v7, s6, v0
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; GFX6-NEXT:    v_mul_lo_u32 v10, v0, v5
-; GFX6-NEXT:    v_mul_hi_u32 v11, v0, v7
-; GFX6-NEXT:    v_mul_hi_u32 v12, v0, v5
-; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v7
-; GFX6-NEXT:    v_mul_lo_u32 v7, v2, v7
-; GFX6-NEXT:    v_mul_hi_u32 v8, v2, v5
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GFX6-NEXT:    v_mul_lo_u32 v7, s6, v3
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
+; GFX6-NEXT:    v_mul_lo_u32 v10, v3, v6
+; GFX6-NEXT:    v_mul_hi_u32 v11, v3, v7
+; GFX6-NEXT:    v_mul_hi_u32 v12, v3, v6
+; GFX6-NEXT:    v_mul_hi_u32 v9, v4, v7
+; GFX6-NEXT:    v_mul_lo_u32 v7, v4, v7
+; GFX6-NEXT:    v_mul_hi_u32 v8, v4, v6
 ; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; GFX6-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v5
+; GFX6-NEXT:    v_mul_lo_u32 v4, v4, v6
 ; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
 ; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
-; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[2:3]
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, s8, v1
-; GFX6-NEXT:    v_mul_hi_u32 v3, s8, v0
-; GFX6-NEXT:    v_mul_hi_u32 v5, s8, v1
-; GFX6-NEXT:    v_mul_hi_u32 v7, s9, v1
-; GFX6-NEXT:    v_mul_lo_u32 v1, s9, v1
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v5, s9, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, s9, v0
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, v8, v0, vcc
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, v1, v6, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
-; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v1, s16, v1
-; GFX6-NEXT:    v_mul_hi_u32 v2, s16, v0
-; GFX6-NEXT:    v_mul_lo_u32 v3, s17, v0
-; GFX6-NEXT:    v_mul_lo_u32 v0, s16, v0
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s9, v1
-; GFX6-NEXT:    v_mov_b32_e32 v3, s17
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s8, v0
-; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
-; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s16, v0
-; GFX6-NEXT:    v_subbrev_u32_e64 v7, s[2:3], 0, v2, s[0:1]
+; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v2, v6, s[2:3]
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v4, s8, v2
+; GFX6-NEXT:    v_mul_hi_u32 v5, s8, v3
+; GFX6-NEXT:    v_mul_hi_u32 v6, s8, v2
+; GFX6-NEXT:    v_mul_hi_u32 v7, s9, v2
+; GFX6-NEXT:    v_mul_lo_u32 v2, s9, v2
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v6, s9, v3
+; GFX6-NEXT:    v_mul_hi_u32 v3, s9, v3
+; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v7, v0, vcc
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v1, v4, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v3, s16, v3
+; GFX6-NEXT:    v_mul_hi_u32 v4, s16, v2
+; GFX6-NEXT:    v_mul_lo_u32 v5, s17, v2
+; GFX6-NEXT:    v_mul_lo_u32 v2, s16, v2
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s9, v3
+; GFX6-NEXT:    v_mov_b32_e32 v5, s17
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s8, v2
+; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
+; GFX6-NEXT:    v_subrev_i32_e64 v6, s[0:1], s16, v2
+; GFX6-NEXT:    v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1]
 ; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s17, v7
 ; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s16, v5
-; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s16, v6
+; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, s[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
 ; GFX6-NEXT:    v_cmp_eq_u32_e64 s[2:3], s17, v7
-; GFX6-NEXT:    v_subrev_i32_e64 v3, s[0:1], s16, v5
+; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s16, v6
 ; GFX6-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
-; GFX6-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
+; GFX6-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
 ; GFX6-NEXT:    s_ashr_i32 s2, s15, 31
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
 ; GFX6-NEXT:    s_add_u32 s8, s14, s2
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v7, v2, s[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, v7, v4, s[0:1]
 ; GFX6-NEXT:    v_mov_b32_e32 v7, s9
 ; GFX6-NEXT:    s_mov_b32 s3, s2
 ; GFX6-NEXT:    s_addc_u32 s9, s15, s2
 ; GFX6-NEXT:    s_xor_b64 s[8:9], s[8:9], s[2:3]
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v8, s8
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v9, s9
-; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v7, v1, vcc
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s17, v1
+; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v7, v3, vcc
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s17, v3
 ; GFX6-NEXT:    v_mac_f32_e32 v8, s18, v9
 ; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s16, v0
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s16, v2
 ; GFX6-NEXT:    v_rcp_f32_e32 v8, v8
 ; GFX6-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s17, v1
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s17, v3
 ; GFX6-NEXT:    v_cndmask_b32_e32 v7, v7, v10, vcc
 ; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v5, v3, s[0:1]
-; GFX6-NEXT:    v_mul_f32_e32 v3, s19, v8
-; GFX6-NEXT:    v_mul_f32_e32 v5, s20, v3
-; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX6-NEXT:    v_mac_f32_e32 v3, s21, v5
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
+; GFX6-NEXT:    v_mul_f32_e32 v5, s19, v8
+; GFX6-NEXT:    v_mul_f32_e32 v6, s20, v5
+; GFX6-NEXT:    v_trunc_f32_e32 v6, v6
+; GFX6-NEXT:    v_mac_f32_e32 v5, s21, v6
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; GFX6-NEXT:    s_sub_u32 s2, 0, s8
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX6-NEXT:    v_mul_hi_u32 v2, s2, v3
-; GFX6-NEXT:    v_mul_lo_u32 v7, s2, v5
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX6-NEXT:    v_mul_hi_u32 v4, s2, v5
+; GFX6-NEXT:    v_mul_lo_u32 v7, s2, v6
 ; GFX6-NEXT:    s_subb_u32 s3, 0, s9
-; GFX6-NEXT:    v_mul_lo_u32 v8, s3, v3
+; GFX6-NEXT:    v_mul_lo_u32 v8, s3, v5
 ; GFX6-NEXT:    s_ashr_i32 s14, s11, 31
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
-; GFX6-NEXT:    v_mul_lo_u32 v7, s2, v3
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
-; GFX6-NEXT:    v_mul_lo_u32 v8, v3, v2
-; GFX6-NEXT:    v_mul_hi_u32 v9, v3, v7
-; GFX6-NEXT:    v_mul_hi_u32 v10, v3, v2
-; GFX6-NEXT:    v_mul_hi_u32 v11, v5, v2
-; GFX6-NEXT:    v_mul_lo_u32 v2, v5, v2
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
+; GFX6-NEXT:    v_mul_lo_u32 v7, s2, v5
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; GFX6-NEXT:    v_mul_lo_u32 v8, v5, v4
+; GFX6-NEXT:    v_mul_hi_u32 v9, v5, v7
+; GFX6-NEXT:    v_mul_hi_u32 v10, v5, v4
+; GFX6-NEXT:    v_mul_hi_u32 v11, v6, v4
+; GFX6-NEXT:    v_mul_lo_u32 v4, v6, v4
 ; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v10, v5, v7
-; GFX6-NEXT:    v_mul_hi_u32 v7, v5, v7
+; GFX6-NEXT:    v_mul_lo_u32 v10, v6, v7
+; GFX6-NEXT:    v_mul_hi_u32 v7, v6, v7
 ; GFX6-NEXT:    s_mov_b32 s15, s14
-; GFX6-NEXT:    v_xor_b32_e32 v0, s12, v0
+; GFX6-NEXT:    v_xor_b32_e32 v2, s12, v2
 ; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v9, v7, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v11, v4, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
-; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v6, v8, vcc
-; GFX6-NEXT:    v_add_i32_e64 v2, s[0:1], v3, v2
-; GFX6-NEXT:    v_addc_u32_e64 v3, vcc, v5, v7, s[0:1]
-; GFX6-NEXT:    v_mul_lo_u32 v8, s2, v3
-; GFX6-NEXT:    v_mul_hi_u32 v9, s2, v2
-; GFX6-NEXT:    v_mul_lo_u32 v10, s3, v2
-; GFX6-NEXT:    v_xor_b32_e32 v1, s12, v1
+; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v11, v0, vcc
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v1, v8, vcc
+; GFX6-NEXT:    v_add_i32_e64 v4, s[0:1], v5, v4
+; GFX6-NEXT:    v_addc_u32_e64 v5, vcc, v6, v7, s[0:1]
+; GFX6-NEXT:    v_mul_lo_u32 v8, s2, v5
+; GFX6-NEXT:    v_mul_hi_u32 v9, s2, v4
+; GFX6-NEXT:    v_mul_lo_u32 v10, s3, v4
+; GFX6-NEXT:    v_xor_b32_e32 v3, s12, v3
 ; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GFX6-NEXT:    v_mul_lo_u32 v9, s2, v2
+; GFX6-NEXT:    v_mul_lo_u32 v9, s2, v4
 ; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
-; GFX6-NEXT:    v_mul_lo_u32 v12, v2, v8
-; GFX6-NEXT:    v_mul_hi_u32 v13, v2, v9
-; GFX6-NEXT:    v_mul_hi_u32 v14, v2, v8
-; GFX6-NEXT:    v_mul_hi_u32 v11, v3, v9
-; GFX6-NEXT:    v_mul_lo_u32 v9, v3, v9
-; GFX6-NEXT:    v_mul_hi_u32 v10, v3, v8
+; GFX6-NEXT:    v_mul_lo_u32 v12, v4, v8
+; GFX6-NEXT:    v_mul_hi_u32 v13, v4, v9
+; GFX6-NEXT:    v_mul_hi_u32 v14, v4, v8
+; GFX6-NEXT:    v_mul_hi_u32 v11, v5, v9
+; GFX6-NEXT:    v_mul_lo_u32 v9, v5, v9
+; GFX6-NEXT:    v_mul_hi_u32 v10, v5, v8
 ; GFX6-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
 ; GFX6-NEXT:    v_addc_u32_e32 v13, vcc, 0, v14, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v3, v3, v8
+; GFX6-NEXT:    v_mul_lo_u32 v5, v5, v8
 ; GFX6-NEXT:    v_add_i32_e32 v9, vcc, v12, v9
 ; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v13, v11, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v10, v4, vcc
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v9, v3
-; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v6, v8, vcc
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; GFX6-NEXT:    v_addc_u32_e64 v5, vcc, v5, v8, s[0:1]
+; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v10, v0, vcc
+; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
+; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v1, v8, vcc
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; GFX6-NEXT:    v_addc_u32_e64 v6, vcc, v6, v8, s[0:1]
 ; GFX6-NEXT:    s_add_u32 s0, s10, s14
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; GFX6-NEXT:    s_addc_u32 s1, s11, s14
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
 ; GFX6-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
-; GFX6-NEXT:    v_mul_lo_u32 v5, s10, v3
-; GFX6-NEXT:    v_mul_hi_u32 v7, s10, v2
-; GFX6-NEXT:    v_mul_hi_u32 v9, s10, v3
-; GFX6-NEXT:    v_mul_hi_u32 v10, s11, v3
-; GFX6-NEXT:    v_mul_lo_u32 v3, s11, v3
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
+; GFX6-NEXT:    v_mul_lo_u32 v6, s10, v5
+; GFX6-NEXT:    v_mul_hi_u32 v7, s10, v4
+; GFX6-NEXT:    v_mul_hi_u32 v9, s10, v5
+; GFX6-NEXT:    v_mul_hi_u32 v10, s11, v5
+; GFX6-NEXT:    v_mul_lo_u32 v5, s11, v5
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v9, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v9, s11, v2
-; GFX6-NEXT:    v_mul_hi_u32 v2, s11, v2
+; GFX6-NEXT:    v_mul_lo_u32 v9, s11, v4
+; GFX6-NEXT:    v_mul_hi_u32 v4, s11, v4
 ; GFX6-NEXT:    v_mov_b32_e32 v8, s12
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
-; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v7, v2, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v10, v4, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v4, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v3, s8, v3
-; GFX6-NEXT:    v_mul_hi_u32 v4, s8, v2
-; GFX6-NEXT:    v_mul_lo_u32 v5, s9, v2
-; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s12, v0
-; GFX6-NEXT:    v_mul_lo_u32 v2, s8, v2
-; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v8, vcc
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s11, v3
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v7, v4, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v10, v0, vcc
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v1, v0, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v5, s8, v0
+; GFX6-NEXT:    v_mul_hi_u32 v6, s8, v4
+; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s12, v2
+; GFX6-NEXT:    v_mul_lo_u32 v2, s9, v4
+; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v3, v8, vcc
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v6, v5
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_mul_lo_u32 v3, s8, v4
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s11, v2
 ; GFX6-NEXT:    v_mov_b32_e32 v5, s9
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s10, v2
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s10, v3
 ; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
-; GFX6-NEXT:    v_subrev_i32_e64 v6, s[0:1], s8, v2
+; GFX6-NEXT:    v_subrev_i32_e64 v6, s[0:1], s8, v3
 ; GFX6-NEXT:    v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1]
 ; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s9, v7
 ; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
@@ -14128,22 +14128,22 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
 ; GFX6-NEXT:    v_cndmask_b32_e64 v4, v7, v4, s[0:1]
 ; GFX6-NEXT:    v_mov_b32_e32 v7, s11
-; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v7, v3, vcc
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
+; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v7, v2, vcc
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
 ; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v3
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
 ; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX6-NEXT:    v_xor_b32_e32 v2, s14, v2
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX6-NEXT:    v_xor_b32_e32 v3, s14, v3
-; GFX6-NEXT:    v_mov_b32_e32 v4, s14
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s14, v2
-; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
+; GFX6-NEXT:    v_xor_b32_e32 v4, s14, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, s14
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s14, v3
+; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v4, v5, vcc
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -14169,246 +14169,246 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    s_subb_u32 s4, 0, s13
 ; GFX9-NEXT:    v_mac_f32_e32 v0, s16, v1
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, s17, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, s18, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mac_f32_e32 v0, s19, v1
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_mul_lo_u32 v2, s8, v1
-; GFX9-NEXT:    v_mul_hi_u32 v3, s8, v0
-; GFX9-NEXT:    v_mul_lo_u32 v5, s4, v0
-; GFX9-NEXT:    v_mul_lo_u32 v4, s8, v0
-; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
-; GFX9-NEXT:    v_mul_lo_u32 v5, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v4
-; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v6, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
-; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
-; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
-; GFX9-NEXT:    v_mul_lo_u32 v4, s8, v2
-; GFX9-NEXT:    v_mul_hi_u32 v7, s8, v0
-; GFX9-NEXT:    v_mul_lo_u32 v8, s4, v0
-; GFX9-NEXT:    v_mul_lo_u32 v9, s8, v0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v1
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v0
+; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v2
+; GFX9-NEXT:    v_mul_hi_u32 v1, s8, v3
+; GFX9-NEXT:    v_mul_lo_u32 v5, s4, v3
+; GFX9-NEXT:    v_mul_lo_u32 v4, s8, v3
+; GFX9-NEXT:    v_add_u32_e32 v0, v1, v0
+; GFX9-NEXT:    v_add_u32_e32 v5, v0, v5
+; GFX9-NEXT:    v_mul_hi_u32 v1, v3, v4
+; GFX9-NEXT:    v_mul_lo_u32 v6, v3, v5
+; GFX9-NEXT:    v_mul_hi_u32 v7, v3, v5
+; GFX9-NEXT:    v_mul_hi_u32 v8, v2, v5
+; GFX9-NEXT:    v_mul_lo_u32 v5, v2, v5
+; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v7, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v7, v2, v4
+; GFX9-NEXT:    v_mul_hi_u32 v4, v2, v4
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v7
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v4, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v8, v0, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v1, v6, vcc
+; GFX9-NEXT:    v_add_co_u32_e64 v3, s[2:3], v3, v4
+; GFX9-NEXT:    v_addc_co_u32_e64 v4, vcc, v2, v5, s[2:3]
+; GFX9-NEXT:    v_mul_lo_u32 v6, s8, v4
+; GFX9-NEXT:    v_mul_hi_u32 v7, s8, v3
+; GFX9-NEXT:    v_mul_lo_u32 v8, s4, v3
+; GFX9-NEXT:    v_mul_lo_u32 v9, s8, v3
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX9-NEXT:    v_add_u32_e32 v4, v7, v4
-; GFX9-NEXT:    v_add_u32_e32 v4, v4, v8
-; GFX9-NEXT:    v_mul_lo_u32 v10, v0, v4
-; GFX9-NEXT:    v_mul_hi_u32 v11, v0, v9
-; GFX9-NEXT:    v_mul_hi_u32 v12, v0, v4
-; GFX9-NEXT:    v_mul_hi_u32 v8, v2, v9
-; GFX9-NEXT:    v_mul_lo_u32 v9, v2, v9
-; GFX9-NEXT:    v_mul_hi_u32 v7, v2, v4
+; GFX9-NEXT:    v_add_u32_e32 v6, v7, v6
+; GFX9-NEXT:    v_add_u32_e32 v6, v6, v8
+; GFX9-NEXT:    v_mul_lo_u32 v10, v3, v6
+; GFX9-NEXT:    v_mul_hi_u32 v11, v3, v9
+; GFX9-NEXT:    v_mul_hi_u32 v12, v3, v6
+; GFX9-NEXT:    v_mul_hi_u32 v8, v4, v9
+; GFX9-NEXT:    v_mul_lo_u32 v9, v4, v9
+; GFX9-NEXT:    v_mul_hi_u32 v7, v4, v6
 ; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
 ; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v12, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v4
+; GFX9-NEXT:    v_mul_lo_u32 v4, v4, v6
 ; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v10, v9
 ; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v8, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v6, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v5, v4, vcc
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v7, v0, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v8, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v1, v6, vcc
+; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_ashr_i32 s8, s5, 31
-; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3]
+; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v2, v6, s[2:3]
 ; GFX9-NEXT:    s_add_u32 s2, s4, s8
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
 ; GFX9-NEXT:    s_mov_b32 s9, s8
 ; GFX9-NEXT:    s_addc_u32 s3, s5, s8
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
 ; GFX9-NEXT:    s_xor_b64 s[14:15], s[2:3], s[8:9]
-; GFX9-NEXT:    v_mul_lo_u32 v2, s14, v1
-; GFX9-NEXT:    v_mul_hi_u32 v3, s14, v0
-; GFX9-NEXT:    v_mul_hi_u32 v4, s14, v1
-; GFX9-NEXT:    v_mul_hi_u32 v7, s15, v1
-; GFX9-NEXT:    v_mul_lo_u32 v1, s15, v1
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, s15, v0
-; GFX9-NEXT:    v_mul_hi_u32 v0, s15, v0
+; GFX9-NEXT:    v_mul_lo_u32 v4, s14, v2
+; GFX9-NEXT:    v_mul_hi_u32 v5, s14, v3
+; GFX9-NEXT:    v_mul_hi_u32 v6, s14, v2
+; GFX9-NEXT:    v_mul_hi_u32 v7, s15, v2
+; GFX9-NEXT:    v_mul_lo_u32 v2, s15, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v6, s15, v3
+; GFX9-NEXT:    v_mul_hi_u32 v3, s15, v3
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v7, v6, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v5, v2, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v1, s12, v1
-; GFX9-NEXT:    v_mul_hi_u32 v2, s12, v0
-; GFX9-NEXT:    v_mul_lo_u32 v3, s13, v0
-; GFX9-NEXT:    v_mul_lo_u32 v0, s12, v0
-; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT:    v_sub_u32_e32 v2, s15, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, s13
-; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s14, v0
-; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v2, v3, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s12, v0
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v7, s[2:3], 0, v2, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v3, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v0, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v4, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v3, s12, v3
+; GFX9-NEXT:    v_mul_hi_u32 v4, s12, v2
+; GFX9-NEXT:    v_mul_lo_u32 v5, s13, v2
+; GFX9-NEXT:    v_mul_lo_u32 v2, s12, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
+; GFX9-NEXT:    v_sub_u32_e32 v4, s15, v3
+; GFX9-NEXT:    v_mov_b32_e32 v5, s13
+; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, s14, v2
+; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v4, v5, vcc
+; GFX9-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s12, v2
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v7, s[2:3], 0, v4, s[0:1]
 ; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s13, v7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v4
-; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1]
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v6
+; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v4, v5, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s13, v7
-; GFX9-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s12, v4
+; GFX9-NEXT:    v_subrev_co_u32_e64 v5, s[0:1], s12, v6
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1]
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1]
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v7, v2, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v6, v5, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v7, v4, s[0:1]
 ; GFX9-NEXT:    s_ashr_i32 s0, s11, 31
 ; GFX9-NEXT:    s_add_u32 s2, s10, s0
 ; GFX9-NEXT:    s_mov_b32 s1, s0
 ; GFX9-NEXT:    s_addc_u32 s3, s11, s0
-; GFX9-NEXT:    v_mov_b32_e32 v4, s15
+; GFX9-NEXT:    v_mov_b32_e32 v6, s15
 ; GFX9-NEXT:    s_xor_b64 s[10:11], s[2:3], s[0:1]
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v4, v1, vcc
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s10
+; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v6, v3, vcc
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s10
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s11
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s13, v1
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s13, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
-; GFX9-NEXT:    v_mac_f32_e32 v4, s16, v7
-; GFX9-NEXT:    v_rcp_f32_e32 v4, v4
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s12, v2
+; GFX9-NEXT:    v_mac_f32_e32 v6, s16, v7
+; GFX9-NEXT:    v_rcp_f32_e32 v6, v6
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v8, v9, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v3, s17, v4
-; GFX9-NEXT:    v_mul_f32_e32 v4, s18, v3
-; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX9-NEXT:    v_mac_f32_e32 v3, s19, v4
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v5, s17, v6
+; GFX9-NEXT:    v_mul_f32_e32 v6, s18, v5
+; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
+; GFX9-NEXT:    v_mac_f32_e32 v5, s19, v6
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; GFX9-NEXT:    s_sub_u32 s2, 0, s10
 ; GFX9-NEXT:    s_subb_u32 s3, 0, s11
-; GFX9-NEXT:    v_mul_hi_u32 v7, s2, v3
-; GFX9-NEXT:    v_mul_lo_u32 v8, s2, v4
-; GFX9-NEXT:    v_mul_lo_u32 v9, s3, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v3
+; GFX9-NEXT:    v_mul_hi_u32 v7, s2, v5
+; GFX9-NEXT:    v_mul_lo_u32 v8, s2, v6
+; GFX9-NEXT:    v_mul_lo_u32 v9, s3, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v4, s2, v5
 ; GFX9-NEXT:    v_add_u32_e32 v7, v7, v8
 ; GFX9-NEXT:    v_add_u32_e32 v7, v7, v9
-; GFX9-NEXT:    v_mul_lo_u32 v8, v3, v7
-; GFX9-NEXT:    v_mul_hi_u32 v9, v3, v2
-; GFX9-NEXT:    v_mul_hi_u32 v10, v3, v7
-; GFX9-NEXT:    v_mul_hi_u32 v11, v4, v7
-; GFX9-NEXT:    v_mul_lo_u32 v7, v4, v7
+; GFX9-NEXT:    v_mul_lo_u32 v8, v5, v7
+; GFX9-NEXT:    v_mul_hi_u32 v9, v5, v4
+; GFX9-NEXT:    v_mul_hi_u32 v10, v5, v7
+; GFX9-NEXT:    v_mul_hi_u32 v11, v6, v7
+; GFX9-NEXT:    v_mul_lo_u32 v7, v6, v7
 ; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v9, v8
 ; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v10, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v10, v4, v2
-; GFX9-NEXT:    v_mul_hi_u32 v2, v4, v2
+; GFX9-NEXT:    v_mul_lo_u32 v10, v6, v4
+; GFX9-NEXT:    v_mul_hi_u32 v4, v6, v4
 ; GFX9-NEXT:    s_ashr_i32 s12, s7, 31
 ; GFX9-NEXT:    s_mov_b32 s13, s12
 ; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v10
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v9, v2, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v6, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v7
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v5, v8, vcc
-; GFX9-NEXT:    v_add_co_u32_e64 v2, s[0:1], v3, v2
-; GFX9-NEXT:    v_addc_co_u32_e64 v3, vcc, v4, v7, s[0:1]
-; GFX9-NEXT:    v_mul_lo_u32 v8, s2, v3
-; GFX9-NEXT:    v_mul_hi_u32 v9, s2, v2
-; GFX9-NEXT:    v_mul_lo_u32 v10, s3, v2
-; GFX9-NEXT:    v_mul_lo_u32 v11, s2, v2
-; GFX9-NEXT:    v_add_u32_e32 v4, v4, v7
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v4, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v0, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v7
+; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v1, v8, vcc
+; GFX9-NEXT:    v_add_co_u32_e64 v4, s[0:1], v5, v4
+; GFX9-NEXT:    v_addc_co_u32_e64 v5, vcc, v6, v7, s[0:1]
+; GFX9-NEXT:    v_mul_lo_u32 v8, s2, v5
+; GFX9-NEXT:    v_mul_hi_u32 v9, s2, v4
+; GFX9-NEXT:    v_mul_lo_u32 v10, s3, v4
+; GFX9-NEXT:    v_mul_lo_u32 v11, s2, v4
+; GFX9-NEXT:    v_add_u32_e32 v6, v6, v7
 ; GFX9-NEXT:    v_add_u32_e32 v8, v9, v8
 ; GFX9-NEXT:    v_add_u32_e32 v8, v8, v10
-; GFX9-NEXT:    v_mul_lo_u32 v12, v2, v8
-; GFX9-NEXT:    v_mul_hi_u32 v13, v2, v11
-; GFX9-NEXT:    v_mul_hi_u32 v14, v2, v8
-; GFX9-NEXT:    v_mul_hi_u32 v10, v3, v11
-; GFX9-NEXT:    v_mul_lo_u32 v11, v3, v11
-; GFX9-NEXT:    v_mul_hi_u32 v9, v3, v8
+; GFX9-NEXT:    v_mul_lo_u32 v12, v4, v8
+; GFX9-NEXT:    v_mul_hi_u32 v13, v4, v11
+; GFX9-NEXT:    v_mul_hi_u32 v14, v4, v8
+; GFX9-NEXT:    v_mul_hi_u32 v10, v5, v11
+; GFX9-NEXT:    v_mul_lo_u32 v11, v5, v11
+; GFX9-NEXT:    v_mul_hi_u32 v9, v5, v8
 ; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v13, v12
 ; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v14, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v8
+; GFX9-NEXT:    v_mul_lo_u32 v5, v5, v8
 ; GFX9-NEXT:    v_add_co_u32_e32 v11, vcc, v12, v11
 ; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, v13, v10, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v9, v6, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v10, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v5, v8, vcc
-; GFX9-NEXT:    v_addc_co_u32_e64 v4, vcc, v4, v8, s[0:1]
+; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v9, v0, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v10, v5
+; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v1, v8, vcc
+; GFX9-NEXT:    v_addc_co_u32_e64 v6, vcc, v6, v8, s[0:1]
 ; GFX9-NEXT:    s_add_u32 s0, s6, s12
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
 ; GFX9-NEXT:    s_addc_u32 s1, s7, s12
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
 ; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[12:13]
-; GFX9-NEXT:    v_mul_lo_u32 v4, s6, v3
-; GFX9-NEXT:    v_mul_hi_u32 v7, s6, v2
-; GFX9-NEXT:    v_mul_hi_u32 v9, s6, v3
-; GFX9-NEXT:    v_mul_hi_u32 v10, s7, v3
-; GFX9-NEXT:    v_mul_lo_u32 v3, s7, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v7, v4
+; GFX9-NEXT:    v_mul_lo_u32 v6, s6, v5
+; GFX9-NEXT:    v_mul_hi_u32 v7, s6, v4
+; GFX9-NEXT:    v_mul_hi_u32 v9, s6, v5
+; GFX9-NEXT:    v_mul_hi_u32 v10, s7, v5
+; GFX9-NEXT:    v_mul_lo_u32 v5, s7, v5
+; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
 ; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v9, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v9, s7, v2
-; GFX9-NEXT:    v_mul_hi_u32 v2, s7, v2
-; GFX9-NEXT:    v_xor_b32_e32 v0, s8, v0
-; GFX9-NEXT:    v_xor_b32_e32 v1, s8, v1
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v9
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v7, v2, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v10, v6, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, s10, v3
-; GFX9-NEXT:    v_mul_hi_u32 v4, s10, v2
-; GFX9-NEXT:    v_mul_lo_u32 v5, s11, v2
-; GFX9-NEXT:    v_mul_lo_u32 v2, s10, v2
+; GFX9-NEXT:    v_mul_lo_u32 v9, s7, v4
+; GFX9-NEXT:    v_mul_hi_u32 v4, s7, v4
+; GFX9-NEXT:    v_xor_b32_e32 v2, s8, v2
+; GFX9-NEXT:    v_xor_b32_e32 v3, s8, v3
+; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v9
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v4, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v10, v0, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v5, s10, v1
+; GFX9-NEXT:    v_mul_hi_u32 v6, s10, v4
+; GFX9-NEXT:    v_mul_lo_u32 v7, s11, v4
+; GFX9-NEXT:    v_mul_lo_u32 v4, s10, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v8, s8
-; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s8, v0
-; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v8, vcc
-; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
-; GFX9-NEXT:    v_sub_u32_e32 v4, s7, v3
-; GFX9-NEXT:    v_mov_b32_e32 v5, s11
-; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, s6, v2
-; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v4, v5, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e64 v7, s[0:1], s10, v2
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v8, s[2:3], 0, v4, s[0:1]
+; GFX9-NEXT:    v_subrev_co_u32_e32 v1, vcc, s8, v2
+; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v3, v8, vcc
+; GFX9-NEXT:    v_add_u32_e32 v3, v6, v5
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v7
+; GFX9-NEXT:    v_sub_u32_e32 v5, s7, v3
+; GFX9-NEXT:    v_mov_b32_e32 v6, s11
+; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, s6, v4
+; GFX9-NEXT:    v_subb_co_u32_e64 v5, s[0:1], v5, v6, vcc
+; GFX9-NEXT:    v_subrev_co_u32_e64 v7, s[0:1], s10, v4
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v8, s[2:3], 0, v5, s[0:1]
 ; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s11, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
 ; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s10, v7
-; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v4, v5, s[0:1]
+; GFX9-NEXT:    v_subb_co_u32_e64 v5, s[0:1], v5, v6, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[2:3]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s11, v8
-; GFX9-NEXT:    v_subrev_co_u32_e64 v5, s[0:1], s10, v7
+; GFX9-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s10, v7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v10, s[2:3]
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1]
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1]
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v9
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v7, v6, s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v7, s7
 ; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v7, v3, vcc
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[0:1]
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v8, v5, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v2, s12, v2
-; GFX9-NEXT:    v_xor_b32_e32 v3, s12, v3
-; GFX9-NEXT:    v_mov_b32_e32 v4, s12
-; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s12, v2
-; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v4, s12, v4
+; GFX9-NEXT:    v_xor_b32_e32 v5, s12, v3
+; GFX9-NEXT:    v_mov_b32_e32 v6, s12
+; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s12, v4
+; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, v5, v6, vcc
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dwordx4 v6, v[0:3], s[4:5]
+; GFX9-NEXT:    global_store_dwordx4 v0, v[1:4], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX90A-LABEL: srem_v2i64_pow2_shl_denom:
@@ -14462,186 +14462,186 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v2
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v8, vcc
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
-; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v5, vcc
-; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v2
-; GFX90A-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[0:1]
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s2, v2
+; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v2
+; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v2, v5, vcc
+; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v3
+; GFX90A-NEXT:    v_addc_co_u32_e64 v3, vcc, v1, v5, s[0:1]
+; GFX90A-NEXT:    v_mul_lo_u32 v6, s2, v3
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, s2, v0
-; GFX90A-NEXT:    v_add_u32_e32 v5, v7, v5
+; GFX90A-NEXT:    v_add_u32_e32 v6, v7, v6
 ; GFX90A-NEXT:    v_mul_lo_u32 v7, s3, v0
-; GFX90A-NEXT:    v_add_u32_e32 v5, v5, v7
+; GFX90A-NEXT:    v_add_u32_e32 v6, v6, v7
 ; GFX90A-NEXT:    v_mul_lo_u32 v8, s2, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v9, v2, v8
-; GFX90A-NEXT:    v_mul_lo_u32 v10, v2, v8
-; GFX90A-NEXT:    v_mul_lo_u32 v12, v0, v5
+; GFX90A-NEXT:    v_mul_hi_u32 v9, v3, v8
+; GFX90A-NEXT:    v_mul_lo_u32 v10, v3, v8
+; GFX90A-NEXT:    v_mul_lo_u32 v12, v0, v6
 ; GFX90A-NEXT:    v_mul_hi_u32 v8, v0, v8
-; GFX90A-NEXT:    v_mul_hi_u32 v11, v0, v5
+; GFX90A-NEXT:    v_mul_hi_u32 v11, v0, v6
 ; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v12
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v11, vcc
 ; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v10
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v2, v5
+; GFX90A-NEXT:    v_mul_hi_u32 v7, v3, v6
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v2, v2, v5
-; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v6, v7, vcc
-; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX90A-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1]
+; GFX90A-NEXT:    v_mul_lo_u32 v3, v3, v6
+; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v8, v3
+; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, v2, v7, vcc
+; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v5
+; GFX90A-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v6, s[0:1]
 ; GFX90A-NEXT:    s_add_u32 s0, s4, s14
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
 ; GFX90A-NEXT:    s_addc_u32 s1, s5, s14
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    s_xor_b64 s[4:5], s[0:1], s[14:15]
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s4, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s4, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v2, s4, v1
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v5, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v5, s4, v1
+; GFX90A-NEXT:    v_mul_hi_u32 v6, s4, v0
+; GFX90A-NEXT:    v_mul_hi_u32 v3, s4, v1
+; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v5
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, s5, v0
 ; GFX90A-NEXT:    v_mul_lo_u32 v0, s5, v0
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v3, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s5, v1
-; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, v2, v7, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, v5, v4, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v5, v0
+; GFX90A-NEXT:    v_mul_hi_u32 v6, s5, v1
+; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v7, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v4, vcc
 ; GFX90A-NEXT:    v_mul_lo_u32 v1, s5, v1
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v6, v2, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v3, vcc
 ; GFX90A-NEXT:    v_mul_lo_u32 v1, s12, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v2, s12, v0
-; GFX90A-NEXT:    v_add_u32_e32 v1, v2, v1
-; GFX90A-NEXT:    v_mul_lo_u32 v2, s13, v0
-; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v2
+; GFX90A-NEXT:    v_mul_hi_u32 v3, s12, v0
+; GFX90A-NEXT:    v_add_u32_e32 v1, v3, v1
+; GFX90A-NEXT:    v_mul_lo_u32 v3, s13, v0
+; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v3
 ; GFX90A-NEXT:    v_mul_lo_u32 v0, s12, v0
-; GFX90A-NEXT:    v_sub_u32_e32 v2, s5, v1
-; GFX90A-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90A-NEXT:    v_sub_u32_e32 v3, s5, v1
+; GFX90A-NEXT:    v_mov_b32_e32 v5, s13
 ; GFX90A-NEXT:    v_sub_co_u32_e32 v0, vcc, s4, v0
-; GFX90A-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v2, v3, vcc
-; GFX90A-NEXT:    v_subrev_co_u32_e64 v5, s[0:1], s12, v0
-; GFX90A-NEXT:    v_subbrev_co_u32_e64 v7, s[2:3], 0, v2, s[0:1]
+; GFX90A-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v5, vcc
+; GFX90A-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s12, v0
+; GFX90A-NEXT:    v_subbrev_co_u32_e64 v7, s[2:3], 0, v3, s[0:1]
 ; GFX90A-NEXT:    v_cmp_le_u32_e64 s[2:3], s13, v7
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
-; GFX90A-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v5
-; GFX90A-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1]
+; GFX90A-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v6
+; GFX90A-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v5, s[0:1]
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
 ; GFX90A-NEXT:    v_cmp_eq_u32_e64 s[2:3], s13, v7
-; GFX90A-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s12, v5
+; GFX90A-NEXT:    v_subrev_co_u32_e64 v5, s[0:1], s12, v6
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
-; GFX90A-NEXT:    v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1]
+; GFX90A-NEXT:    v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1]
 ; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
-; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[0:1]
-; GFX90A-NEXT:    v_mov_b32_e32 v5, s5
-; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v5, v1, vcc
+; GFX90A-NEXT:    v_cndmask_b32_e64 v5, v6, v5, s[0:1]
+; GFX90A-NEXT:    v_mov_b32_e32 v6, s5
+; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v6, v1, vcc
 ; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s13, v1
-; GFX90A-NEXT:    v_cndmask_b32_e64 v2, v7, v2, s[0:1]
-; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[0:1]
+; GFX90A-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
 ; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
 ; GFX90A-NEXT:    s_ashr_i32 s0, s11, 31
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v1
 ; GFX90A-NEXT:    s_add_u32 s2, s10, s0
-; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; GFX90A-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
 ; GFX90A-NEXT:    s_mov_b32 s1, s0
 ; GFX90A-NEXT:    s_addc_u32 s3, s11, s0
-; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
 ; GFX90A-NEXT:    s_xor_b64 s[4:5], s[2:3], s[0:1]
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v2, s4
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v3, s5
+; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX90A-NEXT:    v_cvt_f32_u32_e32 v3, s4
+; GFX90A-NEXT:    v_cvt_f32_u32_e32 v5, s5
 ; GFX90A-NEXT:    v_xor_b32_e32 v0, s14, v0
 ; GFX90A-NEXT:    s_sub_u32 s2, 0, s4
 ; GFX90A-NEXT:    v_xor_b32_e32 v1, s14, v1
-; GFX90A-NEXT:    v_mac_f32_e32 v2, s16, v3
-; GFX90A-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX90A-NEXT:    v_mov_b32_e32 v5, s14
+; GFX90A-NEXT:    v_mac_f32_e32 v3, s16, v5
+; GFX90A-NEXT:    v_rcp_f32_e32 v3, v3
+; GFX90A-NEXT:    v_mov_b32_e32 v6, s14
 ; GFX90A-NEXT:    v_subrev_co_u32_e32 v0, vcc, s14, v0
-; GFX90A-NEXT:    v_mul_f32_e32 v2, s17, v2
-; GFX90A-NEXT:    v_mul_f32_e32 v3, s18, v2
-; GFX90A-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX90A-NEXT:    v_mac_f32_e32 v2, s19, v3
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX90A-NEXT:    v_mul_f32_e32 v3, s17, v3
+; GFX90A-NEXT:    v_mul_f32_e32 v5, s18, v3
+; GFX90A-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX90A-NEXT:    v_mac_f32_e32 v3, s19, v5
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX90A-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GFX90A-NEXT:    s_subb_u32 s3, 0, s5
-; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v7, s2, v2
-; GFX90A-NEXT:    v_mul_lo_u32 v8, s2, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s3, v2
+; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v7, s2, v3
+; GFX90A-NEXT:    v_mul_lo_u32 v8, s2, v5
+; GFX90A-NEXT:    v_mul_lo_u32 v6, s3, v3
 ; GFX90A-NEXT:    v_add_u32_e32 v7, v7, v8
-; GFX90A-NEXT:    v_add_u32_e32 v5, v7, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v9, s2, v2
-; GFX90A-NEXT:    v_mul_lo_u32 v8, v2, v5
-; GFX90A-NEXT:    v_mul_hi_u32 v10, v2, v9
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v2, v5
+; GFX90A-NEXT:    v_add_u32_e32 v6, v7, v6
+; GFX90A-NEXT:    v_mul_lo_u32 v9, s2, v3
+; GFX90A-NEXT:    v_mul_lo_u32 v8, v3, v6
+; GFX90A-NEXT:    v_mul_hi_u32 v10, v3, v9
+; GFX90A-NEXT:    v_mul_hi_u32 v7, v3, v6
 ; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v10, v8
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v11, v3, v9
-; GFX90A-NEXT:    v_mul_lo_u32 v9, v3, v9
+; GFX90A-NEXT:    v_mul_hi_u32 v11, v5, v9
+; GFX90A-NEXT:    v_mul_lo_u32 v9, v5, v9
 ; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v9
-; GFX90A-NEXT:    v_mul_hi_u32 v10, v3, v5
+; GFX90A-NEXT:    v_mul_hi_u32 v10, v5, v6
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v11, vcc
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v10, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v3, v5
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v6, v8, vcc
-; GFX90A-NEXT:    v_add_co_u32_e64 v2, s[0:1], v2, v5
-; GFX90A-NEXT:    v_addc_co_u32_e64 v5, vcc, v3, v7, s[0:1]
-; GFX90A-NEXT:    v_mul_lo_u32 v8, s2, v5
-; GFX90A-NEXT:    v_mul_hi_u32 v9, s2, v2
+; GFX90A-NEXT:    v_mul_lo_u32 v6, v5, v6
+; GFX90A-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
+; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v2, v8, vcc
+; GFX90A-NEXT:    v_add_co_u32_e64 v3, s[0:1], v3, v6
+; GFX90A-NEXT:    v_addc_co_u32_e64 v6, vcc, v5, v7, s[0:1]
+; GFX90A-NEXT:    v_mul_lo_u32 v8, s2, v6
+; GFX90A-NEXT:    v_mul_hi_u32 v9, s2, v3
 ; GFX90A-NEXT:    v_add_u32_e32 v8, v9, v8
-; GFX90A-NEXT:    v_mul_lo_u32 v9, s3, v2
+; GFX90A-NEXT:    v_mul_lo_u32 v9, s3, v3
 ; GFX90A-NEXT:    v_add_u32_e32 v8, v8, v9
-; GFX90A-NEXT:    v_mul_lo_u32 v10, s2, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v11, v5, v10
-; GFX90A-NEXT:    v_mul_lo_u32 v12, v5, v10
-; GFX90A-NEXT:    v_mul_lo_u32 v14, v2, v8
-; GFX90A-NEXT:    v_mul_hi_u32 v10, v2, v10
-; GFX90A-NEXT:    v_mul_hi_u32 v13, v2, v8
+; GFX90A-NEXT:    v_mul_lo_u32 v10, s2, v3
+; GFX90A-NEXT:    v_mul_hi_u32 v11, v6, v10
+; GFX90A-NEXT:    v_mul_lo_u32 v12, v6, v10
+; GFX90A-NEXT:    v_mul_lo_u32 v14, v3, v8
+; GFX90A-NEXT:    v_mul_hi_u32 v10, v3, v10
+; GFX90A-NEXT:    v_mul_hi_u32 v13, v3, v8
 ; GFX90A-NEXT:    v_add_co_u32_e32 v10, vcc, v10, v14
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v13, vcc
 ; GFX90A-NEXT:    v_add_co_u32_e32 v10, vcc, v10, v12
-; GFX90A-NEXT:    v_mul_hi_u32 v9, v5, v8
+; GFX90A-NEXT:    v_mul_hi_u32 v9, v6, v8
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v10, vcc, v13, v11, vcc
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v5, v8
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v10, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v6, v9, vcc
-; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v7
+; GFX90A-NEXT:    v_mul_lo_u32 v6, v6, v8
+; GFX90A-NEXT:    v_add_co_u32_e32 v6, vcc, v10, v6
+; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v2, v9, vcc
+; GFX90A-NEXT:    v_add_u32_e32 v5, v5, v7
 ; GFX90A-NEXT:    s_ashr_i32 s10, s7, 31
-; GFX90A-NEXT:    v_addc_co_u32_e64 v3, vcc, v3, v8, s[0:1]
+; GFX90A-NEXT:    v_addc_co_u32_e64 v5, vcc, v5, v8, s[0:1]
 ; GFX90A-NEXT:    s_add_u32 s0, s6, s10
-; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
+; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
 ; GFX90A-NEXT:    s_mov_b32 s11, s10
 ; GFX90A-NEXT:    s_addc_u32 s1, s7, s10
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
 ; GFX90A-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
-; GFX90A-NEXT:    v_mul_lo_u32 v7, s6, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v8, s6, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s6, v3
+; GFX90A-NEXT:    v_mul_lo_u32 v7, s6, v5
+; GFX90A-NEXT:    v_mul_hi_u32 v8, s6, v3
+; GFX90A-NEXT:    v_mul_hi_u32 v6, s6, v5
 ; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v7
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v9, s7, v2
-; GFX90A-NEXT:    v_mul_lo_u32 v2, s7, v2
-; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v7, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v8, s7, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, v5, v9, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v4, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v9, s7, v3
 ; GFX90A-NEXT:    v_mul_lo_u32 v3, s7, v3
-; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v5, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s4, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s4, v2
-; GFX90A-NEXT:    v_add_u32_e32 v3, v5, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s5, v2
-; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v5
+; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v7, v3
+; GFX90A-NEXT:    v_mul_hi_u32 v8, s7, v5
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v9, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, v8, v4, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v5, s7, v5
+; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, v2, v6, vcc
 ; GFX90A-NEXT:    v_mul_lo_u32 v2, s4, v2
-; GFX90A-NEXT:    v_sub_u32_e32 v5, s7, v3
+; GFX90A-NEXT:    v_mul_hi_u32 v5, s4, v3
+; GFX90A-NEXT:    v_add_u32_e32 v2, v5, v2
+; GFX90A-NEXT:    v_mul_lo_u32 v5, s5, v3
+; GFX90A-NEXT:    v_add_u32_e32 v2, v2, v5
+; GFX90A-NEXT:    v_mul_lo_u32 v3, s4, v3
+; GFX90A-NEXT:    v_sub_u32_e32 v5, s7, v2
 ; GFX90A-NEXT:    v_mov_b32_e32 v6, s5
-; GFX90A-NEXT:    v_sub_co_u32_e32 v2, vcc, s6, v2
+; GFX90A-NEXT:    v_sub_co_u32_e32 v3, vcc, s6, v3
 ; GFX90A-NEXT:    v_subb_co_u32_e64 v5, s[0:1], v5, v6, vcc
-; GFX90A-NEXT:    v_subrev_co_u32_e64 v7, s[0:1], s4, v2
+; GFX90A-NEXT:    v_subrev_co_u32_e64 v7, s[0:1], s4, v3
 ; GFX90A-NEXT:    v_subbrev_co_u32_e64 v8, s[2:3], 0, v5, s[0:1]
 ; GFX90A-NEXT:    v_cmp_le_u32_e64 s[2:3], s5, v8
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
@@ -14655,22 +14655,22 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v9
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v6, v7, v6, s[0:1]
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, s7
-; GFX90A-NEXT:    v_subb_co_u32_e32 v3, vcc, v7, v3, vcc
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
+; GFX90A-NEXT:    v_subb_co_u32_e32 v2, vcc, v7, v2, vcc
+; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s5, v2
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s4, v2
+; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v5, v8, v5, s[0:1]
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s5, v3
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s5, v2
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
-; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX90A-NEXT:    v_xor_b32_e32 v2, s10, v2
+; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
+; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
 ; GFX90A-NEXT:    v_xor_b32_e32 v3, s10, v3
-; GFX90A-NEXT:    v_mov_b32_e32 v5, s10
-; GFX90A-NEXT:    v_subrev_co_u32_e32 v2, vcc, s10, v2
-; GFX90A-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v5, vcc
+; GFX90A-NEXT:    v_xor_b32_e32 v5, s10, v2
+; GFX90A-NEXT:    v_mov_b32_e32 v6, s10
+; GFX90A-NEXT:    v_subrev_co_u32_e32 v2, vcc, s10, v3
+; GFX90A-NEXT:    v_subb_co_u32_e32 v3, vcc, v5, v6, vcc
 ; GFX90A-NEXT:    global_store_dwordx4 v4, v[0:3], s[8:9]
 ; GFX90A-NEXT:    s_endpgm
   %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y

diff  --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
index 4b7b604d03afe..87b3debfc7723 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
@@ -327,27 +327,27 @@ define amdgpu_kernel void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0
 define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
 ; SI-LABEL: test_copy_v4i8_x2_extra_use:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
+; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s14, 0
-; SI-NEXT:    s_mov_b32 s15, s11
+; SI-NEXT:    s_mov_b32 s15, s3
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; SI-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
 ; SI-NEXT:    s_mov_b32 s16, 0xff00
 ; SI-NEXT:    s_movk_i32 s17, 0xff
-; SI-NEXT:    s_mov_b32 s10, -1
-; SI-NEXT:    s_mov_b32 s14, s10
-; SI-NEXT:    s_mov_b32 s8, s0
-; SI-NEXT:    s_mov_b32 s9, s1
-; SI-NEXT:    s_mov_b32 s12, s2
-; SI-NEXT:    s_mov_b32 s13, s3
-; SI-NEXT:    s_mov_b32 s6, s10
-; SI-NEXT:    s_mov_b32 s7, s11
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s14, s2
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    s_mov_b32 s12, s6
+; SI-NEXT:    s_mov_b32 s13, s7
+; SI-NEXT:    s_mov_b32 s10, s2
+; SI-NEXT:    s_mov_b32 s11, s3
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 9, v0
 ; SI-NEXT:    v_and_b32_e32 v2, s16, v0
@@ -363,7 +363,7 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %o
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x9000000, v1
 ; SI-NEXT:    buffer_store_dword v1, off, s[12:15], 0
-; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: test_copy_v4i8_x2_extra_use:

diff  --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 5b2b8b0a7bffe..895cf653caf9b 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -906,20 +906,20 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)
 define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
 ; SI-LABEL: load_v4i8_to_v4f32_2_uses:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xb
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; SI-NEXT:    s_mov_b32 s11, 0xf000
+; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, 0
-; SI-NEXT:    s_mov_b32 s3, s11
+; SI-NEXT:    s_mov_b32 s3, s7
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_mov_b32 s10, -1
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_movk_i32 s0, 0xff
-; SI-NEXT:    s_mov_b32 s6, s10
-; SI-NEXT:    s_mov_b32 s7, s11
+; SI-NEXT:    s_mov_b32 s10, s6
+; SI-NEXT:    s_mov_b32 s11, s7
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
 ; SI-NEXT:    v_lshrrev_b32_e32 v6, 24, v4
@@ -929,7 +929,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
 ; SI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v4
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v4
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 9, v4
-; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v0, s0, v4
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 9, v5
@@ -942,25 +942,25 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x9000000, v0
-; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: load_v4i8_to_v4f32_2_uses:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x2c
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT:    s_mov_b32 s11, 0xf000
-; VI-NEXT:    s_mov_b32 s10, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    v_mov_b32_e32 v5, 9
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v4, v[0:1]
-; VI-NEXT:    s_mov_b32 s6, s10
-; VI-NEXT:    s_mov_b32 s7, s11
+; VI-NEXT:    s_mov_b32 s10, s6
+; VI-NEXT:    s_mov_b32 s11, s7
 ; VI-NEXT:    s_movk_i32 s0, 0x900
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v6, 24, v4
@@ -971,7 +971,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
 ; VI-NEXT:    v_and_b32_e32 v7, 0xffffff00, v4
 ; VI-NEXT:    v_add_u16_e32 v8, 9, v4
 ; VI-NEXT:    v_add_u16_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; VI-NEXT:    s_nop 0
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v6
 ; VI-NEXT:    v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -980,7 +980,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
 ; VI-NEXT:    v_add_u16_e32 v0, s0, v0
 ; VI-NEXT:    v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
-; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: load_v4i8_to_v4f32_2_uses:

diff  --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index d16758d780914..061fb56be239f 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -1002,20 +1002,20 @@ define amdgpu_kernel void @frem_f64(double addrspace(1)* %out, double addrspace(
 define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
 ; SI-LABEL: fast_frem_f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, -1
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s8, s4
-; SI-NEXT:    s_mov_b32 s9, s5
-; SI-NEXT:    s_mov_b32 s4, s6
-; SI-NEXT:    s_mov_b32 s5, s7
-; SI-NEXT:    s_mov_b32 s6, s10
-; SI-NEXT:    s_mov_b32 s7, s11
-; SI-NEXT:    s_mov_b32 s2, s10
-; SI-NEXT:    s_mov_b32 s3, s11
-; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    s_mov_b32 s4, s8
+; SI-NEXT:    s_mov_b32 s5, s9
+; SI-NEXT:    s_mov_b32 s8, s10
+; SI-NEXT:    s_mov_b32 s9, s11
+; SI-NEXT:    s_mov_b32 s10, s6
+; SI-NEXT:    s_mov_b32 s11, s7
+; SI-NEXT:    s_mov_b32 s2, s6
+; SI-NEXT:    s_mov_b32 s3, s7
+; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
 ; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
@@ -1029,7 +1029,7 @@ define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrs
 ; SI-NEXT:    v_bfe_u32 v6, v5, 20, 11
 ; SI-NEXT:    v_add_i32_e32 v8, vcc, 0xfffffc01, v6
 ; SI-NEXT:    s_mov_b32 s1, 0xfffff
-; SI-NEXT:    s_mov_b32 s0, s10
+; SI-NEXT:    s_mov_b32 s0, s6
 ; SI-NEXT:    v_lshr_b64 v[6:7], s[0:1], v8
 ; SI-NEXT:    v_not_b32_e32 v6, v6
 ; SI-NEXT:    v_and_b32_e32 v6, v4, v6
@@ -1043,7 +1043,7 @@ define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrs
 ; SI-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
 ; SI-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[0:1]
 ; SI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: fast_frem_f64:
@@ -1160,20 +1160,20 @@ define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrs
 define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
 ; SI-LABEL: unsafe_frem_f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, -1
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s8, s4
-; SI-NEXT:    s_mov_b32 s9, s5
-; SI-NEXT:    s_mov_b32 s4, s6
-; SI-NEXT:    s_mov_b32 s5, s7
-; SI-NEXT:    s_mov_b32 s6, s10
-; SI-NEXT:    s_mov_b32 s7, s11
-; SI-NEXT:    s_mov_b32 s2, s10
-; SI-NEXT:    s_mov_b32 s3, s11
-; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    s_mov_b32 s4, s8
+; SI-NEXT:    s_mov_b32 s5, s9
+; SI-NEXT:    s_mov_b32 s8, s10
+; SI-NEXT:    s_mov_b32 s9, s11
+; SI-NEXT:    s_mov_b32 s10, s6
+; SI-NEXT:    s_mov_b32 s11, s7
+; SI-NEXT:    s_mov_b32 s2, s6
+; SI-NEXT:    s_mov_b32 s3, s7
+; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
 ; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
@@ -1187,7 +1187,7 @@ define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double add
 ; SI-NEXT:    v_bfe_u32 v6, v5, 20, 11
 ; SI-NEXT:    v_add_i32_e32 v8, vcc, 0xfffffc01, v6
 ; SI-NEXT:    s_mov_b32 s1, 0xfffff
-; SI-NEXT:    s_mov_b32 s0, s10
+; SI-NEXT:    s_mov_b32 s0, s6
 ; SI-NEXT:    v_lshr_b64 v[6:7], s[0:1], v8
 ; SI-NEXT:    v_not_b32_e32 v6, v6
 ; SI-NEXT:    v_and_b32_e32 v6, v4, v6
@@ -1201,7 +1201,7 @@ define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double add
 ; SI-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
 ; SI-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[0:1]
 ; SI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: unsafe_frem_f64:

diff  --git a/llvm/test/CodeGen/AMDGPU/greedy-global-heuristic.mir b/llvm/test/CodeGen/AMDGPU/greedy-global-heuristic.mir
new file mode 100644
index 0000000000000..777a730f56752
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/greedy-global-heuristic.mir
@@ -0,0 +1,250 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=greedy -verify-machineinstrs -o - %s |FileCheck %s
+
+# Testcase is limited to 24 VGPRs. Only a maximum of 6 vreg_128s can
+# be allocated at the same time.
+
+# This testcase is intended to stress the heuristic in
+# RAGreedy::enqueue to switch from local to global. If an interval is
+# in one basic block, the usual preference is to allocate registers in
+# instruction order. If the estimated live range length is more than
+# twice the number of registers in the class, the global heuristic is
+# used, which increases the priority of the longest live ranges. By
+# accounting for the number of reserved registers in vreg_128, the
+# heuristic changes end up avoiding a spill of %0.
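+#
+# Rough arithmetic for illustration (approximate, not lifted from the
+# allocator source): each vreg_128 covers 4 VGPRs, so a 24-VGPR budget
+# fits at most 24 / 4 = 6 disjoint vreg_128 values at once, yet bb.1
+# keeps %0-%5 live across several dozen instructions, comfortably past
+# the length-vs-register-count cutoff described above.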
+
+--- |
+
+  define void @use_global_assign() #0 {
+  entry:
+    unreachable
+  }
+
+  attributes #0 = { "amdgpu-waves-per-eu"="10,10" }
+
+...
+---
+name: use_global_assign
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vreg_128, preferred-register: '%0' }
+  - { id: 1, class: vreg_128, preferred-register: '%0' }
+  - { id: 2, class: vreg_128, preferred-register: '%0' }
+  - { id: 3, class: vreg_128, preferred-register: '%0' }
+  - { id: 4, class: vreg_128, preferred-register: '%0' }
+  - { id: 5, class: vreg_128, preferred-register: '%0' }
+  - { id: 6, class: vreg_128, preferred-register: '%0' }
+  - { id: 7, class: vreg_128, preferred-register: '%0' }
+  - { id: 8, class: vreg_128, preferred-register: '%0' }
+  - { id: 9, class: vreg_128, preferred-register: '%0' }
+  - { id: 10, class: vreg_128, preferred-register: '%0' }
+  - { id: 11, class: vreg_128, preferred-register: '%0' }
+  - { id: 12, class: vreg_128, preferred-register: '%0' }
+  - { id: 13, class: vreg_128, preferred-register: '%0' }
+  - { id: 14, class: vreg_128, preferred-register: '%0' }
+  - { id: 15, class: vreg_128, preferred-register: '%0' }
+
+machineFunctionInfo:
+  waveLimiter:     true
+  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  ; CHECK-LABEL: name: use_global_assign
+  ; CHECK: bb.0:
+  ; CHECK:   successors: %bb.1(0x80000000)
+  ; CHECK:   S_NOP 0, implicit-def %0
+  ; CHECK:   S_NOP 0, implicit-def %18
+  ; CHECK:   SI_SPILL_V128_SAVE %18, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5)
+  ; CHECK:   S_NOP 0, implicit-def %35
+  ; CHECK:   S_NOP 0, implicit-def %27
+  ; CHECK:   S_NOP 0, implicit-def %29
+  ; CHECK:   S_NOP 0, implicit-def %31
+  ; CHECK: bb.1:
+  ; CHECK:   successors: %bb.2(0x80000000)
+  ; CHECK:   S_NOP 0, implicit %31
+  ; CHECK:   S_NOP 0, implicit %29
+  ; CHECK:   S_NOP 0, implicit %27
+  ; CHECK:   S_NOP 0, implicit %35
+  ; CHECK:   SI_SPILL_V128_SAVE %35, %stack.1, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.1, align 4, addrspace 5)
+  ; CHECK:   [[SI_SPILL_V128_RESTORE:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
+  ; CHECK:   S_NOP 0, implicit [[SI_SPILL_V128_RESTORE]]
+  ; CHECK:   S_NOP 0, implicit %0
+  ; CHECK:   S_NOP 0, implicit-def %10
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0, implicit %0
+  ; CHECK:   S_NOP 0, implicit-def %33
+  ; CHECK:   SI_SPILL_V128_SAVE %33, %stack.2, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.2, align 4, addrspace 5)
+  ; CHECK:   S_NOP 0, implicit %10
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0, implicit-def %40
+  ; CHECK:   SI_SPILL_V128_SAVE %40, %stack.4, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.4, align 4, addrspace 5)
+  ; CHECK:   S_NOP 0, implicit %33
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0, implicit-def %42
+  ; CHECK:   SI_SPILL_V128_SAVE %42, %stack.3, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.3, align 4, addrspace 5)
+  ; CHECK:   S_NOP 0, implicit %40
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   S_NOP 0
+  ; CHECK:   [[COPY:%[0-9]+]]:vreg_128 = COPY %31
+  ; CHECK:   S_NOP 0, implicit %31
+  ; CHECK:   [[COPY1:%[0-9]+]]:vreg_128 = COPY %29
+  ; CHECK:   S_NOP 0, implicit %29
+  ; CHECK:   [[COPY2:%[0-9]+]]:vreg_128 = COPY %27
+  ; CHECK:   S_NOP 0, implicit %27
+  ; CHECK:   [[SI_SPILL_V128_RESTORE1:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.1, align 4, addrspace 5)
+  ; CHECK:   [[COPY3:%[0-9]+]]:vreg_128 = COPY [[SI_SPILL_V128_RESTORE1]]
+  ; CHECK:   S_NOP 0, implicit [[SI_SPILL_V128_RESTORE1]]
+  ; CHECK:   [[SI_SPILL_V128_RESTORE2:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
+  ; CHECK:   S_NOP 0, implicit [[SI_SPILL_V128_RESTORE2]]
+  ; CHECK:   S_NOP 0, implicit %0
+  ; CHECK:   [[SI_SPILL_V128_RESTORE3:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5)
+  ; CHECK:   S_NOP 0, implicit [[SI_SPILL_V128_RESTORE3]]
+  ; CHECK:   [[SI_SPILL_V128_RESTORE4:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5)
+  ; CHECK:   S_NOP 0, implicit [[SI_SPILL_V128_RESTORE4]]
+  ; CHECK:   [[SI_SPILL_V128_RESTORE5:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.3, align 4, addrspace 5)
+  ; CHECK:   S_NOP 0, implicit [[SI_SPILL_V128_RESTORE5]]
+  ; CHECK: bb.2:
+  ; CHECK:   S_NOP 0, implicit %0
+  ; CHECK:   [[SI_SPILL_V128_RESTORE6:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
+  ; CHECK:   S_NOP 0, implicit [[SI_SPILL_V128_RESTORE6]]
+  ; CHECK:   S_NOP 0, implicit [[COPY3]]
+  ; CHECK:   S_NOP 0, implicit [[COPY2]]
+  ; CHECK:   S_NOP 0, implicit [[COPY1]]
+  ; CHECK:   S_NOP 0, implicit [[COPY]]
+  bb.0:
+    S_NOP 0, implicit-def %0:vreg_128
+    S_NOP 0, implicit-def %1:vreg_128
+    S_NOP 0, implicit-def %2:vreg_128
+    S_NOP 0, implicit-def %3:vreg_128
+    S_NOP 0, implicit-def %4:vreg_128
+    S_NOP 0, implicit-def %5:vreg_128
+
+  bb.1:
+    S_NOP 0, implicit %5
+    S_NOP 0, implicit %4
+    S_NOP 0, implicit %3
+    S_NOP 0, implicit %2
+    S_NOP 0, implicit %1
+    S_NOP 0, implicit %0
+    S_NOP 0, implicit-def %10:vreg_128
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0, implicit %0
+    S_NOP 0, implicit-def %11:vreg_128
+    S_NOP 0, implicit %10
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0, implicit-def %12:vreg_128
+    S_NOP 0, implicit %11
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0, implicit-def %13:vreg_128
+    S_NOP 0, implicit %12
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0, implicit %5
+    S_NOP 0, implicit %4
+    S_NOP 0, implicit %3
+    S_NOP 0, implicit %2
+    S_NOP 0, implicit %1
+    S_NOP 0, implicit %0
+    S_NOP 0, implicit %11
+    S_NOP 0, implicit %12
+    S_NOP 0, implicit %13
+
+  bb.2:
+    S_NOP 0, implicit %0
+    S_NOP 0, implicit %1
+    S_NOP 0, implicit %2
+    S_NOP 0, implicit %3
+    S_NOP 0, implicit %4
+    S_NOP 0, implicit %5
+
+...

diff  --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
index 3e0fae8b29764..25575819c8026 100644
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -1331,90 +1331,92 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(<16 x double> addrspa
 ; SI-NEXT:    v_mov_b32_e32 v1, s3
 ; SI-NEXT:    s_add_u32 s2, s2, 16
 ; SI-NEXT:    s_addc_u32 s3, s3, 0
-; SI-NEXT:    v_mov_b32_e32 v5, s3
-; SI-NEXT:    v_mov_b32_e32 v4, s2
-; SI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; SI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
+; SI-NEXT:    v_mov_b32_e32 v2, s2
+; SI-NEXT:    v_mov_b32_e32 v3, s3
+; SI-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; SI-NEXT:    flat_load_dwordx4 v[0:3], v[2:3]
 ; SI-NEXT:    s_add_u32 s2, s0, 48
 ; SI-NEXT:    s_addc_u32 s3, s1, 0
-; SI-NEXT:    v_mov_b32_e32 v15, s3
-; SI-NEXT:    v_mov_b32_e32 v14, s2
+; SI-NEXT:    v_mov_b32_e32 v14, s3
+; SI-NEXT:    v_mov_b32_e32 v13, s2
 ; SI-NEXT:    s_add_u32 s2, s0, 32
 ; SI-NEXT:    s_addc_u32 s3, s1, 0
-; SI-NEXT:    v_mov_b32_e32 v17, s3
-; SI-NEXT:    v_mov_b32_e32 v16, s2
+; SI-NEXT:    v_mov_b32_e32 v16, s3
+; SI-NEXT:    v_mov_b32_e32 v15, s2
 ; SI-NEXT:    s_add_u32 s2, s0, 16
 ; SI-NEXT:    s_addc_u32 s3, s1, 0
-; SI-NEXT:    v_mov_b32_e32 v19, s3
-; SI-NEXT:    v_mov_b32_e32 v18, s2
+; SI-NEXT:    v_mov_b32_e32 v18, s3
+; SI-NEXT:    v_mov_b32_e32 v17, s2
 ; SI-NEXT:    s_add_u32 s2, s0, 0x70
 ; SI-NEXT:    s_addc_u32 s3, s1, 0
-; SI-NEXT:    v_mov_b32_e32 v13, s1
-; SI-NEXT:    v_mov_b32_e32 v12, s0
+; SI-NEXT:    v_mov_b32_e32 v12, s1
+; SI-NEXT:    v_mov_b32_e32 v11, s0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v7
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v8
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v20, 16, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v21, v5
-; SI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v3
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT:    flat_store_dwordx4 v[14:15], v[8:11]
+; SI-NEXT:    v_cvt_f32_f16_e32 v19, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v20, 16, v0
+; SI-NEXT:    v_cvt_f64_f32_e32 v[7:8], v7
+; SI-NEXT:    v_cvt_f64_f32_e32 v[9:10], v9
+; SI-NEXT:    flat_store_dwordx4 v[13:14], v[7:10]
 ; SI-NEXT:    s_nop 0
-; SI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v2
-; SI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v3
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
-; SI-NEXT:    v_mov_b32_e32 v15, s3
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v0
-; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
-; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
+; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v7
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v21, v0
+; SI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
+; SI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
+; SI-NEXT:    flat_store_dwordx4 v[15:16], v[6:9]
+; SI-NEXT:    s_nop 0
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v4
+; SI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
+; SI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v7
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT:    flat_store_dwordx4 v[18:19], v[0:3]
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v6
+; SI-NEXT:    flat_store_dwordx4 v[17:18], v[4:7]
+; SI-NEXT:    s_nop 0
+; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v6, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, v1
 ; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v9
 ; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v8
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, v10
-; SI-NEXT:    v_mov_b32_e32 v14, s2
+; SI-NEXT:    v_mov_b32_e32 v14, s3
+; SI-NEXT:    v_mov_b32_e32 v13, s2
 ; SI-NEXT:    s_add_u32 s2, s0, 0x60
-; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, v11
+; SI-NEXT:    v_cvt_f32_f16_e32 v10, v4
 ; SI-NEXT:    s_addc_u32 s3, s1, 0
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; SI-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
-; SI-NEXT:    v_mov_b32_e32 v17, s3
-; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v7
-; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v8
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v20
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT:    flat_store_dwordx4 v[11:12], v[0:3]
 ; SI-NEXT:    v_cvt_f32_f16_e32 v12, v5
-; SI-NEXT:    v_mov_b32_e32 v16, s2
+; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v19
+; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v8
+; SI-NEXT:    v_mov_b32_e32 v16, s3
+; SI-NEXT:    v_cvt_f32_f16_e32 v19, v20
+; SI-NEXT:    v_mov_b32_e32 v15, s2
 ; SI-NEXT:    s_add_u32 s2, s0, 0x50
 ; SI-NEXT:    s_addc_u32 s3, s1, 0
 ; SI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v6
 ; SI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
 ; SI-NEXT:    s_add_u32 s0, s0, 64
-; SI-NEXT:    flat_store_dwordx4 v[14:15], v[0:3]
+; SI-NEXT:    flat_store_dwordx4 v[13:14], v[0:3]
 ; SI-NEXT:    s_addc_u32 s1, s1, 0
-; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v21
-; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v7
-; SI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
-; SI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v12
-; SI-NEXT:    v_mov_b32_e32 v19, s3
+; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v7
+; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v12
+; SI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v21
+; SI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v19
+; SI-NEXT:    v_mov_b32_e32 v18, s3
 ; SI-NEXT:    v_mov_b32_e32 v13, s1
-; SI-NEXT:    v_mov_b32_e32 v18, s2
+; SI-NEXT:    v_mov_b32_e32 v17, s2
 ; SI-NEXT:    v_mov_b32_e32 v12, s0
-; SI-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
-; SI-NEXT:    flat_store_dwordx4 v[18:19], v[0:3]
+; SI-NEXT:    flat_store_dwordx4 v[15:16], v[8:11]
+; SI-NEXT:    flat_store_dwordx4 v[17:18], v[0:3]
 ; SI-NEXT:    flat_store_dwordx4 v[12:13], v[4:7]
 ; SI-NEXT:    s_endpgm
 ;
@@ -1426,84 +1428,85 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(<16 x double> addrspa
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    s_add_u32 s2, s2, 16
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v5, s3
-; VI-NEXT:    v_mov_b32_e32 v4, s2
-; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; VI-NEXT:    flat_load_dwordx4 v[0:3], v[2:3]
 ; VI-NEXT:    s_add_u32 s2, s0, 48
 ; VI-NEXT:    s_addc_u32 s3, s1, 0
-; VI-NEXT:    v_mov_b32_e32 v15, s3
-; VI-NEXT:    v_mov_b32_e32 v14, s2
+; VI-NEXT:    v_mov_b32_e32 v14, s3
+; VI-NEXT:    v_mov_b32_e32 v13, s2
 ; VI-NEXT:    s_add_u32 s2, s0, 32
 ; VI-NEXT:    s_addc_u32 s3, s1, 0
-; VI-NEXT:    v_mov_b32_e32 v17, s3
-; VI-NEXT:    v_mov_b32_e32 v16, s2
+; VI-NEXT:    v_mov_b32_e32 v16, s3
+; VI-NEXT:    v_mov_b32_e32 v15, s2
 ; VI-NEXT:    s_add_u32 s2, s0, 16
 ; VI-NEXT:    s_addc_u32 s3, s1, 0
-; VI-NEXT:    v_mov_b32_e32 v19, s3
-; VI-NEXT:    v_mov_b32_e32 v18, s2
+; VI-NEXT:    v_mov_b32_e32 v18, s3
+; VI-NEXT:    v_mov_b32_e32 v17, s2
 ; VI-NEXT:    s_add_u32 s2, s0, 0x70
-; VI-NEXT:    v_mov_b32_e32 v13, s1
+; VI-NEXT:    v_mov_b32_e32 v12, s1
 ; VI-NEXT:    s_addc_u32 s3, s1, 0
-; VI-NEXT:    v_mov_b32_e32 v12, s0
+; VI-NEXT:    v_mov_b32_e32 v11, s0
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_cvt_f32_f16_e32 v8, v7
+; VI-NEXT:    v_cvt_f32_f16_sdwa v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT:    v_cvt_f64_f32_e32 v[7:8], v8
+; VI-NEXT:    v_cvt_f64_f32_e32 v[9:10], v9
+; VI-NEXT:    flat_store_dwordx4 v[13:14], v[7:10]
+; VI-NEXT:    s_nop 0
+; VI-NEXT:    v_cvt_f32_f16_e32 v7, v6
+; VI-NEXT:    v_cvt_f32_f16_sdwa v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_cvt_f32_f16_e32 v8, v3
-; VI-NEXT:    v_cvt_f32_f16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT:    v_cvt_f32_f16_e32 v10, v0
+; VI-NEXT:    v_mov_b32_e32 v14, s3
+; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
 ; VI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
-; VI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v3
-; VI-NEXT:    v_cvt_f32_f16_e32 v3, v2
-; VI-NEXT:    v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT:    flat_store_dwordx4 v[14:15], v[8:11]
+; VI-NEXT:    flat_store_dwordx4 v[15:16], v[6:9]
 ; VI-NEXT:    s_nop 0
-; VI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v3
-; VI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v2
-; VI-NEXT:    v_cvt_f32_f16_e32 v2, v1
-; VI-NEXT:    v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
+; VI-NEXT:    v_cvt_f32_f16_e32 v6, v5
+; VI-NEXT:    v_cvt_f32_f16_sdwa v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT:    v_cvt_f32_f16_e32 v8, v4
+; VI-NEXT:    v_cvt_f32_f16_sdwa v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
+; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
+; VI-NEXT:    flat_store_dwordx4 v[17:18], v[4:7]
 ; VI-NEXT:    s_nop 0
-; VI-NEXT:    v_cvt_f32_f16_e32 v8, v0
-; VI-NEXT:    v_cvt_f32_f16_sdwa v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
-; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
-; VI-NEXT:    flat_store_dwordx4 v[18:19], v[0:3]
-; VI-NEXT:    s_waitcnt vmcnt(3)
-; VI-NEXT:    v_cvt_f32_f16_e32 v10, v4
-; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v8
-; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v9
-; VI-NEXT:    v_cvt_f32_f16_sdwa v18, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT:    v_cvt_f32_f16_e32 v4, v7
-; VI-NEXT:    v_cvt_f32_f16_sdwa v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT:    v_mov_b32_e32 v15, s3
-; VI-NEXT:    v_mov_b32_e32 v14, s2
+; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v8
+; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v9
+; VI-NEXT:    v_cvt_f32_f16_sdwa v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT:    v_cvt_f32_f16_sdwa v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT:    v_cvt_f32_f16_e32 v0, v3
+; VI-NEXT:    v_mov_b32_e32 v13, s2
 ; VI-NEXT:    s_add_u32 s2, s0, 0x60
-; VI-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
-; VI-NEXT:    v_cvt_f32_f16_e32 v8, v5
-; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v4
-; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v7
-; VI-NEXT:    v_cvt_f32_f16_e32 v7, v6
-; VI-NEXT:    v_cvt_f32_f16_sdwa v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT:    flat_store_dwordx4 v[11:12], v[4:7]
 ; VI-NEXT:    s_addc_u32 s3, s1, 0
-; VI-NEXT:    v_cvt_f32_f16_sdwa v12, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT:    v_mov_b32_e32 v17, s3
-; VI-NEXT:    v_mov_b32_e32 v16, s2
+; VI-NEXT:    v_cvt_f64_f32_e32 v[5:6], v8
+; VI-NEXT:    v_cvt_f32_f16_e32 v8, v2
+; VI-NEXT:    v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT:    v_cvt_f32_f16_e32 v7, v1
+; VI-NEXT:    v_cvt_f32_f16_sdwa v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT:    v_cvt_f64_f32_e32 v[3:4], v0
+; VI-NEXT:    v_mov_b32_e32 v16, s3
+; VI-NEXT:    v_mov_b32_e32 v15, s2
 ; VI-NEXT:    s_add_u32 s2, s0, 0x50
 ; VI-NEXT:    s_addc_u32 s3, s1, 0
-; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v10
-; VI-NEXT:    flat_store_dwordx4 v[14:15], v[0:3]
-; VI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v6
-; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v8
-; VI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v7
+; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v10
+; VI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
+; VI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v2
 ; VI-NEXT:    s_add_u32 s0, s0, 64
-; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v12
+; VI-NEXT:    flat_store_dwordx4 v[13:14], v[3:6]
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
-; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v18
-; VI-NEXT:    v_mov_b32_e32 v21, s3
+; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v7
+; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v12
+; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v17
+; VI-NEXT:    v_mov_b32_e32 v20, s3
 ; VI-NEXT:    v_mov_b32_e32 v13, s1
-; VI-NEXT:    v_mov_b32_e32 v20, s2
+; VI-NEXT:    v_mov_b32_e32 v19, s2
 ; VI-NEXT:    v_mov_b32_e32 v12, s0
-; VI-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
-; VI-NEXT:    flat_store_dwordx4 v[20:21], v[0:3]
-; VI-NEXT:    flat_store_dwordx4 v[12:13], v[4:7]
+; VI-NEXT:    flat_store_dwordx4 v[15:16], v[8:11]
+; VI-NEXT:    flat_store_dwordx4 v[19:20], v[4:7]
+; VI-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
 ; VI-NEXT:    s_endpgm
   %val = load <16 x half>, <16 x half> addrspace(1)* %in
   %cvt = fpext <16 x half> %val to <16 x double>

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
index f6568f98485c1..e68b93bed96fe 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
@@ -412,21 +412,21 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x19
 ; SI-NEXT:    s_mov_b32 s22, -1
-; SI-NEXT:    s_movk_i32 s23, 0xfc01
+; SI-NEXT:    s_movk_i32 s28, 0xfc01
 ; SI-NEXT:    s_mov_b32 s21, 0xfffff
 ; SI-NEXT:    s_mov_b32 s20, s22
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_bfe_u32 s2, s7, 0xb0014
-; SI-NEXT:    s_add_i32 s26, s2, s23
-; SI-NEXT:    s_lshr_b64 s[2:3], s[20:21], s26
-; SI-NEXT:    s_brev_b32 s28, 1
+; SI-NEXT:    s_add_i32 s23, s2, s28
+; SI-NEXT:    s_lshr_b64 s[2:3], s[20:21], s23
+; SI-NEXT:    s_brev_b32 s29, 1
 ; SI-NEXT:    s_andn2_b64 s[24:25], s[6:7], s[2:3]
-; SI-NEXT:    s_and_b32 s2, s7, s28
-; SI-NEXT:    s_cmp_lt_i32 s26, 0
+; SI-NEXT:    s_and_b32 s2, s7, s29
+; SI-NEXT:    s_cmp_lt_i32 s23, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s25
 ; SI-NEXT:    v_mov_b32_e32 v1, s2
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    s_cmp_gt_i32 s26, 51
+; SI-NEXT:    s_cmp_gt_i32 s23, 51
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SI-NEXT:    v_mov_b32_e32 v1, s7
 ; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
@@ -437,15 +437,15 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou
 ; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[2:3]
 ; SI-NEXT:    v_add_f64 v[2:3], s[6:7], -v[0:1]
 ; SI-NEXT:    s_bfe_u32 s2, s5, 0xb0014
-; SI-NEXT:    s_add_i32 s24, s2, s23
-; SI-NEXT:    s_brev_b32 s29, -2
-; SI-NEXT:    v_mov_b32_e32 v14, 0x3ff00000
+; SI-NEXT:    s_add_i32 s24, s2, s28
+; SI-NEXT:    s_brev_b32 s23, -2
+; SI-NEXT:    v_mov_b32_e32 v8, 0x3ff00000
 ; SI-NEXT:    v_mov_b32_e32 v4, s7
 ; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
 ; SI-NEXT:    s_lshr_b64 s[2:3], s[20:21], s24
-; SI-NEXT:    v_bfi_b32 v4, s29, v14, v4
+; SI-NEXT:    v_bfi_b32 v4, s23, v8, v4
 ; SI-NEXT:    s_andn2_b64 s[6:7], s[4:5], s[2:3]
-; SI-NEXT:    s_and_b32 s2, s5, s28
+; SI-NEXT:    s_and_b32 s2, s5, s29
 ; SI-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
 ; SI-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-NEXT:    s_cmp_lt_i32 s24, 0
@@ -464,13 +464,13 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou
 ; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[2:3]
 ; SI-NEXT:    v_add_f64 v[4:5], s[4:5], -v[0:1]
 ; SI-NEXT:    s_bfe_u32 s2, s11, 0xb0014
-; SI-NEXT:    s_add_i32 s6, s2, s23
+; SI-NEXT:    s_add_i32 s6, s2, s28
 ; SI-NEXT:    v_mov_b32_e32 v6, s5
 ; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
 ; SI-NEXT:    s_lshr_b64 s[2:3], s[20:21], s6
-; SI-NEXT:    v_bfi_b32 v6, s29, v14, v6
+; SI-NEXT:    v_bfi_b32 v6, s23, v8, v6
 ; SI-NEXT:    s_andn2_b64 s[4:5], s[10:11], s[2:3]
-; SI-NEXT:    s_and_b32 s2, s11, s28
+; SI-NEXT:    s_and_b32 s2, s11, s29
 ; SI-NEXT:    v_cndmask_b32_e32 v5, 0, v6, vcc
 ; SI-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-NEXT:    s_cmp_lt_i32 s6, 0
@@ -489,14 +489,14 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou
 ; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[2:3]
 ; SI-NEXT:    v_add_f64 v[6:7], s[10:11], -v[4:5]
 ; SI-NEXT:    s_bfe_u32 s2, s9, 0xb0014
-; SI-NEXT:    s_add_i32 s6, s2, s23
-; SI-NEXT:    v_mov_b32_e32 v8, s11
+; SI-NEXT:    s_add_i32 s6, s2, s28
+; SI-NEXT:    v_mov_b32_e32 v9, s11
 ; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
 ; SI-NEXT:    s_lshr_b64 s[2:3], s[20:21], s6
-; SI-NEXT:    v_bfi_b32 v8, s29, v14, v8
+; SI-NEXT:    v_bfi_b32 v9, s23, v8, v9
 ; SI-NEXT:    s_andn2_b64 s[4:5], s[8:9], s[2:3]
-; SI-NEXT:    s_and_b32 s2, s9, s28
-; SI-NEXT:    v_cndmask_b32_e32 v7, 0, v8, vcc
+; SI-NEXT:    s_and_b32 s2, s9, s29
+; SI-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc
 ; SI-NEXT:    v_mov_b32_e32 v6, 0
 ; SI-NEXT:    s_cmp_lt_i32 s6, 0
 ; SI-NEXT:    v_add_f64 v[6:7], v[4:5], v[6:7]
@@ -510,120 +510,120 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou
 ; SI-NEXT:    v_cndmask_b32_e64 v5, v4, v5, s[2:3]
 ; SI-NEXT:    v_mov_b32_e32 v4, s4
 ; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v8, s8
-; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v8, s[2:3]
+; SI-NEXT:    v_mov_b32_e32 v9, s8
+; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v9, s[2:3]
 ; SI-NEXT:    s_bfe_u32 s2, s15, 0xb0014
-; SI-NEXT:    v_add_f64 v[8:9], s[8:9], -v[4:5]
-; SI-NEXT:    s_add_i32 s4, s2, s23
+; SI-NEXT:    v_add_f64 v[9:10], s[8:9], -v[4:5]
+; SI-NEXT:    s_add_i32 s4, s2, s28
 ; SI-NEXT:    s_lshr_b64 s[2:3], s[20:21], s4
-; SI-NEXT:    v_mov_b32_e32 v10, s9
-; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[8:9]|, 0.5
+; SI-NEXT:    v_mov_b32_e32 v11, s9
+; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[9:10]|, 0.5
 ; SI-NEXT:    s_andn2_b64 s[24:25], s[14:15], s[2:3]
-; SI-NEXT:    s_and_b32 s2, s15, s28
-; SI-NEXT:    v_bfi_b32 v10, s29, v14, v10
+; SI-NEXT:    s_and_b32 s2, s15, s29
+; SI-NEXT:    v_bfi_b32 v11, s23, v8, v11
 ; SI-NEXT:    s_cmp_lt_i32 s4, 0
-; SI-NEXT:    v_cndmask_b32_e32 v9, 0, v10, vcc
-; SI-NEXT:    v_mov_b32_e32 v8, 0
+; SI-NEXT:    v_cndmask_b32_e32 v10, 0, v11, vcc
+; SI-NEXT:    v_mov_b32_e32 v9, 0
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_gt_i32 s4, 51
-; SI-NEXT:    v_add_f64 v[4:5], v[4:5], v[8:9]
-; SI-NEXT:    v_mov_b32_e32 v9, s2
+; SI-NEXT:    v_add_f64 v[4:5], v[4:5], v[9:10]
+; SI-NEXT:    v_mov_b32_e32 v10, s2
 ; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; SI-NEXT:    s_bfe_u32 s4, s13, 0xb0014
-; SI-NEXT:    s_add_i32 s6, s4, s23
+; SI-NEXT:    s_add_i32 s6, s4, s28
 ; SI-NEXT:    s_lshr_b64 s[4:5], s[20:21], s6
 ; SI-NEXT:    s_andn2_b64 s[26:27], s[12:13], s[4:5]
-; SI-NEXT:    s_and_b32 s4, s13, s28
-; SI-NEXT:    v_mov_b32_e32 v8, s25
+; SI-NEXT:    s_and_b32 s4, s13, s29
+; SI-NEXT:    v_mov_b32_e32 v9, s25
 ; SI-NEXT:    s_cmp_lt_i32 s6, 0
-; SI-NEXT:    v_cndmask_b32_e32 v15, v8, v9, vcc
-; SI-NEXT:    v_mov_b32_e32 v9, s4
+; SI-NEXT:    v_cndmask_b32_e32 v15, v9, v10, vcc
+; SI-NEXT:    v_mov_b32_e32 v10, s4
 ; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; SI-NEXT:    s_cmp_gt_i32 s6, 51
 ; SI-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; SI-NEXT:    s_bfe_u32 s8, s19, 0xb0014
-; SI-NEXT:    s_add_i32 s25, s8, s23
+; SI-NEXT:    s_add_i32 s25, s8, s28
 ; SI-NEXT:    s_lshr_b64 s[8:9], s[20:21], s25
 ; SI-NEXT:    s_andn2_b64 s[10:11], s[18:19], s[8:9]
-; SI-NEXT:    s_and_b32 s8, s19, s28
-; SI-NEXT:    v_mov_b32_e32 v8, s27
+; SI-NEXT:    s_and_b32 s8, s19, s29
+; SI-NEXT:    v_mov_b32_e32 v9, s27
 ; SI-NEXT:    s_cmp_lt_i32 s25, 0
-; SI-NEXT:    v_cndmask_b32_e64 v17, v8, v9, s[4:5]
-; SI-NEXT:    v_mov_b32_e32 v8, s11
-; SI-NEXT:    v_mov_b32_e32 v9, s8
+; SI-NEXT:    v_cndmask_b32_e64 v17, v9, v10, s[4:5]
+; SI-NEXT:    v_mov_b32_e32 v9, s11
+; SI-NEXT:    v_mov_b32_e32 v10, s8
 ; SI-NEXT:    s_cselect_b64 s[8:9], -1, 0
 ; SI-NEXT:    s_cmp_gt_i32 s25, 51
-; SI-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[8:9]
-; SI-NEXT:    v_mov_b32_e32 v9, s19
-; SI-NEXT:    v_mov_b32_e32 v10, s10
+; SI-NEXT:    v_cndmask_b32_e64 v9, v9, v10, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v10, s19
+; SI-NEXT:    v_mov_b32_e32 v11, s10
 ; SI-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; SI-NEXT:    v_cndmask_b32_e64 v9, v8, v9, s[10:11]
-; SI-NEXT:    v_cndmask_b32_e64 v8, v10, 0, s[8:9]
-; SI-NEXT:    v_mov_b32_e32 v10, s18
+; SI-NEXT:    v_cndmask_b32_e64 v10, v9, v10, s[10:11]
+; SI-NEXT:    v_cndmask_b32_e64 v9, v11, 0, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v11, s18
 ; SI-NEXT:    s_bfe_u32 s8, s17, 0xb0014
-; SI-NEXT:    v_cndmask_b32_e64 v8, v8, v10, s[10:11]
-; SI-NEXT:    s_add_i32 s10, s8, s23
+; SI-NEXT:    v_cndmask_b32_e64 v9, v9, v11, s[10:11]
+; SI-NEXT:    s_add_i32 s10, s8, s28
 ; SI-NEXT:    s_lshr_b64 s[8:9], s[20:21], s10
 ; SI-NEXT:    s_andn2_b64 s[20:21], s[16:17], s[8:9]
-; SI-NEXT:    s_and_b32 s8, s17, s28
+; SI-NEXT:    s_and_b32 s8, s17, s29
 ; SI-NEXT:    s_cmp_lt_i32 s10, 0
-; SI-NEXT:    v_mov_b32_e32 v10, s21
-; SI-NEXT:    v_mov_b32_e32 v11, s8
+; SI-NEXT:    v_mov_b32_e32 v11, s21
+; SI-NEXT:    v_mov_b32_e32 v12, s8
 ; SI-NEXT:    s_cselect_b64 s[8:9], -1, 0
 ; SI-NEXT:    s_cmp_gt_i32 s10, 51
-; SI-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s[8:9]
-; SI-NEXT:    v_mov_b32_e32 v11, s17
+; SI-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v12, s17
 ; SI-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; SI-NEXT:    v_cndmask_b32_e64 v13, v10, v11, s[10:11]
-; SI-NEXT:    v_mov_b32_e32 v10, s20
-; SI-NEXT:    v_cndmask_b32_e64 v10, v10, 0, s[8:9]
-; SI-NEXT:    v_mov_b32_e32 v11, s16
-; SI-NEXT:    v_cndmask_b32_e64 v12, v10, v11, s[10:11]
-; SI-NEXT:    v_add_f64 v[10:11], s[16:17], -v[12:13]
+; SI-NEXT:    v_cndmask_b32_e64 v14, v11, v12, s[10:11]
+; SI-NEXT:    v_mov_b32_e32 v11, s20
+; SI-NEXT:    v_cndmask_b32_e64 v11, v11, 0, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v12, s16
+; SI-NEXT:    v_cndmask_b32_e64 v13, v11, v12, s[10:11]
+; SI-NEXT:    v_add_f64 v[11:12], s[16:17], -v[13:14]
 ; SI-NEXT:    v_mov_b32_e32 v19, s17
-; SI-NEXT:    v_cmp_ge_f64_e64 s[8:9], |v[10:11]|, 0.5
-; SI-NEXT:    v_mov_b32_e32 v10, s19
-; SI-NEXT:    v_bfi_b32 v20, s29, v14, v10
-; SI-NEXT:    v_add_f64 v[10:11], s[18:19], -v[8:9]
-; SI-NEXT:    v_bfi_b32 v19, s29, v14, v19
-; SI-NEXT:    v_cmp_ge_f64_e64 s[10:11], |v[10:11]|, 0.5
-; SI-NEXT:    v_mov_b32_e32 v10, 0
-; SI-NEXT:    v_cndmask_b32_e64 v11, 0, v20, s[10:11]
-; SI-NEXT:    v_add_f64 v[10:11], v[8:9], v[10:11]
-; SI-NEXT:    v_cndmask_b32_e64 v9, 0, v19, s[8:9]
-; SI-NEXT:    v_mov_b32_e32 v8, 0
+; SI-NEXT:    v_cmp_ge_f64_e64 s[8:9], |v[11:12]|, 0.5
+; SI-NEXT:    v_mov_b32_e32 v11, s19
+; SI-NEXT:    v_bfi_b32 v20, s23, v8, v11
+; SI-NEXT:    v_add_f64 v[11:12], s[18:19], -v[9:10]
+; SI-NEXT:    v_bfi_b32 v19, s23, v8, v19
+; SI-NEXT:    v_cmp_ge_f64_e64 s[10:11], |v[11:12]|, 0.5
+; SI-NEXT:    v_mov_b32_e32 v11, 0
+; SI-NEXT:    v_cndmask_b32_e64 v12, 0, v20, s[10:11]
+; SI-NEXT:    v_add_f64 v[11:12], v[9:10], v[11:12]
+; SI-NEXT:    v_cndmask_b32_e64 v10, 0, v19, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v9, 0
 ; SI-NEXT:    v_mov_b32_e32 v16, s15
-; SI-NEXT:    v_add_f64 v[8:9], v[12:13], v[8:9]
-; SI-NEXT:    v_mov_b32_e32 v12, s24
-; SI-NEXT:    v_cndmask_b32_e64 v13, v15, v16, s[2:3]
-; SI-NEXT:    v_cndmask_b32_e64 v12, v12, 0, vcc
+; SI-NEXT:    v_add_f64 v[9:10], v[13:14], v[9:10]
+; SI-NEXT:    v_mov_b32_e32 v13, s24
+; SI-NEXT:    v_cndmask_b32_e64 v14, v15, v16, s[2:3]
+; SI-NEXT:    v_cndmask_b32_e64 v13, v13, 0, vcc
 ; SI-NEXT:    v_mov_b32_e32 v15, s14
-; SI-NEXT:    v_cndmask_b32_e64 v12, v12, v15, s[2:3]
+; SI-NEXT:    v_cndmask_b32_e64 v13, v13, v15, s[2:3]
 ; SI-NEXT:    v_mov_b32_e32 v15, s15
-; SI-NEXT:    v_bfi_b32 v19, s29, v14, v15
+; SI-NEXT:    v_bfi_b32 v19, s23, v8, v15
 ; SI-NEXT:    v_mov_b32_e32 v15, s26
 ; SI-NEXT:    v_mov_b32_e32 v18, s13
 ; SI-NEXT:    v_cndmask_b32_e64 v15, v15, 0, s[4:5]
 ; SI-NEXT:    v_mov_b32_e32 v16, s12
-; SI-NEXT:    v_cndmask_b32_e64 v17, v17, v18, s[6:7]
-; SI-NEXT:    v_cndmask_b32_e64 v16, v15, v16, s[6:7]
+; SI-NEXT:    v_cndmask_b32_e64 v18, v17, v18, s[6:7]
+; SI-NEXT:    v_cndmask_b32_e64 v17, v15, v16, s[6:7]
 ; SI-NEXT:    v_mov_b32_e32 v15, s13
-; SI-NEXT:    v_bfi_b32 v18, s29, v14, v15
-; SI-NEXT:    v_add_f64 v[14:15], s[12:13], -v[16:17]
+; SI-NEXT:    v_bfi_b32 v8, s23, v8, v15
+; SI-NEXT:    v_add_f64 v[15:16], s[12:13], -v[17:18]
 ; SI-NEXT:    s_load_dwordx2 s[20:21], s[0:1], 0x9
-; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5
-; SI-NEXT:    v_add_f64 v[14:15], s[14:15], -v[12:13]
+; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[15:16]|, 0.5
+; SI-NEXT:    v_add_f64 v[15:16], s[14:15], -v[13:14]
 ; SI-NEXT:    s_mov_b32 s23, 0xf000
-; SI-NEXT:    v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5
-; SI-NEXT:    v_mov_b32_e32 v14, 0
-; SI-NEXT:    v_cndmask_b32_e64 v15, 0, v19, s[0:1]
-; SI-NEXT:    v_add_f64 v[14:15], v[12:13], v[14:15]
-; SI-NEXT:    v_cndmask_b32_e32 v13, 0, v18, vcc
-; SI-NEXT:    v_mov_b32_e32 v12, 0
-; SI-NEXT:    v_add_f64 v[12:13], v[16:17], v[12:13]
+; SI-NEXT:    v_cmp_ge_f64_e64 s[0:1], |v[15:16]|, 0.5
+; SI-NEXT:    v_mov_b32_e32 v15, 0
+; SI-NEXT:    v_cndmask_b32_e64 v16, 0, v19, s[0:1]
+; SI-NEXT:    v_add_f64 v[15:16], v[13:14], v[15:16]
+; SI-NEXT:    v_cndmask_b32_e32 v14, 0, v8, vcc
+; SI-NEXT:    v_mov_b32_e32 v13, 0
+; SI-NEXT:    v_add_f64 v[13:14], v[17:18], v[13:14]
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[20:23], 0 offset:48
-; SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[20:23], 0 offset:32
+; SI-NEXT:    buffer_store_dwordx4 v[9:12], off, s[20:23], 0 offset:48
+; SI-NEXT:    buffer_store_dwordx4 v[13:16], off, s[20:23], 0 offset:32
 ; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[20:23], 0
 ; SI-NEXT:    s_endpgm

diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index f1b7373814c0d..5a3c20f612f0e 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -1669,52 +1669,52 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(<8 x i32> addrspace(
 define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_zextload_v16i16_to_v16i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, 0xf000
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, -1
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s12, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s13, s1, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s14, s0, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s15, s3, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s16, s2, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s17, s5, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s18, s4, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s19, s7, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s20, s6, 16
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s1, s1, s12
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s0, s0, s12
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s3, s3, s12
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s2, s2, s12
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s13, s5, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s14, s4, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s15, s7, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s16, s6, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s17, s9, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s18, s8, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s19, s11, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s20, s10, 16
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, s12
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, s12
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s7, s7, s12
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s6, s6, s12
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, s12
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s9, s9, s12
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s11, s11, s12
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s10, s10, s12
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s8, s8, s12
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s10
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s20
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s7
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s11
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s19
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s18
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s5
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s9
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s17
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s16
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s7
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s15
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s14
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s1
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s5
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s13
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i32:
@@ -1888,51 +1888,51 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspa
 define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_sextload_v16i16_to_v16i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, 0xf000
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, -1
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s12, s1, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s13, s0, 16
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s1, s1
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s0, s0
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s14, s3, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s15, s2, 16
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s3, s3
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s2, s2
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s16, s5, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s17, s4, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s12, s5, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s13, s4, 16
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s5, s5
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s18, s7, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s19, s6, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s4, s4
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s14, s7, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s15, s6, 16
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s7, s7
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s6, s6
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s4, s4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s16, s9, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s17, s8, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s9, s9
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s18, s11, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s19, s10, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s11, s11
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s10, s10
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s8, s8
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s10
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s19
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s7
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s11
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s18
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s17
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s5
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s9
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s16
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s15
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s7
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s14
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s13
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s1
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s5
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s12
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: constant_sextload_v16i16_to_v16i32:
@@ -2700,88 +2700,88 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspa
 ;
 ; GCN-NOHSA-VI-LABEL: constant_sextload_v32i16_to_v32i32:
 ; GCN-NOHSA-VI:       ; %bb.0:
-; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s19, 0xf000
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s18, -1
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s4
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s5
-; GCN-NOHSA-VI-NEXT:    s_load_dwordx16 s[4:19], s[6:7], 0x0
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s16, s0
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s17, s1
+; GCN-NOHSA-VI-NEXT:    s_load_dwordx16 s[0:15], s[2:3], 0x0
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s35, s19, 16
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s36, s18, 16
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s19, s19
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s18, s18
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s33, s17, 16
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s34, s16, 16
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s17, s17
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s16, s16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s18
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s36
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s19
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s35
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s30, s15, 16
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s31, s14, 16
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s35, s15, 16
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s36, s14, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s15, s15
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s14, s14
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s28, s13, 16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s34
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s17
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s33
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s29, s12, 16
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s33, s13, 16
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s34, s12, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s13, s13
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s12, s12
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s26, s11, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s14
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s31
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s36
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s15
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s30
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s27, s10, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s35
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s30, s11, 16
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s31, s10, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s11, s11
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s10, s10
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s24, s9, 16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s28, s9, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s12
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s29
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s34
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s13
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s28
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s25, s8, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s33
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s29, s8, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s9, s9
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s8, s8
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s22, s7, 16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:96
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s26, s7, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s27
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s31
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s11
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s26
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s23, s6, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s30
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s27, s6, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s7, s7
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s6, s6
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s20, s5, 16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:80
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s24, s5, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s25
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s29
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s9
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s24
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s21, s4, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s28
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s25, s4, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s5, s5
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s4, s4
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
-; GCN-NOHSA-VI-NEXT:    s_nop 0
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:64
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s22, s3, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s23
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s27
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s7
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s26
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s23, s2, 16
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s3, s3
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s2, s2
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s20, s1, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s25
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s5
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s24
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s21, s0, 16
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s1, s1
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s0, s0
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32
+; GCN-NOHSA-VI-NEXT:    s_nop 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s23
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s3
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s22
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s21
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s5
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s1
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s20
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; GCN-NOHSA-VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: constant_sextload_v32i16_to_v32i32:
@@ -3066,9 +3066,10 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-HSA-LABEL: constant_zextload_v64i16_to_v64i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x0
-; GCN-HSA-NEXT:    s_mov_b32 s37, 0xffff
+; GCN-HSA-NEXT:    s_mov_b32 s53, 0xffff
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
+; GCN-HSA-NEXT:    s_load_dwordx16 s[36:51], s[18:19], 0x10
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_lshr_b32 s20, s1, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s21, s0, 16
@@ -3085,196 +3086,196 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-HSA-NEXT:    s_lshr_b32 s33, s13, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s34, s12, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s35, s15, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s36, s14, 16
-; GCN-HSA-NEXT:    s_and_b32 s38, s1, s37
-; GCN-HSA-NEXT:    s_and_b32 s39, s0, s37
-; GCN-HSA-NEXT:    s_and_b32 s40, s3, s37
-; GCN-HSA-NEXT:    s_and_b32 s41, s2, s37
-; GCN-HSA-NEXT:    s_and_b32 s42, s5, s37
-; GCN-HSA-NEXT:    s_and_b32 s43, s4, s37
-; GCN-HSA-NEXT:    s_and_b32 s44, s7, s37
-; GCN-HSA-NEXT:    s_and_b32 s45, s6, s37
-; GCN-HSA-NEXT:    s_and_b32 s46, s9, s37
-; GCN-HSA-NEXT:    s_and_b32 s47, s8, s37
-; GCN-HSA-NEXT:    s_and_b32 s48, s11, s37
-; GCN-HSA-NEXT:    s_and_b32 s49, s10, s37
-; GCN-HSA-NEXT:    s_and_b32 s50, s13, s37
-; GCN-HSA-NEXT:    s_and_b32 s51, s12, s37
-; GCN-HSA-NEXT:    s_and_b32 s52, s15, s37
-; GCN-HSA-NEXT:    s_and_b32 s53, s14, s37
-; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x10
-; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT:    s_and_b32 s18, s1, s37
-; GCN-HSA-NEXT:    s_and_b32 s19, s0, s37
-; GCN-HSA-NEXT:    s_and_b32 s54, s3, s37
-; GCN-HSA-NEXT:    s_and_b32 s55, s2, s37
-; GCN-HSA-NEXT:    s_and_b32 s56, s5, s37
-; GCN-HSA-NEXT:    s_and_b32 s57, s4, s37
-; GCN-HSA-NEXT:    s_and_b32 s58, s7, s37
-; GCN-HSA-NEXT:    s_and_b32 s59, s6, s37
-; GCN-HSA-NEXT:    s_and_b32 s60, s9, s37
-; GCN-HSA-NEXT:    s_and_b32 s61, s8, s37
-; GCN-HSA-NEXT:    s_and_b32 s62, s11, s37
-; GCN-HSA-NEXT:    s_and_b32 s63, s10, s37
-; GCN-HSA-NEXT:    s_and_b32 s64, s13, s37
-; GCN-HSA-NEXT:    s_and_b32 s65, s12, s37
-; GCN-HSA-NEXT:    s_and_b32 s66, s15, s37
-; GCN-HSA-NEXT:    s_and_b32 s37, s14, s37
-; GCN-HSA-NEXT:    s_lshr_b32 s67, s1, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s68, s0, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s3, s3, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s2, s2, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s5, s5, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s4, s4, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s7, s7, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s6, s6, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s9, s9, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s8, s8, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s11, s11, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s10, s10, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s13, s13, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s12, s12, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s15, s15, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s14, s14, 16
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xf0
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xe0
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xd0
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xc0
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v29, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xb0
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v31, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v30, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xa0
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v33, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v32, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x90
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s65
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s12
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s64
-; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s13
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    s_lshr_b32 s52, s14, 16
+; GCN-HSA-NEXT:    s_and_b32 s1, s1, s53
+; GCN-HSA-NEXT:    s_and_b32 s0, s0, s53
+; GCN-HSA-NEXT:    s_and_b32 s3, s3, s53
+; GCN-HSA-NEXT:    s_and_b32 s2, s2, s53
+; GCN-HSA-NEXT:    s_and_b32 s5, s5, s53
+; GCN-HSA-NEXT:    s_and_b32 s4, s4, s53
+; GCN-HSA-NEXT:    s_and_b32 s54, s7, s53
+; GCN-HSA-NEXT:    s_and_b32 s55, s6, s53
+; GCN-HSA-NEXT:    s_and_b32 s9, s9, s53
+; GCN-HSA-NEXT:    s_and_b32 s8, s8, s53
+; GCN-HSA-NEXT:    s_and_b32 s11, s11, s53
+; GCN-HSA-NEXT:    s_and_b32 s10, s10, s53
+; GCN-HSA-NEXT:    s_and_b32 s13, s13, s53
+; GCN-HSA-NEXT:    s_and_b32 s12, s12, s53
+; GCN-HSA-NEXT:    s_and_b32 s15, s15, s53
+; GCN-HSA-NEXT:    s_and_b32 s14, s14, s53
+; GCN-HSA-NEXT:    s_and_b32 s18, s37, s53
+; GCN-HSA-NEXT:    s_and_b32 s19, s36, s53
+; GCN-HSA-NEXT:    s_and_b32 s56, s39, s53
+; GCN-HSA-NEXT:    s_and_b32 s57, s38, s53
+; GCN-HSA-NEXT:    s_and_b32 s58, s41, s53
+; GCN-HSA-NEXT:    s_and_b32 s59, s40, s53
+; GCN-HSA-NEXT:    s_and_b32 s60, s43, s53
+; GCN-HSA-NEXT:    s_and_b32 s61, s42, s53
+; GCN-HSA-NEXT:    s_and_b32 s62, s45, s53
+; GCN-HSA-NEXT:    s_and_b32 s63, s44, s53
+; GCN-HSA-NEXT:    s_and_b32 s64, s47, s53
+; GCN-HSA-NEXT:    s_and_b32 s65, s46, s53
+; GCN-HSA-NEXT:    s_and_b32 s66, s49, s53
+; GCN-HSA-NEXT:    s_and_b32 s67, s48, s53
+; GCN-HSA-NEXT:    s_and_b32 s68, s51, s53
+; GCN-HSA-NEXT:    s_and_b32 s53, s50, s53
+; GCN-HSA-NEXT:    s_lshr_b32 s37, s37, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s36, s36, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s39, s39, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s38, s38, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s41, s41, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s40, s40, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s43, s43, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s42, s42, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s45, s45, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s44, s44, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s47, s47, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s46, s46, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s49, s49, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s48, s48, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s51, s51, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s50, s50, 16
+; GCN-HSA-NEXT:    s_add_u32 s6, s16, 0xf0
+; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s6
+; GCN-HSA-NEXT:    s_add_u32 s6, s16, 0xe0
+; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s6
+; GCN-HSA-NEXT:    s_add_u32 s6, s16, 0xd0
+; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s6
+; GCN-HSA-NEXT:    s_add_u32 s6, s16, 0xc0
+; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v29, s7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s6
+; GCN-HSA-NEXT:    s_add_u32 s6, s16, 0xb0
+; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v31, s7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v30, s6
+; GCN-HSA-NEXT:    s_add_u32 s6, s16, 0xa0
+; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v33, s7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v32, s6
+; GCN-HSA-NEXT:    s_add_u32 s6, s16, 0x90
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s67
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s48
+; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s66
+; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s49
+; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[4:7]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x80
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v35, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v34, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x70
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s59
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s6
-; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s58
-; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s7
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s6
+; GCN-HSA-NEXT:    s_add_u32 s6, s16, 0x80
+; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v35, s7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v34, s6
+; GCN-HSA-NEXT:    s_add_u32 s6, s16, 0x70
+; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s61
+; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s42
+; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s60
+; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s43
+; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[30:31], v[16:19]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s37
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x60
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s14
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s66
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s15
-; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x50
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s63
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s10
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s62
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s11
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s61
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s60
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s9
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s57
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s53
+; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s6
+; GCN-HSA-NEXT:    s_add_u32 s6, s16, 0x60
+; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s50
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s68
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s51
+; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s65
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s46
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s64
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s47
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s63
+; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s44
+; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s62
+; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s45
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s59
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s55
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s56
-; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s54
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s40
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s57
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s58
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s41
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s38
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s56
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s19
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s68
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s39
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s36
+; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s6
+; GCN-HSA-NEXT:    s_add_u32 s6, s16, 0x50
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[8:11]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s18
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s53
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s14
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[12:15]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s67
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s51
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s36
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s52
+; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s37
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s12
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s52
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s15
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s35
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s34
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[32:33], v[20:23]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s50
+; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s13
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s33
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[0:3]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[34:35], v[4:7]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[12:15]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 64
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s49
+; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s10
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s31
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s48
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s11
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s30
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s7
+; GCN-HSA-NEXT:    s_add_u32 s6, s16, 64
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 48
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s47
+; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s29
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s46
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s9
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s28
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s7
+; GCN-HSA-NEXT:    s_add_u32 s6, s16, 48
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 32
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s45
+; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s55
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s27
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s44
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s54
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s26
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s7
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s43
+; GCN-HSA-NEXT:    s_nop 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-HSA-NEXT:    s_add_u32 s4, s16, 32
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s5
+; GCN-HSA-NEXT:    s_addc_u32 s5, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s25
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s42
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s24
-; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s41
+; GCN-HSA-NEXT:    s_nop 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s16, 16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-HSA-NEXT:    s_addc_u32 s3, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s23
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s40
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s22
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s39
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s21
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s38
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s1
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s20
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s17
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
@@ -3283,63 +3284,63 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-NOHSA-VI-LABEL: constant_zextload_v64i16_to_v64i32:
 ; GCN-NOHSA-VI:       ; %bb.0:
 ; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[16:19], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s20, 0xffff
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s26, 0xffff
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-VI-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x40
 ; GCN-NOHSA-VI-NEXT:    s_load_dwordx16 s[36:51], s[18:19], 0x0
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s53, s1, s20
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s53, s1, s26
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s18, s37, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s19, s37, s20
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s21, s36, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s22, s36, s20
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s23, s39, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s24, s39, s20
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s25, s38, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s26, s38, s20
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s19, s37, s26
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s20, s36, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s21, s36, s26
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s22, s39, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s23, s39, s26
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s24, s38, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s25, s38, s26
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s27, s41, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s28, s41, s20
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s28, s41, s26
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s29, s40, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s30, s40, s20
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s30, s40, s26
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s31, s43, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s33, s43, s20
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s33, s43, s26
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s34, s42, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s35, s42, s20
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s35, s42, s26
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s36, s45, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s37, s45, s20
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s37, s45, s26
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s38, s44, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s39, s44, s20
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s39, s44, s26
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s40, s47, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s41, s47, s20
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s41, s47, s26
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s42, s46, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s43, s46, s20
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s43, s46, s26
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s44, s49, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s45, s49, s20
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s45, s49, s26
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s46, s48, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s47, s48, s20
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s47, s48, s26
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s48, s51, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s49, s51, s20
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s49, s51, s26
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s51, s50, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s50, s50, s20
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s55, s0, s20
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s57, s3, s20
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s59, s2, s20
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s50, s50, s26
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s55, s0, s26
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s57, s3, s26
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s59, s2, s26
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s60, s5, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s5, s5, s20
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s5, s5, s26
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s61, s4, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s4, s4, s20
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s4, s4, s26
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s62, s7, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s7, s7, s20
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s7, s7, s26
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s63, s6, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s6, s6, s20
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s64, s9, s20
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s65, s8, s20
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s66, s11, s20
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s67, s10, s20
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s68, s13, s20
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s69, s12, s20
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s70, s15, s20
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s20, s14, s20
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s6, s6, s26
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s64, s9, s26
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s65, s8, s26
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s66, s11, s26
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s67, s10, s26
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s68, s13, s26
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s69, s12, s26
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s70, s15, s26
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s26, s14, s26
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s15, s15, 16
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s14, s14, 16
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s52, s1, 16
@@ -3352,7 +3353,7 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, -1
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s16
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s17
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s20
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s26
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s14
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s70
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s15
@@ -3439,14 +3440,14 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s27
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s26
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s25
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s24
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s23
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s25
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s24
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s23
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s22
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s22
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s21
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s21
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s20
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s19
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s18
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
@@ -3468,19 +3469,19 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T56.XYZW, T58.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T54.XYZW, T52.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T53.XYZW, T55.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T37.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T39.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T47.XYZW, T48.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T38.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T40.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T44.XYZW, T46.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T39.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T41.XYZW, T43.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T36.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T41.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T43.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T36.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 22:
 ; EG-NEXT:     VTX_READ_128 T36.XYZW, T35.X, 0, #1
-; EG-NEXT:     VTX_READ_128 T37.XYZW, T35.X, 48, #1
-; EG-NEXT:     VTX_READ_128 T38.XYZW, T35.X, 32, #1
-; EG-NEXT:     VTX_READ_128 T39.XYZW, T35.X, 16, #1
+; EG-NEXT:     VTX_READ_128 T39.XYZW, T35.X, 48, #1
+; EG-NEXT:     VTX_READ_128 T40.XYZW, T35.X, 32, #1
+; EG-NEXT:     VTX_READ_128 T41.XYZW, T35.X, 16, #1
 ; EG-NEXT:    Fetch clause starting at 30:
 ; EG-NEXT:     VTX_READ_128 T49.XYZW, T35.X, 112, #1
 ; EG-NEXT:     VTX_READ_128 T50.XYZW, T35.X, 96, #1
@@ -3489,74 +3490,74 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; EG-NEXT:    ALU clause starting at 38:
 ; EG-NEXT:     MOV * T35.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 39:
-; EG-NEXT:     LSHR * T40.W, T36.Y, literal.x,
+; EG-NEXT:     LSHR * T37.W, T36.Y, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT * T40.Z, T36.Y, literal.x,
+; EG-NEXT:     AND_INT * T37.Z, T36.Y, literal.x,
 ; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     LSHR T40.Y, T36.X, literal.x,
-; EG-NEXT:     LSHR * T41.W, T36.W, literal.x,
+; EG-NEXT:     LSHR T37.Y, T36.X, literal.x,
+; EG-NEXT:     LSHR * T38.W, T36.W, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T40.X, T36.X, literal.x,
-; EG-NEXT:     AND_INT T41.Z, T36.W, literal.x,
+; EG-NEXT:     AND_INT T37.X, T36.X, literal.x,
+; EG-NEXT:     AND_INT T38.Z, T36.W, literal.x,
 ; EG-NEXT:     LSHR * T36.X, KC0[2].Y, literal.y,
 ; EG-NEXT:    65535(9.183409e-41), 2(2.802597e-45)
-; EG-NEXT:     LSHR T41.Y, T36.Z, literal.x,
-; EG-NEXT:     LSHR * T42.W, T39.Y, literal.x,
+; EG-NEXT:     LSHR T38.Y, T36.Z, literal.x,
+; EG-NEXT:     LSHR * T42.W, T41.Y, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T41.X, T36.Z, literal.x,
-; EG-NEXT:     AND_INT T42.Z, T39.Y, literal.x,
+; EG-NEXT:     AND_INT T38.X, T36.Z, literal.x,
+; EG-NEXT:     AND_INT T42.Z, T41.Y, literal.x,
 ; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
 ; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
 ; EG-NEXT:     LSHR T43.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T42.Y, T39.X, literal.y,
-; EG-NEXT:     LSHR T44.W, T39.W, literal.y,
-; EG-NEXT:     AND_INT * T42.X, T39.X, literal.z,
+; EG-NEXT:     LSHR T42.Y, T41.X, literal.y,
+; EG-NEXT:     LSHR T44.W, T41.W, literal.y,
+; EG-NEXT:     AND_INT * T42.X, T41.X, literal.z,
 ; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
 ; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T44.Z, T39.W, literal.x,
+; EG-NEXT:     AND_INT T44.Z, T41.W, literal.x,
 ; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
 ; EG-NEXT:    65535(9.183409e-41), 32(4.484155e-44)
-; EG-NEXT:     LSHR T39.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T44.Y, T39.Z, literal.y,
-; EG-NEXT:     LSHR T45.W, T38.Y, literal.y,
-; EG-NEXT:     AND_INT * T44.X, T39.Z, literal.z,
+; EG-NEXT:     LSHR T41.X, PV.W, literal.x,
+; EG-NEXT:     LSHR T44.Y, T41.Z, literal.y,
+; EG-NEXT:     LSHR T45.W, T40.Y, literal.y,
+; EG-NEXT:     AND_INT * T44.X, T41.Z, literal.z,
 ; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
 ; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T45.Z, T38.Y, literal.x,
+; EG-NEXT:     AND_INT T45.Z, T40.Y, literal.x,
 ; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
 ; EG-NEXT:    65535(9.183409e-41), 48(6.726233e-44)
 ; EG-NEXT:     LSHR T46.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T45.Y, T38.X, literal.y,
-; EG-NEXT:     LSHR T47.W, T38.W, literal.y,
-; EG-NEXT:     AND_INT * T45.X, T38.X, literal.z,
+; EG-NEXT:     LSHR T45.Y, T40.X, literal.y,
+; EG-NEXT:     LSHR T47.W, T40.W, literal.y,
+; EG-NEXT:     AND_INT * T45.X, T40.X, literal.z,
 ; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
 ; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T47.Z, T38.W, literal.x,
+; EG-NEXT:     AND_INT T47.Z, T40.W, literal.x,
 ; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
 ; EG-NEXT:    65535(9.183409e-41), 64(8.968310e-44)
-; EG-NEXT:     LSHR T38.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T47.Y, T38.Z, literal.y,
-; EG-NEXT:     AND_INT * T47.X, T38.Z, literal.z,
+; EG-NEXT:     LSHR T40.X, PV.W, literal.x,
+; EG-NEXT:     LSHR T47.Y, T40.Z, literal.y,
+; EG-NEXT:     AND_INT * T47.X, T40.Z, literal.z,
 ; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
 ; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
 ; EG-NEXT:     ADD_INT T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:     LSHR * T35.W, T37.Y, literal.y,
+; EG-NEXT:     LSHR * T35.W, T39.Y, literal.y,
 ; EG-NEXT:    80(1.121039e-43), 16(2.242078e-44)
 ; EG-NEXT:     LSHR T48.X, PV.W, literal.x,
-; EG-NEXT:     AND_INT * T35.Z, T37.Y, literal.y,
+; EG-NEXT:     AND_INT * T35.Z, T39.Y, literal.y,
 ; EG-NEXT:    2(2.802597e-45), 65535(9.183409e-41)
 ; EG-NEXT:    ALU clause starting at 95:
-; EG-NEXT:     LSHR T35.Y, T37.X, literal.x,
-; EG-NEXT:     LSHR * T53.W, T37.W, literal.x,
+; EG-NEXT:     LSHR T35.Y, T39.X, literal.x,
+; EG-NEXT:     LSHR * T53.W, T39.W, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T35.X, T37.X, literal.x,
-; EG-NEXT:     AND_INT T53.Z, T37.W, literal.x,
+; EG-NEXT:     AND_INT T35.X, T39.X, literal.x,
+; EG-NEXT:     AND_INT T53.Z, T39.W, literal.x,
 ; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
 ; EG-NEXT:    65535(9.183409e-41), 96(1.345247e-43)
-; EG-NEXT:     LSHR T37.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T53.Y, T37.Z, literal.y,
+; EG-NEXT:     LSHR T39.X, PV.W, literal.x,
+; EG-NEXT:     LSHR T53.Y, T39.Z, literal.y,
 ; EG-NEXT:     LSHR T54.W, T52.Y, literal.y,
-; EG-NEXT:     AND_INT * T53.X, T37.Z, literal.z,
+; EG-NEXT:     AND_INT * T53.X, T39.Z, literal.z,
 ; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
 ; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
 ; EG-NEXT:     AND_INT T54.Z, T52.Y, literal.x,
@@ -3643,101 +3644,103 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa
 define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(4)* %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_sextload_v64i16_to_v64i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[36:39], s[0:1], 0x9
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x0
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx16 s[36:51], s[2:3], 0x10
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx16 s[0:15], s[38:39], 0x0
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx16 s[16:31], s[38:39], 0x10
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s20, s5, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s21, s4, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s33, s1, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s34, s0, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s35, s1
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s38, s0
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s39, s3, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s40, s2, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s41, s3
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s42, s2
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s43, s5, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s44, s4, 16
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s5, s5
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s4, s4
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s22, s7, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s23, s6, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s45, s7, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s46, s6, 16
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s7, s7
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s6, s6
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s24, s9, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s25, s8, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s47, s9, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s48, s8, 16
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s9, s9
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s8, s8
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s26, s11, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s27, s10, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s49, s11, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s50, s10, 16
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s11, s11
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s10, s10
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s28, s13, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s29, s12, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s51, s13, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s52, s12, 16
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s13, s13
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s12, s12
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s30, s15, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s31, s14, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s53, s15, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s54, s14, 16
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s15, s15
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s14, s14
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s33, s17, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s34, s16, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s55, s17, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s56, s16, 16
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s17, s17
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s16, s16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s35, s19, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s52, s18, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s57, s19, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s58, s18, 16
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s19, s19
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s18, s18
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s53, s37, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s54, s36, 16
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s37, s37
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s36, s36
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s55, s39, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s56, s38, 16
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s39, s39
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s38, s38
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s57, s41, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s58, s40, 16
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s41, s41
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s40, s40
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s59, s42, 16
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s60, s43
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s42, s42
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s61, s45, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s62, s44, 16
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s45, s45
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s44, s44
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s63, s47, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s64, s46, 16
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s47, s47
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s46, s46
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s65, s49, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s66, s48, 16
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s49, s49
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s48, s48
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s67, s51, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s68, s50, 16
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s51, s51
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s50, s50
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s43, s43, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s59, s21, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s60, s20, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s21, s21
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s20, s20
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s61, s22, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s62, s23
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s22, s22
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s63, s25, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s64, s24, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s25, s25
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s24, s24
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s65, s27, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s66, s26, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s27, s27
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s26, s26
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s67, s29, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s68, s28, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s29, s29
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s28, s28
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s69, s31, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s70, s30, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s31, s31
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s30, s30
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s23, s23, 16
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s36
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s37
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s50
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s68
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s51
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s67
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s48
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s66
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s49
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s65
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, s46
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, s64
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s47
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s63
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, s44
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, s62
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, s45
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, s61
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, s42
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, s59
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v18, s60
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v20, s40
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, s43
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v21, s58
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v22, s41
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v23, s57
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s30
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s70
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s31
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s69
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s28
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s68
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s29
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s67
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, s26
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, s66
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s27
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s65
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, s24
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, s64
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, s25
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, s63
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, s22
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, s61
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v18, s62
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v20, s20
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, s23
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v21, s60
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v22, s21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v23, s59
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
@@ -3745,64 +3748,64 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:160
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(5)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s38
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s18
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s58
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s19
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s57
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s16
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s56
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s17
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s55
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s36
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s14
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s54
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s37
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s15
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s53
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s18
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s52
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s19
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s35
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s16
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s34
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s17
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s33
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s14
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s31
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s15
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s30
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s12
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s29
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s52
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s13
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s28
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s51
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s27
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s50
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s11
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s26
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s49
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s25
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s48
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s24
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s47
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s23
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s46
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s7
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s22
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s45
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s44
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s5
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s20
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s43
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s42
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s40
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s41
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s39
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s38
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s34
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s35
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s33
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
@@ -3810,58 +3813,51 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x0
-; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT:    s_ashr_i32 s20, s5, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s21, s4, 16
-; GCN-HSA-NEXT:    s_sext_i32_i16 s22, s5
-; GCN-HSA-NEXT:    s_sext_i32_i16 s23, s4
-; GCN-HSA-NEXT:    s_ashr_i32 s24, s7, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s25, s6, 16
-; GCN-HSA-NEXT:    s_sext_i32_i16 s26, s7
-; GCN-HSA-NEXT:    s_sext_i32_i16 s27, s6
-; GCN-HSA-NEXT:    s_ashr_i32 s28, s9, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s29, s8, 16
-; GCN-HSA-NEXT:    s_sext_i32_i16 s30, s9
-; GCN-HSA-NEXT:    s_sext_i32_i16 s31, s8
-; GCN-HSA-NEXT:    s_ashr_i32 s33, s11, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s34, s10, 16
-; GCN-HSA-NEXT:    s_sext_i32_i16 s35, s11
-; GCN-HSA-NEXT:    s_sext_i32_i16 s36, s10
-; GCN-HSA-NEXT:    s_ashr_i32 s37, s13, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s38, s12, 16
-; GCN-HSA-NEXT:    s_sext_i32_i16 s39, s13
-; GCN-HSA-NEXT:    s_sext_i32_i16 s40, s12
-; GCN-HSA-NEXT:    s_ashr_i32 s41, s15, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s42, s14, 16
-; GCN-HSA-NEXT:    s_sext_i32_i16 s43, s15
-; GCN-HSA-NEXT:    s_sext_i32_i16 s44, s14
-; GCN-HSA-NEXT:    s_ashr_i32 s45, s17, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s46, s16, 16
-; GCN-HSA-NEXT:    s_sext_i32_i16 s47, s17
-; GCN-HSA-NEXT:    s_sext_i32_i16 s48, s16
-; GCN-HSA-NEXT:    s_ashr_i32 s49, s19, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s50, s18, 16
-; GCN-HSA-NEXT:    s_sext_i32_i16 s51, s19
-; GCN-HSA-NEXT:    s_sext_i32_i16 s52, s18
-; GCN-HSA-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x10
+; GCN-HSA-NEXT:    s_load_dwordx16 s[16:31], s[2:3], 0x0
+; GCN-HSA-NEXT:    s_load_dwordx16 s[36:51], s[2:3], 0x10
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT:    s_ashr_i32 s53, s5, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s54, s4, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s55, s7, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s56, s6, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s57, s9, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s58, s8, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s59, s11, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s60, s10, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s61, s13, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s62, s12, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s63, s15, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s64, s14, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s65, s17, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s66, s16, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s67, s19, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s68, s18, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s4, s17, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s5, s16, 16
+; GCN-HSA-NEXT:    s_sext_i32_i16 s6, s17
+; GCN-HSA-NEXT:    s_sext_i32_i16 s7, s16
+; GCN-HSA-NEXT:    s_ashr_i32 s8, s19, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s9, s18, 16
+; GCN-HSA-NEXT:    s_sext_i32_i16 s10, s19
+; GCN-HSA-NEXT:    s_sext_i32_i16 s11, s18
+; GCN-HSA-NEXT:    s_ashr_i32 s12, s21, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s13, s20, 16
+; GCN-HSA-NEXT:    s_sext_i32_i16 s14, s21
+; GCN-HSA-NEXT:    s_sext_i32_i16 s15, s20
+; GCN-HSA-NEXT:    s_ashr_i32 s16, s23, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s17, s22, 16
+; GCN-HSA-NEXT:    s_sext_i32_i16 s18, s23
+; GCN-HSA-NEXT:    s_sext_i32_i16 s19, s22
+; GCN-HSA-NEXT:    s_ashr_i32 s20, s25, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s21, s24, 16
+; GCN-HSA-NEXT:    s_sext_i32_i16 s22, s25
+; GCN-HSA-NEXT:    s_sext_i32_i16 s23, s24
+; GCN-HSA-NEXT:    s_ashr_i32 s24, s27, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s25, s26, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s33, s29, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s34, s28, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s35, s31, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s52, s30, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s53, s37, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s54, s36, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s55, s39, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s56, s38, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s57, s41, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s58, s40, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s59, s43, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s60, s42, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s61, s45, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s62, s44, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s63, s47, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s64, s46, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s65, s49, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s66, s48, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s67, s51, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s68, s50, 16
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xf0
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s3
@@ -3885,13 +3881,13 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xa0
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v33, s3
-; GCN-HSA-NEXT:    s_sext_i32_i16 s17, s17
-; GCN-HSA-NEXT:    s_sext_i32_i16 s16, s16
+; GCN-HSA-NEXT:    s_sext_i32_i16 s49, s49
+; GCN-HSA-NEXT:    s_sext_i32_i16 s48, s48
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v32, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x90
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s48
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s66
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s17
+; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s49
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s65
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[4:7]
@@ -3900,155 +3896,145 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x80
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v35, s3
-; GCN-HSA-NEXT:    s_sext_i32_i16 s11, s11
-; GCN-HSA-NEXT:    s_sext_i32_i16 s10, s10
+; GCN-HSA-NEXT:    s_sext_i32_i16 s43, s43
+; GCN-HSA-NEXT:    s_sext_i32_i16 s42, s42
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v34, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x70
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s10
+; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s42
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s60
-; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s11
+; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s43
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s59
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[30:31], v[16:19]
-; GCN-HSA-NEXT:    s_sext_i32_i16 s19, s19
+; GCN-HSA-NEXT:    s_sext_i32_i16 s51, s51
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x60
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    s_sext_i32_i16 s18, s18
+; GCN-HSA-NEXT:    s_sext_i32_i16 s50, s50
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s3
-; GCN-HSA-NEXT:    s_sext_i32_i16 s4, s4
-; GCN-HSA-NEXT:    s_sext_i32_i16 s7, s7
-; GCN-HSA-NEXT:    s_sext_i32_i16 s6, s6
-; GCN-HSA-NEXT:    s_sext_i32_i16 s9, s9
-; GCN-HSA-NEXT:    s_sext_i32_i16 s8, s8
-; GCN-HSA-NEXT:    s_sext_i32_i16 s13, s13
-; GCN-HSA-NEXT:    s_sext_i32_i16 s12, s12
-; GCN-HSA-NEXT:    s_sext_i32_i16 s15, s15
-; GCN-HSA-NEXT:    s_sext_i32_i16 s14, s14
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s18
+; GCN-HSA-NEXT:    s_sext_i32_i16 s36, s36
+; GCN-HSA-NEXT:    s_sext_i32_i16 s39, s39
+; GCN-HSA-NEXT:    s_sext_i32_i16 s38, s38
+; GCN-HSA-NEXT:    s_sext_i32_i16 s41, s41
+; GCN-HSA-NEXT:    s_sext_i32_i16 s40, s40
+; GCN-HSA-NEXT:    s_sext_i32_i16 s45, s45
+; GCN-HSA-NEXT:    s_sext_i32_i16 s44, s44
+; GCN-HSA-NEXT:    s_sext_i32_i16 s47, s47
+; GCN-HSA-NEXT:    s_sext_i32_i16 s46, s46
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s50
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s68
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s19
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s51
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s67
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x50
-; GCN-HSA-NEXT:    s_sext_i32_i16 s5, s5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s14
+; GCN-HSA-NEXT:    s_sext_i32_i16 s29, s29
+; GCN-HSA-NEXT:    s_sext_i32_i16 s28, s28
+; GCN-HSA-NEXT:    s_sext_i32_i16 s31, s31
+; GCN-HSA-NEXT:    s_sext_i32_i16 s30, s30
+; GCN-HSA-NEXT:    s_sext_i32_i16 s37, s37
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s46
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s64
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s15
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s47
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s63
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s12
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s44
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s62
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s45
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s61
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s40
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s58
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s9
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s38
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s41
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s57
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s56
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s39
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s36
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s55
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s54
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[8:11]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s52
+; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s37
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s30
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[12:15]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s53
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s48
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s50
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s51
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s49
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s46
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s28
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s52
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s31
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s35
+; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s34
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[32:33], v[20:23]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s47
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s45
+; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s29
+; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s33
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[0:3]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[34:35], v[4:7]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[12:15]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
+; GCN-HSA-NEXT:    s_sext_i32_i16 s27, s27
+; GCN-HSA-NEXT:    s_sext_i32_i16 s26, s26
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 64
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s44
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s42
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s43
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s41
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s26
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s25
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s27
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s24
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 48
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s40
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s38
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s39
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s37
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s23
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s21
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s22
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s20
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 32
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s36
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s34
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s35
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s33
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s19
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s17
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s18
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s31
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s29
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s30
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s28
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s15
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s14
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s12
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s27
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s25
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s26
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s24
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s11
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s9
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s10
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s23
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s21
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s22
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s20
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: constant_sextload_v64i16_to_v64i32:
 ; GCN-NOHSA-VI:       ; %bb.0:
-; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[16:19], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[36:39], s[0:1], 0x24
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x40
-; GCN-NOHSA-VI-NEXT:    s_load_dwordx16 s[36:51], s[18:19], 0x0
+; GCN-NOHSA-VI-NEXT:    s_load_dwordx16 s[0:15], s[38:39], 0x40
+; GCN-NOHSA-VI-NEXT:    s_load_dwordx16 s[16:31], s[38:39], 0x0
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s69, s15, 16
 ; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s70, s14, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s15, s15
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s14, s14
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s18, s37, 16
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s20, s37
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s22, s39, 16
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s24, s39
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s26, s41, 16
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s28, s41
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s30, s43, 16
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s33, s43
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s35, s45, 16
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s37, s45
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s39, s47, 16
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s41, s47
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s43, s49, 16
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s45, s49
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s47, s51, 16
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s49, s51
 ; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s51, s1, 16
 ; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s52, s0, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s53, s1
@@ -4063,8 +4049,8 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s12, s12
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, -1
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s16
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s17
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s36
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s37
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s14
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s70
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s15
@@ -4101,90 +4087,99 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s5, s5
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s4, s4
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s19, s36, 16
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s49, s31, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s62
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s7
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s61
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s21, s36
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s50, s30, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s60
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s5
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s59
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s23, s38, 16
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s31, s31
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s58
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s56
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s57
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s55
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s25, s38
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s27, s40, 16
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s29, s40
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s31, s42, 16
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s34, s42
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s36, s44, 16
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s38, s44
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s40, s46, 16
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s42, s46
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s44, s48, 16
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s46, s48
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s48, s50, 16
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s50, s50
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s30, s30
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
-; GCN-NOHSA-VI-NEXT:    s_nop 0
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s47, s29, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s54
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s52
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s53
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s51
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s48, s28, 16
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s29, s29
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s28, s28
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
-; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s50
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s45, s27, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s30
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s50
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s31
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s49
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s46, s26, 16
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s27, s27
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s26, s26
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s43, s25, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s28
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s48
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s49
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s29
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s47
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
-; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s46
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s44
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s45
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s43
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s44, s24, 16
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s25, s25
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s24, s24
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
-; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s42
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s40
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s41
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s39
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s41, s23, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s26
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s46
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s27
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s45
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s42, s22, 16
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s23, s23
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s22, s22
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
-; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s38
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s36
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s37
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s35
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s39, s21, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s24
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s44
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s25
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s43
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s40, s20, 16
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s21, s21
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s20, s20
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
-; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s34
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s31
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s33
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s30
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s35, s19, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s22
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s42
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s23
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s41
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s38, s18, 16
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s19, s19
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s18, s18
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s29
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s27
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s28
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s26
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s33, s17, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s20
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s40
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s21
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s39
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s34, s16, 16
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s17, s17
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s16, s16
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s25
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s23
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s24
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s22
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s18
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s38
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s19
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s35
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s21
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s19
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s20
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s18
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s34
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s17
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s33
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-VI-NEXT:    s_endpgm
 ;
@@ -5115,40 +5110,40 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(<4 x i64> addrspace(
 define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_zextload_v8i16_to_v8i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x0
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, -1
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, 0xffff
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v1
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s0, s9, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s1, s11, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s3, s10, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s4, s9, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s5, s11, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s7, s10, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s12, s8, 16
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s8, s8, s2
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s10, s10, s2
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s11, s11, s2
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s2, s9, s2
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s8, s8, s6
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s10, s10, s6
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s11, s11, s6
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s6, s9, s6
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s11
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s1
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s5
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s3
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s7
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s12
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: constant_zextload_v8i16_to_v8i64:
@@ -5860,185 +5855,186 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspa
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
+; GCN-HSA-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT:    s_mov_b32 s2, s11
-; GCN-HSA-NEXT:    s_mov_b32 s12, s9
-; GCN-HSA-NEXT:    s_mov_b32 s14, s7
-; GCN-HSA-NEXT:    s_mov_b32 s16, s5
-; GCN-HSA-NEXT:    s_lshr_b32 s18, s10, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s20, s8, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s22, s6, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s24, s4, 16
-; GCN-HSA-NEXT:    s_bfe_i64 s[34:35], s[10:11], 0x100000
-; GCN-HSA-NEXT:    s_ashr_i64 s[10:11], s[10:11], 48
-; GCN-HSA-NEXT:    s_bfe_i64 s[2:3], s[2:3], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[26:27], s[4:5], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[28:29], s[6:7], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[30:31], s[8:9], 0x100000
-; GCN-HSA-NEXT:    s_ashr_i64 s[4:5], s[4:5], 48
-; GCN-HSA-NEXT:    s_ashr_i64 s[6:7], s[6:7], 48
+; GCN-HSA-NEXT:    s_mov_b32 s6, s15
+; GCN-HSA-NEXT:    s_mov_b32 s16, s13
+; GCN-HSA-NEXT:    s_mov_b32 s18, s11
+; GCN-HSA-NEXT:    s_mov_b32 s20, s9
+; GCN-HSA-NEXT:    s_lshr_b32 s22, s14, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s24, s12, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s26, s10, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s28, s8, 16
+; GCN-HSA-NEXT:    s_bfe_i64 s[34:35], s[14:15], 0x100000
+; GCN-HSA-NEXT:    s_ashr_i64 s[14:15], s[14:15], 48
+; GCN-HSA-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[2:3], s[8:9], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[4:5], s[10:11], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[30:31], s[12:13], 0x100000
 ; GCN-HSA-NEXT:    s_ashr_i64 s[8:9], s[8:9], 48
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s10
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s11
-; GCN-HSA-NEXT:    s_bfe_i64 s[2:3], s[24:25], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[10:11], s[22:23], 0x100000
+; GCN-HSA-NEXT:    s_ashr_i64 s[10:11], s[10:11], 48
+; GCN-HSA-NEXT:    s_ashr_i64 s[12:13], s[12:13], 48
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s14
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s15
+; GCN-HSA-NEXT:    s_bfe_i64 s[6:7], s[28:29], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[14:15], s[26:27], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[24:25], s[24:25], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[22:23], s[22:23], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[20:21], s[20:21], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[18:19], s[18:19], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[16:17], s[16:17], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x100000
-; GCN-HSA-NEXT:    s_add_u32 s22, s0, 0x70
-; GCN-HSA-NEXT:    s_addc_u32 s23, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s8
-; GCN-HSA-NEXT:    s_add_u32 s8, s0, 0x50
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s22
-; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s9
-; GCN-HSA-NEXT:    s_addc_u32 s9, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s23
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s9
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s12
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s13
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s8
+; GCN-HSA-NEXT:    s_add_u32 s26, s0, 0x70
+; GCN-HSA-NEXT:    s_addc_u32 s27, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s26
+; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s12
+; GCN-HSA-NEXT:    s_add_u32 s12, s0, 0x50
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s27
+; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s13
+; GCN-HSA-NEXT:    s_addc_u32 s13, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s12
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s17
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s13
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s6
-; GCN-HSA-NEXT:    s_add_u32 s6, s0, 48
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s7
-; GCN-HSA-NEXT:    s_addc_u32 s7, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s6
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s14
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s15
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s10
+; GCN-HSA-NEXT:    s_add_u32 s10, s0, 48
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s11
+; GCN-HSA-NEXT:    s_addc_u32 s11, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s10
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s18
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s19
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s11
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    s_nop 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-HSA-NEXT:    s_add_u32 s4, s0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s5
-; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s17
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
-; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0x60
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s8
+; GCN-HSA-NEXT:    s_add_u32 s8, s0, 16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s9
+; GCN-HSA-NEXT:    s_addc_u32 s9, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s20
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s21
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s9
+; GCN-HSA-NEXT:    s_add_u32 s8, s0, 0x60
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
+; GCN-HSA-NEXT:    s_addc_u32 s9, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s35
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s18
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s19
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
-; GCN-HSA-NEXT:    s_add_u32 s4, s0, 64
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s22
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s23
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s9
+; GCN-HSA-NEXT:    s_add_u32 s8, s0, 64
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
+; GCN-HSA-NEXT:    s_addc_u32 s9, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s30
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s31
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s20
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s21
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
-; GCN-HSA-NEXT:    s_add_u32 s4, s0, 32
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s24
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s25
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s9
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT:    s_nop 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-HSA-NEXT:    s_add_u32 s4, s0, 32
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s28
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s29
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s10
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s11
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s14
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s15
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s26
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s27
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: constant_sextload_v16i16_to_v16i64:
 ; GCN-NOHSA-VI:       ; %bb.0:
-; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, 0xf000
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, -1
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s4
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s5
-; GCN-NOHSA-VI-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s0
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s1
+; GCN-NOHSA-VI-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x0
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[30:31], s[10:11], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s10, s10, 16
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[34:35], s[10:11], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s11
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[24:25], s[8:9], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s8, s8, 16
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[36:37], s[10:11], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s10, s11, 16
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[26:27], s[8:9], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s9
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[18:19], s[6:7], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[30:31], s[6:7], 0x100000
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s6, s6, 16
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[28:29], s[8:9], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s8, s9, 16
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[34:35], s[6:7], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, s7
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[24:25], s[4:5], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s4, s4, 16
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[36:37], s[6:7], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s6, s7, 16
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[26:27], s[4:5], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, s5
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[18:19], s[2:3], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s2, s2, 16
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[28:29], s[4:5], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s4, s5, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s36
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s37
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s10
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s11
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[20:21], s[6:7], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, s7
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x100000
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[22:23], s[6:7], 0x100000
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s6
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s7
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[20:21], s[2:3], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, s3
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x100000
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:112
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[22:23], s[2:3], 0x100000
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s30
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s31
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s34
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s35
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s6, s7, 16
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s2, s3, 16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:96
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[2:3], s[2:3], 0x100000
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s28
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s29
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s8
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s9
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s14, s5
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:80
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s14, s1
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s24
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s25
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s26
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s27
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s16, s5, 16
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[12:13], s[4:5], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s16, s1, 16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:64
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[12:13], s[0:1], 0x100000
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s22
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s23
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s6
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s7
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s4, s4, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s0, s0, 16
 ; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x100000
 ; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[16:17], s[16:17], 0x100000
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x100000
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:48
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[0:1], s[0:1], 0x100000
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s18
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s19
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s20
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s21
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:32
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s14
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s15
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s17
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s12
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s13
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s5
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; GCN-NOHSA-VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: constant_sextload_v16i16_to_v16i64:
@@ -6136,109 +6132,111 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspa
 define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(4)* %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_zextload_v32i16_to_v32i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[16:19], s[0:1], 0x9
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x0
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s18, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s20, s5, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s21, s7, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s22, s9, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s23, s11, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s24, s13, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s25, s15, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s26, s17, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s27, s19, 16
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s28, s4, s2
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s29, s6, s2
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s30, s8, s2
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s31, s10, s2
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s33, s12, s2
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s34, s14, s2
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s35, s16, s2
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s36, s18, s2
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, s2
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s7, s7, s2
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s9, s9, s2
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s11, s11, s2
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s13, s13, s2
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s15, s15, s2
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s17, s17, s2
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s19, s19, s2
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s18, s18, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s16, s16, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s19, s1, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s20, s3, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s21, s5, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s22, s7, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s23, s9, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s24, s11, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s25, s13, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s26, s15, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s27, s0, s18
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s28, s2, s18
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s29, s4, s18
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s30, s6, s18
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s31, s8, s18
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s33, s10, s18
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s34, s12, s18
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s35, s14, s18
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s36, s1, s18
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s37, s3, s18
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, s18
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s7, s7, s18
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s9, s9, s18
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s11, s11, s18
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s13, s13, s18
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s15, s15, s18
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s14, s14, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s12, s12, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s10, s10, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s8, s8, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s6, s6, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s4, s4, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s18, s2, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s38, s0, 16
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v1
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s19
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s27
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s16
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s17
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s15
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s26
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s17
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s26
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s13
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s25
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s15
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s25
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s11
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s24
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s13
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s24
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s9
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s23
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s11
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s23
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s7
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s22
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s22
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s5
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s21
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s7
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s37
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s20
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s5
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s20
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s36
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s19
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s36
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s18
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s35
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s14
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s35
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s16
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s34
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s12
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s34
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s14
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s33
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s10
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s33
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s12
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s31
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s8
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s31
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s10
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s30
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s30
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s8
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s29
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s4
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s29
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s6
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s28
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s18
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s28
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s27
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s38
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
@@ -6668,26 +6666,26 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspa
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s18, s15
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s20, s13
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s22, s11
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s24, s9
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s26, s7
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s28, s5
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s30, s3
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s34, s1
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s36, s14, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s38, s12, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s40, s10, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s42, s8, 16
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[20:21], s[20:21], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[44:45], s[18:19], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s46, s6, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s48, s4, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s50, s2, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s52, s0, 16
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s36, s11
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s40, s9
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s44, s7
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s46, s5
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s38, s3
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s42, s1
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s24, s14, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s26, s12, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s28, s10, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s34, s8, 16
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[48:49], s[20:21], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[50:51], s[18:19], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s52, s6, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s54, s4, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s56, s2, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s58, s0, 16
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[18:19], s[0:1], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[54:55], s[2:3], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[56:57], s[4:5], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[58:59], s[6:7], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[20:21], s[2:3], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[22:23], s[4:5], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[30:31], s[6:7], 0x100000
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[60:61], s[8:9], 0x100000
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[62:63], s[10:11], 0x100000
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[64:65], s[12:13], 0x100000
@@ -6702,24 +6700,24 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspa
 ; GCN-NOHSA-SI-NEXT:    s_ashr_i64 s[4:5], s[4:5], 48
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s16
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s17
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s44
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s45
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s50
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s51
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s12
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s13
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s20
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s48
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s49
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s2
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s3
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[12:13], s[28:29], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[14:15], s[26:27], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[16:17], s[24:25], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[20:21], s[22:23], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[22:23], s[34:35], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[24:25], s[30:31], 0x100000
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, s20
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, s21
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[12:13], s[46:47], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[14:15], s[44:45], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[16:17], s[40:41], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[36:37], s[36:37], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[40:41], s[42:43], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[38:39], s[38:39], 0x100000
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, s36
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, s37
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s10
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s11
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, s16
@@ -6735,28 +6733,28 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspa
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v22, s4
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v23, s5
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[4:5], s[52:53], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[6:7], s[50:51], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[8:9], s[48:49], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[10:11], s[46:47], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[12:13], s[42:43], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[14:15], s[40:41], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[16:17], s[38:39], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[20:21], s[36:37], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[4:5], s[58:59], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[6:7], s[56:57], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[8:9], s[54:55], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[10:11], s[52:53], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[12:13], s[34:35], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[14:15], s[28:29], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[16:17], s[26:27], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[24:25], s[24:25], 0x100000
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:144
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(5)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s24
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s25
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s38
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s39
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s70
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s71
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s22
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s23
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s40
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s41
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s68
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s69
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
@@ -6769,14 +6767,14 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspa
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, s63
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, s60
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, s61
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, s58
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, s59
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v20, s56
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v21, s57
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v24, s54
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v25, s55
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s20
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, s30
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, s31
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v20, s22
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v21, s23
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v24, s20
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v25, s21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s24
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s25
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s18
@@ -6810,139 +6808,139 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspa
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT:    s_mov_b32 s38, s15
-; GCN-HSA-NEXT:    s_mov_b32 s40, s13
-; GCN-HSA-NEXT:    s_mov_b32 s42, s11
-; GCN-HSA-NEXT:    s_mov_b32 s44, s9
-; GCN-HSA-NEXT:    s_mov_b32 s46, s7
-; GCN-HSA-NEXT:    s_mov_b32 s48, s5
-; GCN-HSA-NEXT:    s_mov_b32 s50, s3
-; GCN-HSA-NEXT:    s_mov_b32 s52, s1
-; GCN-HSA-NEXT:    s_lshr_b32 s54, s14, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s56, s12, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s58, s10, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s60, s8, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s62, s6, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s64, s4, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s66, s2, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s68, s0, 16
+; GCN-HSA-NEXT:    s_mov_b32 s42, s15
+; GCN-HSA-NEXT:    s_mov_b32 s44, s13
+; GCN-HSA-NEXT:    s_mov_b32 s46, s11
+; GCN-HSA-NEXT:    s_mov_b32 s48, s9
+; GCN-HSA-NEXT:    s_mov_b32 s50, s7
+; GCN-HSA-NEXT:    s_mov_b32 s52, s5
+; GCN-HSA-NEXT:    s_mov_b32 s54, s3
+; GCN-HSA-NEXT:    s_mov_b32 s56, s1
+; GCN-HSA-NEXT:    s_lshr_b32 s58, s14, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s60, s12, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s62, s10, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s64, s8, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s66, s6, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s68, s4, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s70, s2, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s72, s0, 16
 ; GCN-HSA-NEXT:    s_bfe_i64 s[18:19], s[0:1], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[20:21], s[2:3], 0x100000
 ; GCN-HSA-NEXT:    s_ashr_i64 s[36:37], s[0:1], 48
-; GCN-HSA-NEXT:    s_ashr_i64 s[70:71], s[2:3], 48
+; GCN-HSA-NEXT:    s_ashr_i64 s[38:39], s[2:3], 48
 ; GCN-HSA-NEXT:    s_ashr_i64 s[0:1], s[14:15], 48
-; GCN-HSA-NEXT:    s_bfe_i64 s[2:3], s[38:39], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[2:3], s[42:43], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[22:23], s[4:5], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[24:25], s[6:7], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[26:27], s[8:9], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[28:29], s[10:11], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[30:31], s[12:13], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[34:35], s[14:15], 0x100000
-; GCN-HSA-NEXT:    s_ashr_i64 s[72:73], s[4:5], 48
+; GCN-HSA-NEXT:    s_ashr_i64 s[40:41], s[4:5], 48
 ; GCN-HSA-NEXT:    s_ashr_i64 s[74:75], s[6:7], 48
 ; GCN-HSA-NEXT:    s_ashr_i64 s[76:77], s[8:9], 48
 ; GCN-HSA-NEXT:    s_ashr_i64 s[78:79], s[10:11], 48
-; GCN-HSA-NEXT:    s_ashr_i64 s[12:13], s[12:13], 48
+; GCN-HSA-NEXT:    s_ashr_i64 s[80:81], s[12:13], 48
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s1
-; GCN-HSA-NEXT:    s_bfe_i64 s[0:1], s[68:69], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[2:3], s[66:67], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[4:5], s[64:65], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[6:7], s[62:63], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[8:9], s[60:61], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[10:11], s[58:59], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[14:15], s[56:57], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[38:39], s[54:55], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[0:1], s[72:73], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[2:3], s[70:71], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[4:5], s[68:69], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[6:7], s[66:67], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[8:9], s[64:65], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[10:11], s[62:63], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[12:13], s[60:61], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[14:15], s[58:59], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[42:43], s[56:57], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[54:55], s[54:55], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[52:53], s[52:53], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[50:51], s[50:51], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[48:49], s[48:49], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[46:47], s[46:47], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[44:45], s[44:45], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[42:43], s[42:43], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[40:41], s[40:41], 0x100000
-; GCN-HSA-NEXT:    s_add_u32 s54, s16, 0xf0
-; GCN-HSA-NEXT:    s_addc_u32 s55, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s12
-; GCN-HSA-NEXT:    s_add_u32 s12, s16, 0xd0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s13
-; GCN-HSA-NEXT:    s_addc_u32 s13, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s13
-; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s12
-; GCN-HSA-NEXT:    s_add_u32 s12, s16, 0xb0
-; GCN-HSA-NEXT:    s_addc_u32 s13, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s13
-; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s12
-; GCN-HSA-NEXT:    s_add_u32 s12, s16, 0x90
-; GCN-HSA-NEXT:    s_addc_u32 s13, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v29, s13
-; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s12
-; GCN-HSA-NEXT:    s_add_u32 s12, s16, 0x70
-; GCN-HSA-NEXT:    s_addc_u32 s13, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v31, s13
-; GCN-HSA-NEXT:    v_mov_b32_e32 v30, s12
-; GCN-HSA-NEXT:    s_add_u32 s12, s16, 0x50
-; GCN-HSA-NEXT:    s_addc_u32 s13, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v33, s13
-; GCN-HSA-NEXT:    v_mov_b32_e32 v32, s12
-; GCN-HSA-NEXT:    s_add_u32 s12, s16, 48
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s40
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s41
-; GCN-HSA-NEXT:    s_addc_u32 s13, s17, 0
+; GCN-HSA-NEXT:    s_add_u32 s56, s16, 0xf0
+; GCN-HSA-NEXT:    s_addc_u32 s57, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s44
+; GCN-HSA-NEXT:    s_add_u32 s44, s16, 0xd0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s45
+; GCN-HSA-NEXT:    s_addc_u32 s45, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s44
+; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s45
+; GCN-HSA-NEXT:    s_add_u32 s44, s16, 0xb0
+; GCN-HSA-NEXT:    s_addc_u32 s45, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s44
+; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s45
+; GCN-HSA-NEXT:    s_add_u32 s44, s16, 0x90
+; GCN-HSA-NEXT:    s_addc_u32 s45, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s44
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s56
+; GCN-HSA-NEXT:    v_mov_b32_e32 v29, s45
+; GCN-HSA-NEXT:    s_add_u32 s44, s16, 0x70
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s57
+; GCN-HSA-NEXT:    s_addc_u32 s45, s17, 0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[0:3]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s40
+; GCN-HSA-NEXT:    s_add_u32 s40, s16, 0x50
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s41
+; GCN-HSA-NEXT:    s_addc_u32 s41, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s80
+; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s81
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s38
+; GCN-HSA-NEXT:    s_add_u32 s38, s16, 48
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s39
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[4:7]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s13
-; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s12
-; GCN-HSA-NEXT:    s_add_u32 s12, s16, 16
-; GCN-HSA-NEXT:    s_addc_u32 s13, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v35, s13
-; GCN-HSA-NEXT:    v_mov_b32_e32 v34, s12
-; GCN-HSA-NEXT:    s_add_u32 s12, s16, 0xe0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s46
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s47
-; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s74
-; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s75
-; GCN-HSA-NEXT:    s_addc_u32 s13, s17, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[30:31], v[16:19]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s54
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s13
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s12
-; GCN-HSA-NEXT:    s_add_u32 s12, s16, 0xc0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s55
-; GCN-HSA-NEXT:    s_addc_u32 s13, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s42
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s43
+; GCN-HSA-NEXT:    s_addc_u32 s39, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s38
+; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s39
+; GCN-HSA-NEXT:    s_add_u32 s38, s16, 16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s46
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s47
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s78
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s79
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s44
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s45
+; GCN-HSA-NEXT:    s_addc_u32 s39, s17, 0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[8:11]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s48
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s14
+; GCN-HSA-NEXT:    s_add_u32 s14, s16, 0xe0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s49
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s76
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s77
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s48
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s49
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s72
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s50
-; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s73
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s51
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s70
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s71
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s15
+; GCN-HSA-NEXT:    s_addc_u32 s15, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v30, s44
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[12:15]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s50
+; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s12
+; GCN-HSA-NEXT:    s_add_u32 s12, s16, 0xc0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s51
+; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s74
+; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s75
+; GCN-HSA-NEXT:    v_mov_b32_e32 v31, s45
+; GCN-HSA-NEXT:    v_mov_b32_e32 v32, s40
+; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s13
+; GCN-HSA-NEXT:    s_addc_u32 s13, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s52
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s53
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s54
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s55
+; GCN-HSA-NEXT:    v_mov_b32_e32 v33, s41
+; GCN-HSA-NEXT:    v_mov_b32_e32 v34, s38
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[30:31], v[16:19]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s42
+; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s15
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s13
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s52
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s53
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[8:11]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s43
+; GCN-HSA-NEXT:    v_mov_b32_e32 v35, s39
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s36
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s37
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s34
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[12:15]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s35
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s30
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s38
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s39
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s31
+; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s14
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[32:33], v[20:23]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s14
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s15
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s12
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[0:3]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[34:35], v[4:7]
@@ -7012,147 +7010,152 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspa
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-VI-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s24, s0, 16
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s26, s1
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s28, s1, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s30, s2, 16
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s34, s3
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s36, s3, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s38, s4, 16
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s40, s5
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s42, s5, 16
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s46, s7
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s48, s7, 16
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s52, s9
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s54, s9, 16
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s58, s11
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s60, s11, 16
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s64, s13
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s66, s13, 16
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s70, s15
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s72, s15, 16
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s30, s1
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s34, s1, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s46, s5, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s48, s6, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s62, s10, 16
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s64, s11
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s80, s15
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s82, s15, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s28, s0, 16
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s44, s5
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[24:25], s[6:7], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s50, s7
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s52, s7, 16
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[26:27], s[8:9], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s54, s8, 16
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s56, s9
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s58, s9, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s78, s14, 16
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[8:9], s[30:31], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[6:7], s[34:35], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[30:31], s[46:47], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[34:35], s[48:49], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[46:47], s[62:63], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[48:49], s[64:65], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[62:63], s[80:81], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[64:65], s[82:83], 0x100000
 ; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[18:19], s[0:1], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[22:23], s[4:5], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[68:69], s[14:15], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s14, s14, 16
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s16
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s17
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[4:5], s[24:25], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[16:17], s[26:27], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[24:25], s[28:29], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[26:27], s[30:31], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[28:29], s[34:35], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[30:31], s[36:37], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[34:35], s[38:39], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[36:37], s[40:41], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[38:39], s[42:43], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[40:41], s[46:47], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[42:43], s[48:49], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[46:47], s[52:53], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[48:49], s[54:55], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[52:53], s[58:59], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[54:55], s[60:61], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[58:59], s[64:65], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[60:61], s[66:67], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[64:65], s[70:71], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[66:67], s[72:73], 0x100000
 ; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[20:21], s[2:3], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s36, s2, 16
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s38, s3
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s40, s3, 16
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[22:23], s[4:5], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s42, s4, 16
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s72, s13
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s74, s13, 16
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[76:77], s[14:15], 0x100000
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, -1
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x100000
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s64
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s65
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s66
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s67
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[62:63], s[12:13], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s12, s12, 16
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s16
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s17
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[4:5], s[28:29], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[28:29], s[44:45], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[44:45], s[58:59], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[58:59], s[78:79], 0x100000
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s62
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s63
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s64
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s65
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[68:69], s[12:13], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s70, s12, 16
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[12:13], s[40:41], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[16:17], s[42:43], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[40:41], s[54:55], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[42:43], s[56:57], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[54:55], s[72:73], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[56:57], s[74:75], 0x100000
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x100000
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s68
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s69
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s14
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s15
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s66, s11, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s76
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s77
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s58
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s59
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[14:15], s[38:39], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[38:39], s[52:53], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[52:53], s[70:71], 0x100000
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[56:57], s[10:11], 0x100000
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s58
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s59
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s60
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s61
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s10, s10, 16
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[60:61], s[10:11], 0x100000
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s54
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s55
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s56
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s57
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[10:11], s[36:37], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[36:37], s[50:51], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[50:51], s[66:67], 0x100000
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x100000
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s62
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s63
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s12
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s13
+; GCN-NOHSA-VI-NEXT:    s_nop 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s68
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s69
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s52
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s53
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[50:51], s[8:9], 0x100000
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s52
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s53
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s54
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s55
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s8, s8, 16
+; GCN-NOHSA-VI-NEXT:    s_nop 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s48
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s49
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s50
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s51
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x100000
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s56
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s57
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s10
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s11
+; GCN-NOHSA-VI-NEXT:    s_nop 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s60
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s61
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s46
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s47
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[44:45], s[6:7], 0x100000
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s46
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s47
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s48
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s49
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s6, s6, 16
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x100000
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s50
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s51
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s8
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s9
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s40
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s41
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s42
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s43
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s42
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s43
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s44
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s45
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s44
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s45
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s6
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s7
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s26
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s27
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s40
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s41
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s36
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s37
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s38
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s39
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s22
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s23
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s24
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s25
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s34
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s35
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s28
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s29
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s30
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s31
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
+; GCN-NOHSA-VI-NEXT:    s_nop 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s22
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s23
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s17
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
+; GCN-NOHSA-VI-NEXT:    s_nop 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s14
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s15
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s12
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s13
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s20
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s21
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s26
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s27
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s10
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s11
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s17
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s24
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s25
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s9
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s6
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s18

diff  --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 481ab8d39647d..c3c04b4729c3c 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -2123,18 +2123,18 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)
 define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_zextload_v16i16_to_v16i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, -1
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s6
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s7
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s2
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s3
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s2
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s3
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s6
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
@@ -2144,20 +2144,20 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v5
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v4
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v10, s2, v1
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, s2, v0
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v14, s2, v3
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v12, s2, v2
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v10, s6, v1
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, s6, v0
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v14, s6, v3
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v12, s6, v2
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v7
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v18, s2, v5
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v16, s2, v4
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v2, s2, v7
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, s2, v6
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:48
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v18, s6, v5
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v16, s6, v4
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v2, s6, v7
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, s6, v6
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i32:
@@ -2362,17 +2362,17 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace
 define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_sextload_v16i16_to_v16i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, -1
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s6
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s7
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s2
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s3
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s2
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s3
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s6
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v11, 16, v1
@@ -2392,10 +2392,10 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(<16 x i32> addrspace
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v17, 16, v6
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v18, v7, 0, 16
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v16, v6, 0, 16
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:48
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i32:
@@ -3591,24 +3591,24 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-LABEL: global_zextload_v64i16_to_v64i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GCN-HSA-NEXT:    s_movk_i32 s14, 0x50
-; GCN-HSA-NEXT:    s_movk_i32 s15, 0x60
-; GCN-HSA-NEXT:    s_movk_i32 s16, 0x70
-; GCN-HSA-NEXT:    s_mov_b32 s17, 0xffff
+; GCN-HSA-NEXT:    s_movk_i32 s6, 0x50
+; GCN-HSA-NEXT:    s_movk_i32 s7, 0x60
+; GCN-HSA-NEXT:    s_movk_i32 s8, 0x70
+; GCN-HSA-NEXT:    s_mov_b32 s9, 0xffff
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT:    s_add_u32 s4, s2, s14
+; GCN-HSA-NEXT:    s_add_u32 s4, s2, s6
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-HSA-NEXT:    s_add_u32 s4, s2, s15
+; GCN-HSA-NEXT:    s_add_u32 s4, s2, s7
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
-; GCN-HSA-NEXT:    s_add_u32 s4, s2, s16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-HSA-NEXT:    s_add_u32 s4, s2, s8
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s5
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[2:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s4
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
@@ -3616,19 +3616,19 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s4
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 32
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
-; GCN-HSA-NEXT:    s_add_u32 s6, s2, 48
+; GCN-HSA-NEXT:    s_add_u32 s10, s2, 48
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v33, s3
-; GCN-HSA-NEXT:    s_addc_u32 s7, s3, 0
+; GCN-HSA-NEXT:    s_addc_u32 s11, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v32, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s2, 64
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s3, 0
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s11
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v29, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s10
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s2
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[16:19], v[16:17]
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[20:23], v[20:21]
@@ -3638,61 +3638,61 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xe0
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    s_add_u32 s6, s0, 0xf0
-; GCN-HSA-NEXT:    s_addc_u32 s7, s1, 0
-; GCN-HSA-NEXT:    s_add_u32 s8, s0, 0xc0
-; GCN-HSA-NEXT:    s_addc_u32 s9, s1, 0
-; GCN-HSA-NEXT:    s_add_u32 s10, s0, 0xd0
+; GCN-HSA-NEXT:    s_add_u32 s10, s0, 0xf0
 ; GCN-HSA-NEXT:    s_addc_u32 s11, s1, 0
-; GCN-HSA-NEXT:    s_add_u32 s12, s0, 0xa0
+; GCN-HSA-NEXT:    s_add_u32 s12, s0, 0xc0
 ; GCN-HSA-NEXT:    s_addc_u32 s13, s1, 0
+; GCN-HSA-NEXT:    s_add_u32 s14, s0, 0xd0
+; GCN-HSA-NEXT:    s_addc_u32 s15, s1, 0
+; GCN-HSA-NEXT:    s_add_u32 s16, s0, 0xa0
+; GCN-HSA-NEXT:    s_addc_u32 s17, s1, 0
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(7)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v27, 16, v1
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v25, 16, v0
-; GCN-HSA-NEXT:    v_and_b32_e32 v26, s17, v1
-; GCN-HSA-NEXT:    v_and_b32_e32 v24, s17, v0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s12
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s13
-; GCN-HSA-NEXT:    s_add_u32 s12, s0, 0xb0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[24:27]
-; GCN-HSA-NEXT:    s_addc_u32 s13, s1, 0
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v27, 16, v5
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v25, 16, v4
+; GCN-HSA-NEXT:    v_and_b32_e32 v26, s9, v5
+; GCN-HSA-NEXT:    v_and_b32_e32 v24, s9, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s17
+; GCN-HSA-NEXT:    s_add_u32 s16, s0, 0xb0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[24:27]
+; GCN-HSA-NEXT:    s_addc_u32 s17, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s16
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v27, 16, v7
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v25, 16, v6
+; GCN-HSA-NEXT:    v_and_b32_e32 v26, s9, v7
+; GCN-HSA-NEXT:    v_and_b32_e32 v24, s9, v6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s17
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[24:27]
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(8)
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GCN-HSA-NEXT:    v_and_b32_e32 v6, s9, v1
+; GCN-HSA-NEXT:    v_and_b32_e32 v4, s9, v0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s12
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v27, 16, v3
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v25, 16, v2
-; GCN-HSA-NEXT:    v_and_b32_e32 v26, s17, v3
-; GCN-HSA-NEXT:    v_and_b32_e32 v24, s17, v2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s13
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[24:27]
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(8)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, s17, v5
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, s17, v4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s9
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s10
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v7
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, s17, v7
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, s17, v6
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s11
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s14
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GCN-HSA-NEXT:    v_and_b32_e32 v5, s9, v3
+; GCN-HSA-NEXT:    v_and_b32_e32 v3, s9, v2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s15
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(9)
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(9)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v9
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, s17, v9
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, s17, v8
+; GCN-HSA-NEXT:    v_and_b32_e32 v2, s9, v9
+; GCN-HSA-NEXT:    v_and_b32_e32 v0, s9, v8
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s10
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s1
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v11
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v10
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, s17, v11
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, s17, v10
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s7
+; GCN-HSA-NEXT:    v_and_b32_e32 v2, s9, v11
+; GCN-HSA-NEXT:    v_and_b32_e32 v0, s9, v10
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s11
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s5
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x80
@@ -3701,12 +3701,12 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(6)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v33
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v32
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, s17, v33
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, s17, v32
+; GCN-HSA-NEXT:    v_and_b32_e32 v2, s9, v33
+; GCN-HSA-NEXT:    v_and_b32_e32 v0, s9, v32
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v35
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v34
-; GCN-HSA-NEXT:    v_and_b32_e32 v6, s17, v35
-; GCN-HSA-NEXT:    v_and_b32_e32 v4, s17, v34
+; GCN-HSA-NEXT:    v_and_b32_e32 v6, s9, v35
+; GCN-HSA-NEXT:    v_and_b32_e32 v4, s9, v34
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[0:3]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[4:7]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
@@ -3717,50 +3717,50 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v29
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v28
-; GCN-HSA-NEXT:    v_and_b32_e32 v10, s17, v29
-; GCN-HSA-NEXT:    v_and_b32_e32 v8, s17, v28
+; GCN-HSA-NEXT:    v_and_b32_e32 v10, s9, v29
+; GCN-HSA-NEXT:    v_and_b32_e32 v8, s9, v28
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, s15
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, s7
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v31
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v30
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, s17, v31
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, s17, v30
+; GCN-HSA-NEXT:    v_and_b32_e32 v2, s9, v31
+; GCN-HSA-NEXT:    v_and_b32_e32 v0, s9, v30
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, s16
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, s8
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v21
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v20
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, s17, v21
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, s17, v20
+; GCN-HSA-NEXT:    v_and_b32_e32 v2, s9, v21
+; GCN-HSA-NEXT:    v_and_b32_e32 v0, s9, v20
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 64
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v23
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v22
-; GCN-HSA-NEXT:    v_and_b32_e32 v6, s17, v23
-; GCN-HSA-NEXT:    v_and_b32_e32 v4, s17, v22
+; GCN-HSA-NEXT:    v_and_b32_e32 v6, s9, v23
+; GCN-HSA-NEXT:    v_and_b32_e32 v4, s9, v22
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v15
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v14
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, s17, v15
+; GCN-HSA-NEXT:    v_and_b32_e32 v2, s9, v15
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v13
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v12
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, s17, v14
-; GCN-HSA-NEXT:    v_and_b32_e32 v6, s17, v13
-; GCN-HSA-NEXT:    v_and_b32_e32 v4, s17, v12
+; GCN-HSA-NEXT:    v_and_b32_e32 v0, s9, v14
+; GCN-HSA-NEXT:    v_and_b32_e32 v6, s9, v13
+; GCN-HSA-NEXT:    v_and_b32_e32 v4, s9, v12
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v15, 16, v17
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v13, 16, v16
-; GCN-HSA-NEXT:    v_and_b32_e32 v14, s17, v17
-; GCN-HSA-NEXT:    v_and_b32_e32 v12, s17, v16
+; GCN-HSA-NEXT:    v_and_b32_e32 v14, s9, v17
+; GCN-HSA-NEXT:    v_and_b32_e32 v12, s9, v16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, s14
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, s6
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v19
@@ -3768,8 +3768,8 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 32
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v18
-; GCN-HSA-NEXT:    v_and_b32_e32 v10, s17, v19
-; GCN-HSA-NEXT:    v_and_b32_e32 v8, s17, v18
+; GCN-HSA-NEXT:    v_and_b32_e32 v10, s9, v19
+; GCN-HSA-NEXT:    v_and_b32_e32 v8, s9, v18
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
 ; GCN-HSA-NEXT:    s_add_u32 s0, s0, 48
@@ -3798,79 +3798,79 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s6
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s7
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s3
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:96
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:80
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:64
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:16
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:48
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:96
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:80
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:64
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:16
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:32
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:48
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, 0xffff
 ; GCN-NOHSA-VI-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v31, 16, v15
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v30, s0, v15
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v29, 16, v14
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v28, s0, v14
-; GCN-NOHSA-VI-NEXT:    buffer_store_dword v28, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v19
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v2, s0, v19
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v18
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, s0, v18
+; GCN-NOHSA-VI-NEXT:    buffer_store_dword v0, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT:    buffer_store_dword v29, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT:    buffer_store_dword v30, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT:    buffer_store_dword v31, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT:    buffer_store_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT:    buffer_store_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT:    buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[56:59], off, s[8:11], 0 offset:112
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v31, 16, v13
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v30, s0, v13
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v29, 16, v12
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v28, s0, v12
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v35, 16, v19
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v34, s0, v19
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v18
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v32, s0, v18
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v19, 16, v17
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v18, s0, v17
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v17, 16, v16
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v16, s0, v16
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v39, 16, v23
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v38, s0, v23
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v37, 16, v22
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v36, s0, v22
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v35, 16, v23
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v34, s0, v23
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v22
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v32, s0, v22
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v23, 16, v21
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v22, s0, v21
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v21, 16, v20
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v20, s0, v20
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v43, 16, v27
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v42, s0, v27
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v41, 16, v26
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v40, s0, v26
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v39, 16, v27
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v38, s0, v27
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v37, 16, v26
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v36, s0, v26
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v27, 16, v25
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v26, s0, v25
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v25, 16, v24
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v24, s0, v24
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v47, 16, v11
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v46, s0, v11
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v45, 16, v10
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v44, s0, v10
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v15, 16, v9
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v14, s0, v9
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v13, 16, v8
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v12, s0, v8
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v51, 16, v7
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v50, s0, v7
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v49, 16, v6
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v48, s0, v6
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v10, s0, v5
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v8, s0, v4
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v55, 16, v3
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v54, s0, v3
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v53, 16, v2
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v52, s0, v2
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v6, s0, v1
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v4, s0, v0
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v43, 16, v31
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v42, s0, v31
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v41, 16, v30
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v40, s0, v30
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v31, 16, v29
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v30, s0, v29
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v29, 16, v28
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v28, s0, v28
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v47, 16, v15
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v46, s0, v15
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v45, 16, v14
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v44, s0, v14
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v15, 16, v13
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v14, s0, v13
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v13, 16, v12
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v12, s0, v12
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v51, 16, v11
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v50, s0, v11
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v49, 16, v10
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v48, s0, v10
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v11, 16, v9
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v10, s0, v9
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v8, s0, v8
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v55, 16, v7
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v54, s0, v7
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v53, 16, v6
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v52, s0, v6
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v6, s0, v5
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v4, s0, v4
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v61, 16, v59
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v60, s0, v59
@@ -3889,13 +3889,13 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:176
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:144
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:112
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:64
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:80
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dword v0, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload
@@ -3919,20 +3919,20 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T57.XYZW, T59.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T52.XYZW, T58.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T54.XYZW, T56.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T55.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T55.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T53.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T48.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T39.XYZW, T48.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T47.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T39.XYZW, T46.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T46.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T44.X, 0
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T43.X, 0
-; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T41.X, 1
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T41.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    Fetch clause starting at 22:
 ; EG-NEXT:     VTX_READ_128 T36.XYZW, T35.X, 0, #1
-; EG-NEXT:     VTX_READ_128 T37.XYZW, T35.X, 48, #1
-; EG-NEXT:     VTX_READ_128 T38.XYZW, T35.X, 32, #1
-; EG-NEXT:     VTX_READ_128 T39.XYZW, T35.X, 16, #1
+; EG-NEXT:     VTX_READ_128 T38.XYZW, T35.X, 48, #1
+; EG-NEXT:     VTX_READ_128 T39.XYZW, T35.X, 32, #1
+; EG-NEXT:     VTX_READ_128 T40.XYZW, T35.X, 16, #1
 ; EG-NEXT:    Fetch clause starting at 30:
 ; EG-NEXT:     VTX_READ_128 T49.XYZW, T35.X, 112, #1
 ; EG-NEXT:     VTX_READ_128 T50.XYZW, T35.X, 96, #1
@@ -3941,75 +3941,75 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; EG-NEXT:    ALU clause starting at 38:
 ; EG-NEXT:     MOV * T35.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 39:
-; EG-NEXT:     LSHR * T40.W, T36.W, literal.x,
+; EG-NEXT:     LSHR * T37.W, T36.W, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT * T40.Z, T36.W, literal.x,
+; EG-NEXT:     AND_INT * T37.Z, T36.W, literal.x,
 ; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     LSHR T40.Y, T36.Z, literal.x,
+; EG-NEXT:     LSHR T37.Y, T36.Z, literal.x,
 ; EG-NEXT:     LSHR * T36.W, T36.Y, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T40.X, T36.Z, literal.x,
+; EG-NEXT:     AND_INT T37.X, T36.Z, literal.x,
 ; EG-NEXT:     AND_INT T36.Z, T36.Y, literal.x,
 ; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
 ; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
 ; EG-NEXT:     LSHR T41.X, PV.W, literal.x,
 ; EG-NEXT:     LSHR T36.Y, T36.X, literal.y,
-; EG-NEXT:     LSHR T42.W, T39.W, literal.y,
+; EG-NEXT:     LSHR T42.W, T40.W, literal.y,
 ; EG-NEXT:     AND_INT * T36.X, T36.X, literal.z,
 ; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
 ; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     AND_INT * T42.Z, T39.W, literal.x,
+; EG-NEXT:     AND_INT * T42.Z, T40.W, literal.x,
 ; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
 ; EG-NEXT:     LSHR T43.X, KC0[2].Y, literal.x,
-; EG-NEXT:     LSHR T42.Y, T39.Z, literal.y,
-; EG-NEXT:     LSHR T39.W, T39.Y, literal.y,
-; EG-NEXT:     AND_INT * T42.X, T39.Z, literal.z,
+; EG-NEXT:     LSHR T42.Y, T40.Z, literal.y,
+; EG-NEXT:     LSHR T40.W, T40.Y, literal.y,
+; EG-NEXT:     AND_INT * T42.X, T40.Z, literal.z,
 ; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
 ; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T39.Z, T39.Y, literal.x,
+; EG-NEXT:     AND_INT T40.Z, T40.Y, literal.x,
 ; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
 ; EG-NEXT:    65535(9.183409e-41), 48(6.726233e-44)
 ; EG-NEXT:     LSHR T44.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T39.Y, T39.X, literal.y,
-; EG-NEXT:     LSHR T45.W, T38.W, literal.y,
-; EG-NEXT:     AND_INT * T39.X, T39.X, literal.z,
+; EG-NEXT:     LSHR T40.Y, T40.X, literal.y,
+; EG-NEXT:     LSHR T45.W, T39.W, literal.y,
+; EG-NEXT:     AND_INT * T40.X, T40.X, literal.z,
 ; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
 ; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T45.Z, T38.W, literal.x,
+; EG-NEXT:     AND_INT T45.Z, T39.W, literal.x,
 ; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
 ; EG-NEXT:    65535(9.183409e-41), 32(4.484155e-44)
 ; EG-NEXT:     LSHR T46.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T45.Y, T38.Z, literal.y,
-; EG-NEXT:     LSHR T38.W, T38.Y, literal.y,
-; EG-NEXT:     AND_INT * T45.X, T38.Z, literal.z,
+; EG-NEXT:     LSHR T45.Y, T39.Z, literal.y,
+; EG-NEXT:     LSHR T39.W, T39.Y, literal.y,
+; EG-NEXT:     AND_INT * T45.X, T39.Z, literal.z,
 ; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
 ; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T38.Z, T38.Y, literal.x,
+; EG-NEXT:     AND_INT T39.Z, T39.Y, literal.x,
 ; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
 ; EG-NEXT:    65535(9.183409e-41), 80(1.121039e-43)
 ; EG-NEXT:     LSHR T47.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T38.Y, T38.X, literal.y,
-; EG-NEXT:     AND_INT * T38.X, T38.X, literal.z,
+; EG-NEXT:     LSHR T39.Y, T39.X, literal.y,
+; EG-NEXT:     AND_INT * T39.X, T39.X, literal.z,
 ; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
 ; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
 ; EG-NEXT:     ADD_INT T0.W, KC0[2].Y, literal.x,
-; EG-NEXT:     LSHR * T35.W, T37.W, literal.y,
+; EG-NEXT:     LSHR * T35.W, T38.W, literal.y,
 ; EG-NEXT:    64(8.968310e-44), 16(2.242078e-44)
 ; EG-NEXT:     LSHR T48.X, PV.W, literal.x,
-; EG-NEXT:     AND_INT * T35.Z, T37.W, literal.y,
+; EG-NEXT:     AND_INT * T35.Z, T38.W, literal.y,
 ; EG-NEXT:    2(2.802597e-45), 65535(9.183409e-41)
 ; EG-NEXT:    ALU clause starting at 96:
-; EG-NEXT:     LSHR T35.Y, T37.Z, literal.x,
-; EG-NEXT:     LSHR * T37.W, T37.Y, literal.x,
+; EG-NEXT:     LSHR T35.Y, T38.Z, literal.x,
+; EG-NEXT:     LSHR * T38.W, T38.Y, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT:     AND_INT T35.X, T37.Z, literal.x,
-; EG-NEXT:     AND_INT T37.Z, T37.Y, literal.x,
+; EG-NEXT:     AND_INT T35.X, T38.Z, literal.x,
+; EG-NEXT:     AND_INT T38.Z, T38.Y, literal.x,
 ; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
 ; EG-NEXT:    65535(9.183409e-41), 112(1.569454e-43)
 ; EG-NEXT:     LSHR T53.X, PV.W, literal.x,
-; EG-NEXT:     LSHR T37.Y, T37.X, literal.y,
+; EG-NEXT:     LSHR T38.Y, T38.X, literal.y,
 ; EG-NEXT:     LSHR T54.W, T52.W, literal.y,
-; EG-NEXT:     AND_INT * T37.X, T37.X, literal.z,
+; EG-NEXT:     AND_INT * T38.X, T38.X, literal.z,
 ; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
 ; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
 ; EG-NEXT:     AND_INT T54.Z, T52.W, literal.x,
@@ -4279,102 +4279,102 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s6
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[32:35], off, s[8:11], 0 offset:112
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:96
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:80
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:64
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:112
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:96
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:80
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:64
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[32:35], off, s[8:11], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[36:39], off, s[8:11], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v3, 16, v19
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v1, 16, v18
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v2, v19, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v0, v18, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v3, 16, v11
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v1, 16, v10
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v2, v11, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v0, v10, 0, 16
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v39, 16, v17
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v37, 16, v16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v38, v17, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v36, v16, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v19, 16, v23
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v17, 16, v22
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v18, v23, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v16, v22, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v43, 16, v21
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v41, 16, v20
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v42, v21, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v40, v20, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v23, 16, v27
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v21, 16, v26
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v22, v27, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v20, v26, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v47, 16, v25
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v45, 16, v24
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v46, v25, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v44, v24, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v27, 16, v31
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v25, 16, v30
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v26, v31, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v24, v30, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v51, 16, v29
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v49, 16, v28
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v50, v29, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v48, v28, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v31, 16, v15
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v29, 16, v14
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v30, v15, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v28, v14, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v55, 16, v13
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v53, 16, v12
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v54, v13, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v52, v12, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v15, 16, v11
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v13, 16, v10
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v14, v11, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v12, v10, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v59, 16, v9
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v57, 16, v8
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v58, v9, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v56, v8, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v11, 16, v7
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v9, 16, v6
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v10, v7, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v8, v6, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v63, 16, v5
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v61, 16, v4
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v62, v5, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v60, v4, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v7, 16, v35
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v5, 16, v34
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v6, v35, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v4, v34, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v7, 16, v9
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v5, 16, v8
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v6, v9, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v4, v8, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v11, 16, v31
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v9, 16, v30
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v10, v31, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v8, v30, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v43, 16, v29
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v41, 16, v28
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v42, v29, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v40, v28, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v31, 16, v35
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v29, 16, v34
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v30, v35, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v28, v34, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v47, 16, v33
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v45, 16, v32
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v46, v33, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v44, v32, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v35, 16, v39
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v33, 16, v38
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v34, v39, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v32, v38, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v51, 16, v37
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v49, 16, v36
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v50, v37, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v48, v36, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v39, 16, v27
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v37, 16, v26
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v38, v27, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v36, v26, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v55, 16, v25
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v53, 16, v24
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v54, v25, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v52, v24, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v26, 16, v23
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v24, 16, v22
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v25, v23, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v23, v22, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v59, 16, v21
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v57, 16, v20
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v58, v21, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v56, v20, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v22, 16, v19
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v20, 16, v18
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v21, v19, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v19, v18, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v63, 16, v17
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v61, 16, v16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v62, v17, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v60, v16, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v18, 16, v15
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v16, 16, v14
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v17, v15, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v15, v14, 0, 16
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v3, 16, v33
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v1, 16, v32
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v2, v33, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v0, v32, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v3, 16, v13
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v1, 16, v12
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v2, v13, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v0, v12, 0, 16
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:240
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:240
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:192
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:208
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:160
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:176
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:176
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:128
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:144
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:144
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:96
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:64
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:80
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[36:39], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
@@ -4386,28 +4386,28 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-LABEL: global_sextload_v64i16_to_v64i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GCN-HSA-NEXT:    s_movk_i32 s8, 0x70
-; GCN-HSA-NEXT:    s_movk_i32 s9, 0x60
-; GCN-HSA-NEXT:    s_movk_i32 s10, 0x50
+; GCN-HSA-NEXT:    s_movk_i32 s9, 0x70
+; GCN-HSA-NEXT:    s_movk_i32 s10, 0x60
+; GCN-HSA-NEXT:    s_movk_i32 s8, 0x50
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT:    s_add_u32 s4, s2, s8
+; GCN-HSA-NEXT:    s_add_u32 s4, s2, s9
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-HSA-NEXT:    s_add_u32 s4, s2, s9
+; GCN-HSA-NEXT:    s_add_u32 s4, s2, s10
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[0:1]
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[2:3]
+; GCN-HSA-NEXT:    s_add_u32 s4, s2, s8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
+; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
-; GCN-HSA-NEXT:    s_add_u32 s4, s2, s10
-; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s4
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 64
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    s_add_u32 s6, s2, 48
@@ -4433,79 +4433,79 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xe0
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(7)
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v27, 16, v1
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v25, 16, v0
-; GCN-HSA-NEXT:    v_bfe_i32 v26, v1, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v24, v0, 0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v27, 16, v13
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v25, 16, v12
+; GCN-HSA-NEXT:    v_bfe_i32 v26, v13, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v24, v12, 0, 16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xf0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[24:27]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[24:27]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xc0
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v27, 16, v3
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v25, 16, v2
-; GCN-HSA-NEXT:    v_bfe_i32 v26, v3, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v24, v2, 0, 16
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v27, 16, v15
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v25, 16, v14
+; GCN-HSA-NEXT:    v_bfe_i32 v26, v15, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v24, v14, 0, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[24:27]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[24:27]
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(8)
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 16, v5
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 16, v4
-; GCN-HSA-NEXT:    v_bfe_i32 v2, v5, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v0, v4, 0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v15, 16, v9
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v13, 16, v8
+; GCN-HSA-NEXT:    v_bfe_i32 v14, v9, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v12, v8, 0, 16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xd0
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 16, v7
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 16, v6
-; GCN-HSA-NEXT:    v_bfe_i32 v2, v7, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v0, v6, 0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xa0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[12:15]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v14, 16, v11
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v12, 16, v10
+; GCN-HSA-NEXT:    v_bfe_i32 v13, v11, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v11, v10, 0, 16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[11:14]
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(8)
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 16, v13
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 16, v12
-; GCN-HSA-NEXT:    v_bfe_i32 v2, v13, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v0, v12, 0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s0
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 16, v4
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v11, 16, v5
+; GCN-HSA-NEXT:    v_bfe_i32 v10, v5, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v8, v4, 0, 16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xa0
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v15, 16, v7
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v13, 16, v6
+; GCN-HSA-NEXT:    v_bfe_i32 v14, v7, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v12, v6, 0, 16
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[8:11]
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v7, 16, v1
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 16, v0
+; GCN-HSA-NEXT:    v_bfe_i32 v6, v1, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v4, v0, 0, 16
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v11, 16, v3
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 16, v2
+; GCN-HSA-NEXT:    v_bfe_i32 v10, v3, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v8, v2, 0, 16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s3
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 16, v9
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 16, v8
-; GCN-HSA-NEXT:    v_bfe_i32 v2, v9, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v0, v8, 0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s5
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v7, 16, v15
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 16, v14
-; GCN-HSA-NEXT:    v_bfe_i32 v6, v15, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v4, v14, 0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xb0
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[0:3]
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v14, 16, v11
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[12:15]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x80
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v12, 16, v10
-; GCN-HSA-NEXT:    v_bfe_i32 v13, v11, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v11, v10, 0, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x90
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[11:14]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(11)
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 16, v17
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 16, v16
@@ -4515,7 +4515,7 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, s9
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, s10
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 16, v19
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 16, v18
 ; GCN-HSA-NEXT:    v_bfe_i32 v2, v19, 0, 16
@@ -4524,7 +4524,7 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, s8
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, s9
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
@@ -4537,7 +4537,7 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    v_bfe_i32 v10, v23, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v8, v22, 0, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, s10
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, s8
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(12)
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v15, 16, v29
@@ -4594,81 +4594,87 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s6
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s7
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s3
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:96
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:80
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:64
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:16
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:48
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:96
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:80
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:64
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:16
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:32
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[32:35], off, s[8:11], 0 offset:48
 ; GCN-NOHSA-VI-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(6)
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v59, 16, v1
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v57, 16, v0
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v59, 16, v13
+; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(5)
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v55, 16, v17
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(4)
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v51, 16, v9
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v51, 16, v21
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v31, 16, v15
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v29, 16, v14
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v30, v15, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v28, v14, 0, 16
-; GCN-NOHSA-VI-NEXT:    buffer_store_dword v28, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v3, 16, v11
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v1, 16, v10
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v2, v11, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v0, v10, 0, 16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dword v0, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT:    buffer_store_dword v29, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT:    buffer_store_dword v30, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT:    buffer_store_dword v31, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT:    buffer_store_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT:    buffer_store_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT:    buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v3, 16, v9
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v1, 16, v8
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v2, v9, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v0, v8, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v11, 16, v27
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v9, 16, v26
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v10, v27, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v8, v26, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v39, 16, v25
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v37, 16, v24
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v38, v25, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v36, v24, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v27, 16, v31
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v25, 16, v30
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v26, v31, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v24, v30, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v43, 16, v29
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v41, 16, v28
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v42, v29, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v40, v28, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v31, 16, v35
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v29, 16, v34
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v30, v35, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v28, v34, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v47, 16, v33
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v45, 16, v32
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v46, v33, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v44, v32, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v35, 16, v23
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v33, 16, v22
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v34, v23, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v32, v22, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v49, 16, v20
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v50, v21, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v48, v20, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v22, 16, v19
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v20, 16, v18
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v21, v19, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v19, v18, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v53, 16, v16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v54, v17, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v52, v16, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v18, 16, v15
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v16, 16, v14
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v17, v15, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v15, v14, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v57, 16, v12
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v58, v13, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v56, v12, 0, 16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dword v0, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NOHSA-VI-NEXT:    buffer_store_dword v1, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT:    buffer_store_dword v2, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT:    buffer_store_dword v3, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[60:63], off, s[8:11], 0 offset:112
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v31, 16, v19
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v29, 16, v18
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v30, v19, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v28, v18, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v39, 16, v17
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v37, 16, v16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v38, v17, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v36, v16, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v19, 16, v23
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v17, 16, v22
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v18, v23, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v16, v22, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v43, 16, v21
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v41, 16, v20
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v42, v21, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v40, v20, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v23, 16, v27
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v21, 16, v26
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v22, v27, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v20, v26, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v47, 16, v25
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v45, 16, v24
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v46, v25, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v44, v24, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v27, 16, v11
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v25, 16, v10
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v26, v11, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v24, v10, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v49, 16, v8
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v50, v9, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v48, v8, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v11, 16, v3
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v9, 16, v2
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v10, v3, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v8, v2, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v58, v1, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v56, v0, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v35, 16, v13
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v33, 16, v12
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v34, v13, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v32, v12, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v15, 16, v7
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v13, 16, v6
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v14, v7, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v12, v6, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v55, 16, v5
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v53, 16, v4
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v54, v5, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v52, v4, 0, 16
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v3, 16, v61
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v1, 16, v60
@@ -4681,18 +4687,23 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:240
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:192
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:208
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:160
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:176
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:176
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:128
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:144
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:144
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:96
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:112
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:64
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:32
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
+; GCN-NOHSA-VI-NEXT:    buffer_load_dword v0, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT:    buffer_load_dword v1, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT:    buffer_load_dword v2, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT:    buffer_load_dword v3, off, s[88:91], 0 offset:32 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dword v0, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload
@@ -4914,9 +4925,9 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; CM-NEXT:    ALU 82, @57, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    ALU 72, @140, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T65, T66.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T36, T37.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T35, T36.X
 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T64, T56.X
-; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T35, T55.X
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T37, T55.X
 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T63, T54.X
 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T45, T53.X
 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T62, T52.X
@@ -4932,17 +4943,17 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    PAD
 ; CM-NEXT:    Fetch clause starting at 24:
-; CM-NEXT:     VTX_READ_128 T36.XYZW, T35.X, 16, #1
-; CM-NEXT:     VTX_READ_128 T37.XYZW, T35.X, 0, #1
+; CM-NEXT:     VTX_READ_128 T35.XYZW, T37.X, 16, #1
+; CM-NEXT:     VTX_READ_128 T36.XYZW, T37.X, 0, #1
 ; CM-NEXT:    Fetch clause starting at 28:
-; CM-NEXT:     VTX_READ_128 T41.XYZW, T35.X, 112, #1
-; CM-NEXT:     VTX_READ_128 T42.XYZW, T35.X, 96, #1
-; CM-NEXT:     VTX_READ_128 T43.XYZW, T35.X, 80, #1
-; CM-NEXT:     VTX_READ_128 T44.XYZW, T35.X, 64, #1
-; CM-NEXT:     VTX_READ_128 T45.XYZW, T35.X, 48, #1
-; CM-NEXT:     VTX_READ_128 T35.XYZW, T35.X, 32, #1
+; CM-NEXT:     VTX_READ_128 T41.XYZW, T37.X, 112, #1
+; CM-NEXT:     VTX_READ_128 T42.XYZW, T37.X, 96, #1
+; CM-NEXT:     VTX_READ_128 T43.XYZW, T37.X, 80, #1
+; CM-NEXT:     VTX_READ_128 T44.XYZW, T37.X, 64, #1
+; CM-NEXT:     VTX_READ_128 T45.XYZW, T37.X, 48, #1
+; CM-NEXT:     VTX_READ_128 T37.XYZW, T37.X, 32, #1
 ; CM-NEXT:    ALU clause starting at 40:
-; CM-NEXT:     MOV * T35.X, KC0[2].Z,
+; CM-NEXT:     MOV * T37.X, KC0[2].Z,
 ; CM-NEXT:    ALU clause starting at 41:
 ; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
 ; CM-NEXT:    224(3.138909e-43), 0(0.000000e+00)
@@ -4950,34 +4961,34 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
 ; CM-NEXT:    2(2.802597e-45), 240(3.363116e-43)
 ; CM-NEXT:     LSHR T39.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T0.Y, T37.Z, literal.y,
-; CM-NEXT:     LSHR T0.Z, T37.W, literal.y,
+; CM-NEXT:     LSHR T0.Y, T36.Z, literal.y,
+; CM-NEXT:     LSHR T0.Z, T36.W, literal.y,
 ; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
 ; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
 ; CM-NEXT:    192(2.690493e-43), 0(0.000000e+00)
 ; CM-NEXT:     LSHR T40.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T1.Y, T37.Y, literal.y,
-; CM-NEXT:     LSHR T1.Z, T36.Z, literal.y,
-; CM-NEXT:     LSHR * T0.W, T36.W, literal.y,
+; CM-NEXT:     LSHR T1.Y, T36.Y, literal.y,
+; CM-NEXT:     LSHR T1.Z, T35.Z, literal.y,
+; CM-NEXT:     LSHR * T0.W, T35.W, literal.y,
 ; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
 ; CM-NEXT:    ALU clause starting at 57:
-; CM-NEXT:     LSHR T2.Z, T36.X, literal.x,
+; CM-NEXT:     LSHR T2.Z, T35.X, literal.x,
 ; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
 ; CM-NEXT:    16(2.242078e-44), 208(2.914701e-43)
 ; CM-NEXT:     LSHR T46.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T2.Y, T36.Y, literal.y,
-; CM-NEXT:     LSHR T3.Z, T35.Z, literal.y,
+; CM-NEXT:     LSHR T2.Y, T35.Y, literal.y,
+; CM-NEXT:     LSHR T3.Z, T37.Z, literal.y,
 ; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
 ; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
 ; CM-NEXT:    160(2.242078e-43), 0(0.000000e+00)
 ; CM-NEXT:     LSHR T47.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T3.Y, T35.W, literal.y,
-; CM-NEXT:     LSHR T4.Z, T35.X, literal.y,
+; CM-NEXT:     LSHR T3.Y, T37.W, literal.y,
+; CM-NEXT:     LSHR T4.Z, T37.X, literal.y,
 ; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
 ; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
 ; CM-NEXT:    176(2.466285e-43), 0(0.000000e+00)
 ; CM-NEXT:     LSHR T48.X, PV.W, literal.x,
-; CM-NEXT:     LSHR T4.Y, T35.Y, literal.y,
+; CM-NEXT:     LSHR T4.Y, T37.Y, literal.y,
 ; CM-NEXT:     LSHR T5.Z, T45.Z, literal.y,
 ; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
 ; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
@@ -5077,41 +5088,41 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; CM-NEXT:     BFE_INT T62.X, T45.Z, 0.0, literal.x,
 ; CM-NEXT:     BFE_INT T61.Y, T7.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT:     BFE_INT T45.Z, T35.Y, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T45.Z, T37.Y, 0.0, literal.x,
 ; CM-NEXT:     BFE_INT * T44.W, T6.Y, 0.0, literal.x, BS:VEC_120/SCL_212
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T45.X, T35.X, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T45.X, T37.X, 0.0, literal.x,
 ; CM-NEXT:     BFE_INT T44.Y, T6.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T63.Z, T35.W, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T63.Z, T37.W, 0.0, literal.x,
 ; CM-NEXT:     BFE_INT * T62.W, T5.Y, 0.0, literal.x,
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T63.X, T35.Z, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T63.X, T37.Z, 0.0, literal.x,
 ; CM-NEXT:     BFE_INT T62.Y, T5.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT:     BFE_INT T35.Z, T36.Y, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T37.Z, T35.Y, 0.0, literal.x,
 ; CM-NEXT:     BFE_INT * T45.W, T4.Y, 0.0, literal.x, BS:VEC_120/SCL_212
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T35.X, T36.X, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T37.X, T35.X, 0.0, literal.x,
 ; CM-NEXT:     BFE_INT T45.Y, T4.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T64.Z, T36.W, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T64.Z, T35.W, 0.0, literal.x,
 ; CM-NEXT:     BFE_INT * T63.W, T3.Y, 0.0, literal.x,
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T64.X, T36.Z, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T64.X, T35.Z, 0.0, literal.x,
 ; CM-NEXT:     BFE_INT T63.Y, T3.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT:     BFE_INT T36.Z, T37.Y, 0.0, literal.x,
-; CM-NEXT:     BFE_INT * T35.W, T2.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT:     BFE_INT T35.Z, T36.Y, 0.0, literal.x,
+; CM-NEXT:     BFE_INT * T37.W, T2.Y, 0.0, literal.x, BS:VEC_120/SCL_212
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T36.X, T37.X, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T35.Y, T2.Z, 0.0, literal.x,
-; CM-NEXT:     BFE_INT T65.Z, T37.W, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T35.X, T36.X, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T37.Y, T2.Z, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T65.Z, T36.W, 0.0, literal.x,
 ; CM-NEXT:     BFE_INT * T64.W, T0.W, 0.0, literal.x, BS:VEC_120/SCL_212
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     BFE_INT T65.X, T37.Z, 0.0, literal.x,
+; CM-NEXT:     BFE_INT T65.X, T36.Z, 0.0, literal.x,
 ; CM-NEXT:     BFE_INT T64.Y, T1.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT:     LSHR T1.Z, T37.X, literal.x,
-; CM-NEXT:     BFE_INT * T36.W, T1.Y, 0.0, literal.x,
+; CM-NEXT:     LSHR T1.Z, T36.X, literal.x,
+; CM-NEXT:     BFE_INT * T35.W, T1.Y, 0.0, literal.x,
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT:     LSHR T37.X, KC0[2].Y, literal.x,
-; CM-NEXT:     BFE_INT T36.Y, PV.Z, 0.0, literal.y,
+; CM-NEXT:     LSHR T36.X, KC0[2].Y, literal.x,
+; CM-NEXT:     BFE_INT T35.Y, PV.Z, 0.0, literal.y,
 ; CM-NEXT:     ADD_INT T1.Z, KC0[2].Y, literal.y,
 ; CM-NEXT:     BFE_INT * T65.W, T0.Z, 0.0, literal.y,
 ; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
@@ -6042,15 +6053,15 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)
 define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_zextload_v8i16_to_v8i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, -1
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s12, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s6
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s7
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s2
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s3
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s2
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s3
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s6
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, 0
@@ -6060,8 +6071,8 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, v9
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, v9
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, v9
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v1
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
@@ -6071,10 +6082,10 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v12, s12, v2
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v16, s12, v1
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, s12, v3
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:48
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i64:
@@ -6935,10 +6946,10 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(<16 x i64> addrspace
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    s_add_u32 s2, s2, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s3, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[2:3]
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 48
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s3
@@ -6956,63 +6967,63 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(<16 x i64> addrspace
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s1
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s0
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
-; GCN-HSA-NEXT:    v_ashr_i64 v[10:11], v[0:1], 48
-; GCN-HSA-NEXT:    v_bfe_i32 v8, v1, 0, 16
+; GCN-HSA-NEXT:    v_ashr_i64 v[10:11], v[4:5], 48
+; GCN-HSA-NEXT:    v_bfe_i32 v8, v5, 0, 16
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, v3
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s3
-; GCN-HSA-NEXT:    v_bfe_i32 v8, v1, 0, 16
-; GCN-HSA-NEXT:    v_ashr_i64 v[10:11], v[2:3], 48
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, v7
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 32
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
+; GCN-HSA-NEXT:    v_bfe_i32 v8, v5, 0, 16
+; GCN-HSA-NEXT:    v_ashr_i64 v[10:11], v[6:7], 48
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v6
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[14:15], v[8:11]
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GCN-HSA-NEXT:    v_bfe_i32 v8, v2, 0, 16
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s3
-; GCN-HSA-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v2, v2, 0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s2
+; GCN-HSA-NEXT:    v_bfe_i32 v7, v5, 0, 16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s3
+; GCN-HSA-NEXT:    v_bfe_i32 v5, v6, 0, 16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s2
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v8, 31, v7
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[9:10], v[5:8]
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x60
-; GCN-HSA-NEXT:    v_bfe_i32 v10, v1, 0, 16
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
+; GCN-HSA-NEXT:    v_bfe_i32 v4, v4, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v6, v6, 0, 16
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[14:15], v[8:11]
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(4)
-; GCN-HSA-NEXT:    v_ashr_i64 v[2:3], v[4:5], 48
-; GCN-HSA-NEXT:    v_bfe_i32 v0, v5, 0, 16
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[4:7]
 ; GCN-HSA-NEXT:    s_add_u32 s0, s0, 64
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, v7
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v8, 16, v6
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(4)
+; GCN-HSA-NEXT:    v_ashr_i64 v[6:7], v[0:1], 48
+; GCN-HSA-NEXT:    v_bfe_i32 v4, v1, 0, 16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, v3
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[0:3]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[4:7]
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GCN-HSA-NEXT:    v_bfe_i32 v4, v0, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v10, v8, 0, 16
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
-; GCN-HSA-NEXT:    v_bfe_i32 v0, v4, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v8, v6, 0, 16
-; GCN-HSA-NEXT:    v_ashr_i64 v[6:7], v[6:7], 48
-; GCN-HSA-NEXT:    v_bfe_i32 v4, v11, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v8, v2, 0, 16
+; GCN-HSA-NEXT:    v_ashr_i64 v[2:3], v[2:3], 48
+; GCN-HSA-NEXT:    v_bfe_i32 v0, v11, 0, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s1
-; GCN-HSA-NEXT:    v_bfe_i32 v2, v1, 0, 16
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; GCN-HSA-NEXT:    v_bfe_i32 v6, v1, 0, 16
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s0
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[4:7]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[0:3]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[14:15], v[8:11]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[0:3]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[4:7]
 ; GCN-HSA-NEXT:    s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: global_sextload_v16i16_to_v16i64:
@@ -7568,21 +7579,20 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s6
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s7
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[1:4], off, s[8:11], 0
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[5:8], off, s[8:11], 0 offset:16
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:32
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:48
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, 0xffff
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v57, 0
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s5
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v11, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v15, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v27, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v23, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v19, 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v12, 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v16, 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v24, 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v20, 0
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v8, s0, v1
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v1
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, s0, v2
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v9, s0, v1
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v48, s0, v36
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v50, 16, v36
@@ -7591,14 +7601,13 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v36, s0, v37
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v38, 16, v37
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v37, 0
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v12, s0, v0
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v1, s0, v3
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v16, s0, v2
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v20, s0, v5
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v24, s0, v4
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v26, 16, v4
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v28, s0, v7
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v13, s0, v4
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v15, 16, v4
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v17, s0, v3
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v4, s0, v6
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v21, s0, v5
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v25, s0, v8
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v28, s0, v7
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v39, s0, v32
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v41, 16, v32
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v42, s0, v31
@@ -7618,45 +7627,47 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:208
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v46, v37
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v48, 0
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v22, 16, v5
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v30, 16, v7
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v44, 16, v31
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[45:48], off, s[0:3], 0 offset:160
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v43, v37
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v45, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v7, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v5, v37
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v14, 16, v0
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v18, 16, v2
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v31, 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v29, v37
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v19, 16, v3
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v27, 16, v8
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v34, 16, v34
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v35, 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v33, v37
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:128
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v40, v37
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v42, 0
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v31, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v29, v37
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, v37
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v13, v37
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v9, v37
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v28, 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v26, v37
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v14, v37
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v10, v37
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, v37
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v23, 16, v5
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:176
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:144
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v25, v37
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v39, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v21, v37
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v17, v37
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:64
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v22, v37
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v5, v37
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v18, v37
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:112
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:64
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:224
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[1:4], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:32
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:48
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[9:12], off, s[0:3], 0
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: global_zextload_v32i16_to_v32i64:
@@ -8002,45 +8013,45 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s6
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, v3
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, v15
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v16, v16, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[18:19], v[2:3], 48
+; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[18:19], v[14:15], 48
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:240
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[18:19], v[0:1], 48
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v16, v1, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:208
+; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[17:18], v[12:13], 48
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v15, v13, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:208
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(4)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, v7
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, v3
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v16, v1, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[18:19], v[6:7], 48
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v15, v13, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[17:18], v[2:3], 48
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:176
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[18:19], v[4:5], 48
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v16, v5, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:144
+; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[17:18], v[0:1], 48
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v15, v1, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:144
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(4)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, v15
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, v7
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v16, v1, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[18:19], v[14:15], 48
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v15, v1, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[17:18], v[6:7], 48
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[17:18], v[12:13], 48
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v15, v13, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[17:18], v[4:5], 48
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v15, v5, 0, 16
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:80
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, v11
@@ -8054,48 +8065,49 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v15, v9, 0, 16
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v14
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v17, v1, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v15, v2, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v15, v1, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v13, v14, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v18, 31, v17
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:224
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v6
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v2, v1, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v12
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v10
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:224
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v12
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v11, v12, 0, 16
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v2, v5, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v0, v6, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v13, v1, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v12, 31, v11
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:192
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v10
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v8
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v2, v0, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v0, v8, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v13, v3, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v11, v2, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v12, 31, v11
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:160
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v3, v1, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v1, v8, 0, 16
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v5, v10, 0, 16
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v7, v7, 0, 16
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v11, v9, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v9, v12, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v14
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v13, v14, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v15, v1, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v17, v4, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v19, v1, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v9, v4, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v6
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v13, v6, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v15, v2, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v17, v0, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v19, v2, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v10, 31, v9
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v18, 31, v17
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v8, 31, v7
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v12, 31, v11
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
@@ -8104,7 +8116,7 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:96
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:64
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[1:4], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i64:
@@ -8114,11 +8126,11 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 48
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[2:3]
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 32
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    s_add_u32 s2, s2, 16
@@ -8136,8 +8148,8 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s5
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s4
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(3)
-; GCN-HSA-NEXT:    v_ashr_i64 v[18:19], v[0:1], 48
-; GCN-HSA-NEXT:    v_bfe_i32 v16, v1, 0, 16
+; GCN-HSA-NEXT:    v_ashr_i64 v[18:19], v[4:5], 48
+; GCN-HSA-NEXT:    v_bfe_i32 v16, v5, 0, 16
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s3
@@ -8152,93 +8164,93 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-HSA-NEXT:    s_addc_u32 s9, s1, 0
 ; GCN-HSA-NEXT:    s_add_u32 s10, s0, 0x70
 ; GCN-HSA-NEXT:    s_addc_u32 s11, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, v3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, v7
 ; GCN-HSA-NEXT:    s_add_u32 s12, s0, 0x50
-; GCN-HSA-NEXT:    v_bfe_i32 v16, v1, 0, 16
-; GCN-HSA-NEXT:    v_ashr_i64 v[18:19], v[2:3], 48
+; GCN-HSA-NEXT:    v_bfe_i32 v16, v5, 0, 16
+; GCN-HSA-NEXT:    v_ashr_i64 v[18:19], v[6:7], 48
 ; GCN-HSA-NEXT:    s_addc_u32 s13, s1, 0
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
 ; GCN-HSA-NEXT:    s_add_u32 s14, s0, 32
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v6
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
 ; GCN-HSA-NEXT:    s_addc_u32 s15, s1, 0
-; GCN-HSA-NEXT:    v_bfe_i32 v18, v1, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v16, v2, 0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s14
+; GCN-HSA-NEXT:    v_bfe_i32 v18, v5, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v16, v6, 0, 16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s14
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v19, 31, v18
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s15
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[1:2], v[16:19]
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s15
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[5:6], v[16:19]
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s1
-; GCN-HSA-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v2, v1, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v4, v4, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v6, v5, 0, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s0
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[0:3]
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[4:7]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s5
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(6)
-; GCN-HSA-NEXT:    v_ashr_i64 v[2:3], v[4:5], 48
-; GCN-HSA-NEXT:    v_bfe_i32 v0, v5, 0, 16
+; GCN-HSA-NEXT:    v_ashr_i64 v[6:7], v[0:1], 48
+; GCN-HSA-NEXT:    v_bfe_i32 v4, v1, 0, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s4
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[0:3]
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, v3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v7
-; GCN-HSA-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; GCN-HSA-NEXT:    v_ashr_i64 v[2:3], v[6:7], 48
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[4:7]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s2
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[0:3]
+; GCN-HSA-NEXT:    v_bfe_i32 v4, v1, 0, 16
+; GCN-HSA-NEXT:    v_ashr_i64 v[6:7], v[2:3], 48
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[4:7]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s9
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(7)
-; GCN-HSA-NEXT:    v_ashr_i64 v[2:3], v[8:9], 48
-; GCN-HSA-NEXT:    v_bfe_i32 v0, v9, 0, 16
+; GCN-HSA-NEXT:    v_ashr_i64 v[5:6], v[8:9], 48
+; GCN-HSA-NEXT:    v_bfe_i32 v3, v9, 0, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s8
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[0:3]
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, v11
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v11
-; GCN-HSA-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; GCN-HSA-NEXT:    v_ashr_i64 v[2:3], v[10:11], 48
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[3:6]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s6
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[0:3]
+; GCN-HSA-NEXT:    v_bfe_i32 v3, v1, 0, 16
+; GCN-HSA-NEXT:    v_ashr_i64 v[5:6], v[10:11], 48
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[3:6]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s13
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(8)
-; GCN-HSA-NEXT:    v_ashr_i64 v[2:3], v[12:13], 48
-; GCN-HSA-NEXT:    v_bfe_i32 v0, v13, 0, 16
+; GCN-HSA-NEXT:    v_ashr_i64 v[5:6], v[12:13], 48
+; GCN-HSA-NEXT:    v_bfe_i32 v3, v13, 0, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s12
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[0:3]
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, v15
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s11
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v15
-; GCN-HSA-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; GCN-HSA-NEXT:    v_ashr_i64 v[2:3], v[14:15], 48
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[3:6]
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xe0
+; GCN-HSA-NEXT:    v_bfe_i32 v3, v1, 0, 16
+; GCN-HSA-NEXT:    v_ashr_i64 v[5:6], v[14:15], 48
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s10
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v6
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[0:3]
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
-; GCN-HSA-NEXT:    v_bfe_i32 v0, v6, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v2, v5, 0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s2
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[3:6]
+; GCN-HSA-NEXT:    v_bfe_i32 v1, v2, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v3, v7, 0, 16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xc0
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[5:6], v[0:3]
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v15, 16, v10
-; GCN-HSA-NEXT:    v_bfe_i32 v0, v4, 0, 16
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[6:7], v[1:4]
+; GCN-HSA-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v2, v5, 0, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xa0
-; GCN-HSA-NEXT:    v_bfe_i32 v2, v7, 0, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v15, 16, v10
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s3
@@ -8293,8 +8305,8 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s5, s7
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, s2
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s7, s3
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
@@ -8311,7 +8323,7 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:240
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v18, v5, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v18, v1, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v14, 16, v12
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v16, v14, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v14, v12, 0, 16
@@ -8337,7 +8349,7 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v12, 31, v11
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:176
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v10, v1, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v10, v5, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v11, 16, v8
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v13, v11, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v11, v8, 0, 16
@@ -8345,58 +8357,58 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v8, 16, v9
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:128
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v11, v9, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v13, v8, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v8, v7
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v12, 31, v11
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:144
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v6
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v11, v6, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v6, v8, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v8, v7, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v8, v3
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v13, v9, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v11, v2, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v12, 31, v11
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v8, v6, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v6, v4, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:96
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:64
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v11, 16, v0
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v6, v0, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v16, v0, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v4, v3
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v20, v0, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v12, v1, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v1, v4, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v3, v3, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v8, v11, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v14, v2, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v11, v8, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v13, v2, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v12, 31, v11
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v19, 31, v18
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v13, v2, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v11, v0, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, v7
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v7
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v7, v3, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v6
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v16, v3, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v12, 31, v11
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v20, v3, 0, 16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:64
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v12, v5, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v5, v4, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v14, v6, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v21, 31, v20
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v8, 31, v7
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:80
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:32
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[1:4], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[6:9], off, s[0:3], 0
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[5:8], off, s[0:3], 0
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-VI-NEXT:    s_endpgm
 ;

diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index bebacfa27d15f..0b80b41703166 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -5,21 +5,21 @@
 define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_sdiv:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
 ; GCN-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_ashr_i32 s12, s3, 31
-; GCN-NEXT:    s_add_u32 s2, s2, s12
-; GCN-NEXT:    s_mov_b32 s13, s12
-; GCN-NEXT:    s_addc_u32 s3, s3, s12
-; GCN-NEXT:    s_xor_b64 s[2:3], s[2:3], s[12:13]
-; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s3
-; GCN-NEXT:    s_sub_u32 s4, 0, s2
-; GCN-NEXT:    s_subb_u32 s5, 0, s3
+; GCN-NEXT:    s_ashr_i32 s2, s5, 31
+; GCN-NEXT:    s_add_u32 s4, s4, s2
+; GCN-NEXT:    s_mov_b32 s3, s2
+; GCN-NEXT:    s_addc_u32 s5, s5, s2
+; GCN-NEXT:    s_xor_b64 s[12:13], s[4:5], s[2:3]
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s12
+; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s13
+; GCN-NEXT:    s_sub_u32 s4, 0, s12
+; GCN-NEXT:    s_subb_u32 s5, 0, s13
 ; GCN-NEXT:    s_ashr_i32 s14, s11, 31
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
@@ -96,23 +96,23 @@ define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v1, vcc
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v7, v1, vcc
-; GCN-NEXT:    v_mul_lo_u32 v2, s2, v1
-; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
-; GCN-NEXT:    v_mul_lo_u32 v4, s3, v0
-; GCN-NEXT:    v_mov_b32_e32 v5, s3
+; GCN-NEXT:    v_mul_lo_u32 v2, s12, v1
+; GCN-NEXT:    v_mul_hi_u32 v3, s12, v0
+; GCN-NEXT:    v_mul_lo_u32 v4, s13, v0
+; GCN-NEXT:    v_mov_b32_e32 v5, s13
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_mul_lo_u32 v3, s2, v0
+; GCN-NEXT:    v_mul_lo_u32 v3, s12, v0
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s11, v2
 ; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s10, v3
 ; GCN-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
-; GCN-NEXT:    v_subrev_i32_e64 v5, s[0:1], s2, v3
+; GCN-NEXT:    v_subrev_i32_e64 v5, s[0:1], s12, v3
 ; GCN-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v4
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v4
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v5
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v5
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v4
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v4
 ; GCN-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
 ; GCN-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
 ; GCN-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
@@ -122,16 +122,16 @@ define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v6, s11
 ; GCN-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s13, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s2, v3
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v3
 ; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s3, v2
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v2
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-NEXT:    s_xor_b64 s[0:1], s[14:15], s[12:13]
+; GCN-NEXT:    s_xor_b64 s[0:1], s[14:15], s[2:3]
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GCN-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; GCN-NEXT:    v_xor_b32_e32 v1, s1, v1

diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll
index 7272250c35356..045cedac614cd 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.ll
@@ -971,30 +971,30 @@ define amdgpu_kernel void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> add
 ;
 ; VI-LABEL: shl_v4i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s19, 0xf000
-; VI-NEXT:    s_mov_b32 s18, -1
+; VI-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s16, s8
-; VI-NEXT:    s_mov_b32 s17, s9
-; VI-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
-; VI-NEXT:    s_load_dwordx8 s[8:15], s[10:11], 0x20
+; VI-NEXT:    s_mov_b32 s0, s12
+; VI-NEXT:    s_mov_b32 s1, s13
+; VI-NEXT:    s_load_dwordx8 s[4:11], s[14:15], 0x0
+; VI-NEXT:    s_load_dwordx8 s[12:19], s[14:15], 0x20
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_lshl_b64 s[10:11], s[10:11], s18
+; VI-NEXT:    s_lshl_b64 s[8:9], s[8:9], s16
 ; VI-NEXT:    s_lshl_b64 s[6:7], s[6:7], s14
 ; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], s12
-; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], s10
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s8
+; VI-NEXT:    v_mov_b32_e32 v0, s8
+; VI-NEXT:    v_mov_b32_e32 v1, s9
+; VI-NEXT:    v_mov_b32_e32 v2, s10
+; VI-NEXT:    v_mov_b32_e32 v3, s11
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; VI-NEXT:    s_nop 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
 ; VI-NEXT:    v_mov_b32_e32 v3, s7
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
-; VI-NEXT:    s_nop 0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: shl_v4i64:

diff --git a/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll b/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll
index a2da6776fed11..52666fd9e5f02 100644
--- a/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll
+++ b/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll
@@ -1,6 +1,5 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 < %s | FileCheck %s
 
-; FIXME: The wide loads and bundles introduce so much spilling.
 define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(float addrspace(4)* %wei_ptr, float addrspace(1)* %out_ptr, float addrspace(1)* %in) {
 ; CHECK-LABEL: excess_soft_clause_reg_pressure:
 ; CHECK:  BB0_1: ; %for.cond28.preheader
@@ -14,78 +13,15 @@ define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(float addrs
 ; CHECK-NEXT:    s_load_dwordx16
 ; CHECK-NEXT:    s_load_dwordx16
 
-; CHECK:         v_writelane_b32
-; CHECK-NEXT:    v_writelane_b32
-; CHECK-NEXT:    v_writelane_b32
-; CHECK-NEXT:    v_writelane_b32
-; CHECK-NEXT:    v_writelane_b32
-; CHECK-NEXT:    v_writelane_b32
-; CHECK-NEXT:    v_writelane_b32
-; CHECK-NEXT:    v_writelane_b32
-; CHECK-NEXT:    v_writelane_b32
-; CHECK-NEXT:    v_writelane_b32
-; CHECK-NEXT:    v_writelane_b32
-; CHECK-NEXT:    v_writelane_b32
-; CHECK-NEXT:    v_writelane_b32
-; CHECK-NEXT:    v_writelane_b32
-; CHECK-NEXT:    v_writelane_b32
-; CHECK-NEXT:    v_writelane_b32
-; CHECK-NEXT:    s_load_dwordx16
-
-; CHECK:         v_writelane_b32
-; CHECK-NEXT:    v_writelane_b32
-; CHECK-NEXT:    v_writelane_b32
-; CHECK-NEXT:    v_writelane_b32
-; CHECK-NEXT:    v_writelane_b32
-; CHECK-NEXT:    v_writelane_b32
-; CHECK-NEXT:    v_writelane_b32
-; CHECK-NEXT:    v_writelane_b32
-; CHECK-NEXT:    v_writelane_b32
-; CHECK-NEXT:    v_writelane_b32
-; CHECK-NEXT:    v_writelane_b32
-; CHECK-NEXT:    v_writelane_b32
-; CHECK-NEXT:    v_writelane_b32
-; CHECK-NEXT:    v_writelane_b32
-; CHECK-NEXT:    v_writelane_b32
-; CHECK-NEXT:    v_writelane_b32
-
-; CHECK:         v_readlane_b32
-; CHECK-NEXT:    v_readlane_b32
-; CHECK-NEXT:    v_readlane_b32
-; CHECK-NEXT:    v_readlane_b32
-; CHECK-NEXT:    v_readlane_b32
-; CHECK-NEXT:    v_readlane_b32
-; CHECK-NEXT:    v_readlane_b32
-; CHECK-NEXT:    v_readlane_b32
-; CHECK-NEXT:    v_readlane_b32
-; CHECK-NEXT:    v_readlane_b32
-; CHECK-NEXT:    v_readlane_b32
-; CHECK-NEXT:    v_readlane_b32
-; CHECK-NEXT:    v_readlane_b32
-; CHECK-NEXT:    v_readlane_b32
-; CHECK-NEXT:    v_readlane_b32
-; CHECK-NEXT:    v_readlane_b32
+; CHECK-NOT: v_writelane_b32
+; CHECK-NOT: v_readlane_b32
 
 ; CHECK:         s_load_dwordx16
 ; CHECK:         s_load_dwordx16
 ; CHECK:         s_load_dwordx16
 
-; CHECK:         v_readlane_b32
-; CHECK-NEXT:    v_readlane_b32
-; CHECK-NEXT:    v_readlane_b32
-; CHECK-NEXT:    v_readlane_b32
-; CHECK-NEXT:    v_readlane_b32
-; CHECK-NEXT:    v_readlane_b32
-; CHECK-NEXT:    v_readlane_b32
-; CHECK-NEXT:    v_readlane_b32
-; CHECK-NEXT:    v_readlane_b32
-; CHECK-NEXT:    v_readlane_b32
-; CHECK-NEXT:    v_readlane_b32
-; CHECK-NEXT:    v_readlane_b32
-; CHECK-NEXT:    v_readlane_b32
-; CHECK-NEXT:    v_readlane_b32
-; CHECK-NEXT:    v_readlane_b32
-; CHECK-NEXT:    v_readlane_b32
+; CHECK-NOT: v_writelane_b32
+; CHECK-NOT: v_readlane_b32
 entry:
   %i = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
   %i1 = bitcast i8 addrspace(4)* %i to i64 addrspace(4)*

diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir b/llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir
index 8d700780046cf..8937a319e451f 100644
--- a/llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir
@@ -38,40 +38,38 @@ body:             |
     ; CHECK: undef %62.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub2, implicit $exec
     ; CHECK: SI_SPILL_V128_SAVE %62, %stack.3, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.3, align 4, addrspace 5)
     ; CHECK: undef %67.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub1, implicit $exec
-    ; CHECK: undef %71.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub0, implicit $exec
-    ; CHECK: SI_SPILL_V128_SAVE %71, %stack.4, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.4, align 4, addrspace 5)
-    ; CHECK: undef %76.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub3, implicit $exec
-    ; CHECK: SI_SPILL_V128_SAVE %76, %stack.5, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.5, align 4, addrspace 5)
-    ; CHECK: undef %81.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub2, implicit $exec
-    ; CHECK: SI_SPILL_V128_SAVE %81, %stack.6, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.6, align 4, addrspace 5)
-    ; CHECK: undef %86.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub1, implicit $exec
-    ; CHECK: undef %90.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub0, implicit $exec
-    ; CHECK: SI_SPILL_V128_SAVE %90, %stack.7, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.7, align 4, addrspace 5)
+    ; CHECK: SI_SPILL_V128_SAVE %67, %stack.4, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.4, align 4, addrspace 5)
+    ; CHECK: undef %72.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub0, implicit $exec
+    ; CHECK: SI_SPILL_V128_SAVE %72, %stack.5, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.5, align 4, addrspace 5)
+    ; CHECK: undef %77.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub3, implicit $exec
+    ; CHECK: SI_SPILL_V128_SAVE %77, %stack.6, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.6, align 4, addrspace 5)
+    ; CHECK: undef %82.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub2, implicit $exec
+    ; CHECK: SI_SPILL_V128_SAVE %82, %stack.7, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.7, align 4, addrspace 5)
+    ; CHECK: undef %87.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub1, implicit $exec
+    ; CHECK: undef %91.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub0, implicit $exec
     ; CHECK: undef %95.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub3, implicit $exec
     ; CHECK: SI_SPILL_V128_SAVE %95, %stack.8, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.8, align 4, addrspace 5)
-    ; CHECK: undef %100.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub2, implicit $exec
-    ; CHECK: SI_SPILL_V128_SAVE %100, %stack.9, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.9, align 4, addrspace 5)
-    ; CHECK: undef %105.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub1, implicit $exec
-    ; CHECK: undef %109.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub0, implicit $exec
-    ; CHECK: undef %113.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub3, implicit $exec
-    ; CHECK: undef %117.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub2, implicit $exec
-    ; CHECK: SI_SPILL_V128_SAVE %117, %stack.10, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.10, align 4, addrspace 5)
+    ; CHECK: undef %19.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub2, implicit $exec
+    ; CHECK: undef %153.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub1, implicit $exec
+    ; CHECK: SI_SPILL_V128_SAVE %153, %stack.14, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.14, align 4, addrspace 5)
+    ; CHECK: undef %102.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub0, implicit $exec
+    ; CHECK: undef %106.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub3, implicit $exec
+    ; CHECK: SI_SPILL_V128_SAVE %106, %stack.9, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.9, align 4, addrspace 5)
+    ; CHECK: undef %111.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub2, implicit $exec
     ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 64, 0, 0, 0, implicit $exec :: (load (s128), align 64, addrspace 1)
-    ; CHECK: undef %122.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub1, implicit $exec
-    ; CHECK: undef %126.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub0, implicit $exec
-    ; CHECK: undef %130.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub3, implicit $exec
-    ; CHECK: SI_SPILL_V128_SAVE %130, %stack.11, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.11, align 4, addrspace 5)
-    ; CHECK: undef %135.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub2, implicit $exec
-    ; CHECK: SI_SPILL_V128_SAVE %135, %stack.12, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.12, align 4, addrspace 5)
+    ; CHECK: undef %115.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub1, implicit $exec
+    ; CHECK: undef %119.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub0, implicit $exec
+    ; CHECK: undef %123.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub3, implicit $exec
+    ; CHECK: undef %127.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub2, implicit $exec
+    ; CHECK: SI_SPILL_V128_SAVE %127, %stack.10, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.10, align 4, addrspace 5)
     ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFSET5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 80, 0, 0, 0, implicit $exec :: (load (s128), addrspace 1)
-    ; CHECK: undef %140.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub1, implicit $exec
-    ; CHECK: undef %144.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub0, implicit $exec
-    ; CHECK: SI_SPILL_V128_SAVE %144, %stack.13, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.13, align 4, addrspace 5)
-    ; CHECK: undef %149.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub3, implicit $exec
-    ; CHECK: SI_SPILL_V128_SAVE %149, %stack.14, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.14, align 4, addrspace 5)
-    ; CHECK: undef %154.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub2, implicit $exec
+    ; CHECK: undef %138.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub1, implicit $exec
+    ; CHECK: undef %142.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub0, implicit $exec
+    ; CHECK: undef %146.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub3, implicit $exec
+    ; CHECK: undef %150.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub2, implicit $exec
+    ; CHECK: SI_SPILL_V128_SAVE %150, %stack.13, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.13, align 4, addrspace 5)
     ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFSET6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 96, 0, 0, 0, implicit $exec :: (load (s128), align 32, addrspace 1)
-    ; CHECK: undef %158.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub1, implicit $exec
+    ; CHECK: undef %156.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub1, implicit $exec
     ; CHECK: undef %36.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub0, implicit $exec
     ; CHECK: undef %37.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub3, implicit $exec
     ; CHECK: undef %38.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub2, implicit $exec
@@ -93,59 +91,61 @@ body:             |
     ; CHECK: [[SI_SPILL_V128_RESTORE3:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.3, align 4, addrspace 5)
     ; CHECK: [[SI_SPILL_V128_RESTORE3]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub2, implicit $exec
     ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE3]], %stack.3, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.3, align 4, addrspace 5)
-    ; CHECK: undef %68.sub2:vreg_128 = COPY %67.sub2
-    ; CHECK: %68.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub1, implicit $exec
     ; CHECK: [[SI_SPILL_V128_RESTORE4:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5)
-    ; CHECK: [[SI_SPILL_V128_RESTORE4]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub0, implicit $exec
+    ; CHECK: [[SI_SPILL_V128_RESTORE4]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub1, implicit $exec
     ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE4]], %stack.4, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.4, align 4, addrspace 5)
     ; CHECK: [[SI_SPILL_V128_RESTORE5:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.5, align 4, addrspace 5)
-    ; CHECK: [[SI_SPILL_V128_RESTORE5]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub3, implicit $exec
+    ; CHECK: [[SI_SPILL_V128_RESTORE5]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub0, implicit $exec
     ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE5]], %stack.5, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.5, align 4, addrspace 5)
     ; CHECK: [[SI_SPILL_V128_RESTORE6:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.6, align 4, addrspace 5)
-    ; CHECK: [[SI_SPILL_V128_RESTORE6]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub2, implicit $exec
+    ; CHECK: [[SI_SPILL_V128_RESTORE6]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub3, implicit $exec
     ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE6]], %stack.6, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.6, align 4, addrspace 5)
-    ; CHECK: undef %87.sub2:vreg_128 = COPY %86.sub2
-    ; CHECK: %87.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub1, implicit $exec
     ; CHECK: [[SI_SPILL_V128_RESTORE7:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.7, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.7, align 4, addrspace 5)
-    ; CHECK: [[SI_SPILL_V128_RESTORE7]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub0, implicit $exec
+    ; CHECK: [[SI_SPILL_V128_RESTORE7]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub2, implicit $exec
     ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE7]], %stack.7, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.7, align 4, addrspace 5)
-    ; CHECK: [[SI_SPILL_V128_RESTORE8:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.8, align 4, addrspace 5)
-    ; CHECK: [[SI_SPILL_V128_RESTORE8]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub3, implicit $exec
-    ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE8]], %stack.8, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.8, align 4, addrspace 5)
-    ; CHECK: [[SI_SPILL_V128_RESTORE9:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.9, align 4, addrspace 5)
-    ; CHECK: [[SI_SPILL_V128_RESTORE9]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub2, implicit $exec
-    ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE9]], %stack.9, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.9, align 4, addrspace 5)
-    ; CHECK: undef %106.sub2:vreg_128 = COPY %105.sub2
-    ; CHECK: %106.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub1, implicit $exec
-    ; CHECK: undef %110.sub2:vreg_128 = COPY %109.sub2
-    ; CHECK: %110.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub0, implicit $exec
-    ; CHECK: undef %114.sub2:vreg_128 = COPY %113.sub2
-    ; CHECK: %114.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub3, implicit $exec
-    ; CHECK: [[SI_SPILL_V128_RESTORE10:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.10, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.10, align 4, addrspace 5)
-    ; CHECK: [[SI_SPILL_V128_RESTORE10]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub2, implicit $exec
-    ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE10]], %stack.10, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.10, align 4, addrspace 5)
-    ; CHECK: undef %123.sub2:vreg_128 = COPY %122.sub2
-    ; CHECK: %123.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub1, implicit $exec
-    ; CHECK: undef %127.sub2:vreg_128 = COPY %126.sub2
-    ; CHECK: %127.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub0, implicit $exec
-    ; CHECK: [[SI_SPILL_V128_RESTORE11:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.11, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.11, align 4, addrspace 5)
-    ; CHECK: [[SI_SPILL_V128_RESTORE11]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub3, implicit $exec
-    ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE11]], %stack.11, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.11, align 4, addrspace 5)
-    ; CHECK: [[SI_SPILL_V128_RESTORE12:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.12, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.12, align 4, addrspace 5)
-    ; CHECK: [[SI_SPILL_V128_RESTORE12]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub2, implicit $exec
-    ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE12]], %stack.12, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.12, align 4, addrspace 5)
-    ; CHECK: undef %141.sub2:vreg_128 = COPY %140.sub2
-    ; CHECK: %141.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub1, implicit $exec
-    ; CHECK: [[SI_SPILL_V128_RESTORE13:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.13, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.13, align 4, addrspace 5)
-    ; CHECK: [[SI_SPILL_V128_RESTORE13]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub0, implicit $exec
-    ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE13]], %stack.13, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.13, align 4, addrspace 5)
-    ; CHECK: [[SI_SPILL_V128_RESTORE14:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.14, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.14, align 4, addrspace 5)
-    ; CHECK: [[SI_SPILL_V128_RESTORE14]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub3, implicit $exec
-    ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE14]], %stack.14, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.14, align 4, addrspace 5)
-    ; CHECK: undef %155.sub2:vreg_128 = COPY %154.sub2
-    ; CHECK: %155.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub2, implicit $exec
-    ; CHECK: undef %159.sub2:vreg_128 = COPY %158.sub2
-    ; CHECK: %159.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub1, implicit $exec
+    ; CHECK: undef %131.sub2:vreg_128 = COPY %87.sub2
+    ; CHECK: SI_SPILL_V128_SAVE %131, %stack.11, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.11, align 4, addrspace 5)
+    ; CHECK: [[SI_SPILL_V128_RESTORE8:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.11, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.11, align 4, addrspace 5)
+    ; CHECK: [[SI_SPILL_V128_RESTORE8]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub1, implicit $exec
+    ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE8]], %stack.11, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.11, align 4, addrspace 5)
+    ; CHECK: undef %134.sub2:vreg_128 = COPY %91.sub2
+    ; CHECK: SI_SPILL_V128_SAVE %134, %stack.12, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.12, align 4, addrspace 5)
+    ; CHECK: [[SI_SPILL_V128_RESTORE9:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.12, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.12, align 4, addrspace 5)
+    ; CHECK: [[SI_SPILL_V128_RESTORE9]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub0, implicit $exec
+    ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE9]], %stack.12, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.12, align 4, addrspace 5)
+    ; CHECK: [[SI_SPILL_V128_RESTORE10:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.8, align 4, addrspace 5)
+    ; CHECK: [[SI_SPILL_V128_RESTORE10]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub3, implicit $exec
+    ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE10]], %stack.8, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.8, align 4, addrspace 5)
+    ; CHECK: %19.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub2, implicit $exec
+    ; CHECK: [[SI_SPILL_V128_RESTORE11:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.14, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.14, align 4, addrspace 5)
+    ; CHECK: [[SI_SPILL_V128_RESTORE11]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub1, implicit $exec
+    ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE11]], %stack.14, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.14, align 4, addrspace 5)
+    ; CHECK: undef %103.sub2:vreg_128 = COPY %102.sub2
+    ; CHECK: %103.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub0, implicit $exec
+    ; CHECK: [[SI_SPILL_V128_RESTORE12:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.9, align 4, addrspace 5)
+    ; CHECK: [[SI_SPILL_V128_RESTORE12]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub3, implicit $exec
+    ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE12]], %stack.9, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.9, align 4, addrspace 5)
+    ; CHECK: undef %112.sub2:vreg_128 = COPY %111.sub2
+    ; CHECK: %112.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub2, implicit $exec
+    ; CHECK: undef %116.sub2:vreg_128 = COPY %115.sub2
+    ; CHECK: %116.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub1, implicit $exec
+    ; CHECK: undef %120.sub2:vreg_128 = COPY %119.sub2
+    ; CHECK: %120.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub0, implicit $exec
+    ; CHECK: undef %124.sub2:vreg_128 = COPY %123.sub2
+    ; CHECK: %124.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub3, implicit $exec
+    ; CHECK: [[SI_SPILL_V128_RESTORE13:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.10, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.10, align 4, addrspace 5)
+    ; CHECK: [[SI_SPILL_V128_RESTORE13]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub2, implicit $exec
+    ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE13]], %stack.10, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.10, align 4, addrspace 5)
+    ; CHECK: undef %139.sub2:vreg_128 = COPY %138.sub2
+    ; CHECK: %139.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub1, implicit $exec
+    ; CHECK: undef %143.sub2:vreg_128 = COPY %142.sub2
+    ; CHECK: %143.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub0, implicit $exec
+    ; CHECK: undef %147.sub2:vreg_128 = COPY %146.sub2
+    ; CHECK: %147.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub3, implicit $exec
+    ; CHECK: [[SI_SPILL_V128_RESTORE14:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.13, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.13, align 4, addrspace 5)
+    ; CHECK: [[SI_SPILL_V128_RESTORE14]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub2, implicit $exec
+    ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE14]], %stack.13, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.13, align 4, addrspace 5)
+    ; CHECK: %156.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub1, implicit $exec
     ; CHECK: %36.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub0, implicit $exec
     ; CHECK: %37.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub3, implicit $exec
     ; CHECK: %38.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub2, implicit $exec
@@ -174,139 +174,136 @@ body:             |
     ; CHECK: %36.sub1:vreg_128 = COPY %43.sub1
     ; CHECK: %36.sub3:vreg_128 = COPY %43.sub1
     ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %36, %2, 0, 384, 0, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
-    ; CHECK: undef %157.sub0:vreg_128 = COPY %159.sub0 {
-    ; CHECK:   internal %157.sub2:vreg_128 = COPY %159.sub2
+    ; CHECK: undef %157.sub0:vreg_128 = COPY %156.sub0 {
+    ; CHECK:   internal %157.sub2:vreg_128 = COPY %156.sub2
     ; CHECK: }
     ; CHECK: %157.sub1:vreg_128 = COPY %43.sub1
     ; CHECK: %157.sub3:vreg_128 = COPY %43.sub1
     ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %157, %2, 0, 400, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK: undef %153.sub0:vreg_128 = COPY %155.sub0 {
-    ; CHECK:   internal %153.sub2:vreg_128 = COPY %155.sub2
+    ; CHECK: [[SI_SPILL_V128_RESTORE15:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.13, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.13, align 4, addrspace 5)
+    ; CHECK: undef %149.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE15]].sub0 {
+    ; CHECK:   internal %149.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE15]].sub2
     ; CHECK: }
-    ; CHECK: %153.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK: %153.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %153, %2, 0, 352, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
-    ; CHECK: [[SI_SPILL_V128_RESTORE15:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.14, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.14, align 4, addrspace 5)
-    ; CHECK: undef %148.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE15]].sub0 {
-    ; CHECK:   internal %148.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE15]].sub2
+    ; CHECK: %149.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK: %149.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %149, %2, 0, 352, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+    ; CHECK: undef %145.sub0:vreg_128 = COPY %147.sub0 {
+    ; CHECK:   internal %145.sub2:vreg_128 = COPY %147.sub2
     ; CHECK: }
-    ; CHECK: %148.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK: %148.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %148, %2, 0, 368, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK: [[SI_SPILL_V128_RESTORE16:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.13, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.13, align 4, addrspace 5)
-    ; CHECK: undef %143.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE16]].sub0 {
-    ; CHECK:   internal %143.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE16]].sub2
+    ; CHECK: %145.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK: %145.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %145, %2, 0, 368, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK: undef %141.sub0:vreg_128 = COPY %143.sub0 {
+    ; CHECK:   internal %141.sub2:vreg_128 = COPY %143.sub2
     ; CHECK: }
-    ; CHECK: %143.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK: %143.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %143, %2, 0, 320, 0, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1)
-    ; CHECK: undef %139.sub0:vreg_128 = COPY %141.sub0 {
-    ; CHECK:   internal %139.sub2:vreg_128 = COPY %141.sub2
+    ; CHECK: %141.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK: %141.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %141, %2, 0, 320, 0, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1)
+    ; CHECK: undef %137.sub0:vreg_128 = COPY %139.sub0 {
+    ; CHECK:   internal %137.sub2:vreg_128 = COPY %139.sub2
     ; CHECK: }
-    ; CHECK: %139.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK: %139.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %139, %2, 0, 336, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK: [[SI_SPILL_V128_RESTORE17:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.12, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.12, align 4, addrspace 5)
-    ; CHECK: undef %134.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE17]].sub0 {
-    ; CHECK:   internal %134.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE17]].sub2
+    ; CHECK: %137.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK: %137.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %137, %2, 0, 336, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK: [[SI_SPILL_V128_RESTORE16:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.10, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.10, align 4, addrspace 5)
+    ; CHECK: undef %126.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE16]].sub0 {
+    ; CHECK:   internal %126.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE16]].sub2
     ; CHECK: }
-    ; CHECK: %134.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK: %134.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %134, %2, 0, 288, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
-    ; CHECK: [[SI_SPILL_V128_RESTORE18:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.11, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.11, align 4, addrspace 5)
-    ; CHECK: undef %129.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE18]].sub0 {
-    ; CHECK:   internal %129.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE18]].sub2
+    ; CHECK: %126.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK: %126.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %126, %2, 0, 288, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+    ; CHECK: undef %122.sub0:vreg_128 = COPY %124.sub0 {
+    ; CHECK:   internal %122.sub2:vreg_128 = COPY %124.sub2
     ; CHECK: }
-    ; CHECK: %129.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK: %129.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %129, %2, 0, 304, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK: undef %125.sub0:vreg_128 = COPY %127.sub0 {
-    ; CHECK:   internal %125.sub2:vreg_128 = COPY %127.sub2
+    ; CHECK: %122.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK: %122.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %122, %2, 0, 304, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK: undef %118.sub0:vreg_128 = COPY %120.sub0 {
+    ; CHECK:   internal %118.sub2:vreg_128 = COPY %120.sub2
     ; CHECK: }
-    ; CHECK: %125.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK: %125.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %125, %2, 0, 256, 0, 0, 0, implicit $exec :: (store (s128), align 256, addrspace 1)
-    ; CHECK: undef %121.sub0:vreg_128 = COPY %123.sub0 {
-    ; CHECK:   internal %121.sub2:vreg_128 = COPY %123.sub2
+    ; CHECK: %118.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK: %118.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %118, %2, 0, 256, 0, 0, 0, implicit $exec :: (store (s128), align 256, addrspace 1)
+    ; CHECK: undef %114.sub0:vreg_128 = COPY %116.sub0 {
+    ; CHECK:   internal %114.sub2:vreg_128 = COPY %116.sub2
     ; CHECK: }
-    ; CHECK: %121.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK: %121.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %121, %2, 0, 272, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK: [[SI_SPILL_V128_RESTORE19:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.10, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.10, align 4, addrspace 5)
-    ; CHECK: undef %116.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE19]].sub0 {
-    ; CHECK:   internal %116.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE19]].sub2
+    ; CHECK: %114.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK: %114.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %114, %2, 0, 272, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK: undef %110.sub0:vreg_128 = COPY %112.sub0 {
+    ; CHECK:   internal %110.sub2:vreg_128 = COPY %112.sub2
     ; CHECK: }
-    ; CHECK: %116.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK: %116.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %116, %2, 0, 224, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
-    ; CHECK: undef %112.sub0:vreg_128 = COPY %114.sub0 {
-    ; CHECK:   internal %112.sub2:vreg_128 = COPY %114.sub2
+    ; CHECK: %110.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK: %110.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %110, %2, 0, 224, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+    ; CHECK: [[SI_SPILL_V128_RESTORE17:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.9, align 4, addrspace 5)
+    ; CHECK: undef %105.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE17]].sub0 {
+    ; CHECK:   internal %105.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE17]].sub2
     ; CHECK: }
-    ; CHECK: %112.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK: %112.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %112, %2, 0, 240, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK: undef %108.sub0:vreg_128 = COPY %110.sub0 {
-    ; CHECK:   internal %108.sub2:vreg_128 = COPY %110.sub2
+    ; CHECK: %105.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK: %105.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %105, %2, 0, 240, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK: undef %101.sub0:vreg_128 = COPY %103.sub0 {
+    ; CHECK:   internal %101.sub2:vreg_128 = COPY %103.sub2
     ; CHECK: }
-    ; CHECK: %108.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK: %108.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %108, %2, 0, 192, 0, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1)
-    ; CHECK: undef %104.sub0:vreg_128 = COPY %106.sub0 {
-    ; CHECK:   internal %104.sub2:vreg_128 = COPY %106.sub2
-    ; CHECK: }
-    ; CHECK: %104.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK: %104.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %104, %2, 0, 208, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK: [[SI_SPILL_V128_RESTORE20:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.9, align 4, addrspace 5)
-    ; CHECK: undef %99.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE20]].sub0 {
-    ; CHECK:   internal %99.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE20]].sub2
+    ; CHECK: %101.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK: %101.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %101, %2, 0, 192, 0, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1)
+    ; CHECK: [[SI_SPILL_V128_RESTORE18:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.14, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.14, align 4, addrspace 5)
+    ; CHECK: undef %99.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE18]].sub0 {
+    ; CHECK:   internal %99.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE18]].sub2
     ; CHECK: }
     ; CHECK: %99.sub1:vreg_128 = COPY %43.sub1
     ; CHECK: %99.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %99, %2, 0, 160, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
-    ; CHECK: [[SI_SPILL_V128_RESTORE21:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.8, align 4, addrspace 5)
-    ; CHECK: undef %94.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE21]].sub0 {
-    ; CHECK:   internal %94.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE21]].sub2
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %99, %2, 0, 208, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK: %19.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK: %19.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %19, %2, 0, 160, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+    ; CHECK: [[SI_SPILL_V128_RESTORE19:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.8, align 4, addrspace 5)
+    ; CHECK: undef %94.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE19]].sub0 {
+    ; CHECK:   internal %94.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE19]].sub2
     ; CHECK: }
     ; CHECK: %94.sub1:vreg_128 = COPY %43.sub1
     ; CHECK: %94.sub3:vreg_128 = COPY %43.sub1
     ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %94, %2, 0, 176, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK: [[SI_SPILL_V128_RESTORE22:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.7, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.7, align 4, addrspace 5)
-    ; CHECK: undef %89.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE22]].sub0 {
-    ; CHECK:   internal %89.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE22]].sub2
+    ; CHECK: [[SI_SPILL_V128_RESTORE20:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.12, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.12, align 4, addrspace 5)
+    ; CHECK: undef %90.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE20]].sub0 {
+    ; CHECK:   internal %90.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE20]].sub2
     ; CHECK: }
-    ; CHECK: %89.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK: %89.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %89, %2, 0, 128, 0, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
-    ; CHECK: undef %85.sub0:vreg_128 = COPY %87.sub0 {
-    ; CHECK:   internal %85.sub2:vreg_128 = COPY %87.sub2
+    ; CHECK: %90.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK: %90.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %90, %2, 0, 128, 0, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+    ; CHECK: [[SI_SPILL_V128_RESTORE21:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.11, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.11, align 4, addrspace 5)
+    ; CHECK: undef %86.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE21]].sub0 {
+    ; CHECK:   internal %86.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE21]].sub2
     ; CHECK: }
-    ; CHECK: %85.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK: %85.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %85, %2, 0, 144, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK: %86.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK: %86.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %86, %2, 0, 144, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK: [[SI_SPILL_V128_RESTORE22:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.7, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.7, align 4, addrspace 5)
+    ; CHECK: undef %81.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE22]].sub0 {
+    ; CHECK:   internal %81.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE22]].sub2
+    ; CHECK: }
+    ; CHECK: %81.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK: %81.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %81, %2, 0, 96, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
     ; CHECK: [[SI_SPILL_V128_RESTORE23:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.6, align 4, addrspace 5)
-    ; CHECK: undef %80.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE23]].sub0 {
-    ; CHECK:   internal %80.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE23]].sub2
+    ; CHECK: undef %76.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE23]].sub0 {
+    ; CHECK:   internal %76.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE23]].sub2
     ; CHECK: }
-    ; CHECK: %80.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK: %80.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %80, %2, 0, 96, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+    ; CHECK: %76.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK: %76.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %76, %2, 0, 112, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
     ; CHECK: [[SI_SPILL_V128_RESTORE24:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.5, align 4, addrspace 5)
-    ; CHECK: undef %75.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE24]].sub0 {
-    ; CHECK:   internal %75.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE24]].sub2
+    ; CHECK: undef %71.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE24]].sub0 {
+    ; CHECK:   internal %71.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE24]].sub2
     ; CHECK: }
-    ; CHECK: %75.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK: %75.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %75, %2, 0, 112, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK: %71.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK: %71.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %71, %2, 0, 64, 0, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1)
     ; CHECK: [[SI_SPILL_V128_RESTORE25:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5)
-    ; CHECK: undef %70.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE25]].sub0 {
-    ; CHECK:   internal %70.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE25]].sub2
-    ; CHECK: }
-    ; CHECK: %70.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK: %70.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %70, %2, 0, 64, 0, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1)
-    ; CHECK: undef %66.sub0:vreg_128 = COPY %68.sub0 {
-    ; CHECK:   internal %66.sub2:vreg_128 = COPY %68.sub2
+    ; CHECK: undef %66.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE25]].sub0 {
+    ; CHECK:   internal %66.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE25]].sub2
     ; CHECK: }
     ; CHECK: %66.sub1:vreg_128 = COPY %43.sub1
     ; CHECK: %66.sub3:vreg_128 = COPY %43.sub1

diff --git a/llvm/test/CodeGen/AMDGPU/srl.ll b/llvm/test/CodeGen/AMDGPU/srl.ll
index 43ecb0d9188ba..882d081d11ad4 100644
--- a/llvm/test/CodeGen/AMDGPU/srl.ll
+++ b/llvm/test/CodeGen/AMDGPU/srl.ll
@@ -295,30 +295,30 @@ define amdgpu_kernel void @lshr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> ad
 ;
 ; VI-LABEL: lshr_v4i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s19, 0xf000
-; VI-NEXT:    s_mov_b32 s18, -1
+; VI-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s16, s8
-; VI-NEXT:    s_mov_b32 s17, s9
-; VI-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
-; VI-NEXT:    s_load_dwordx8 s[8:15], s[10:11], 0x20
+; VI-NEXT:    s_mov_b32 s0, s12
+; VI-NEXT:    s_mov_b32 s1, s13
+; VI-NEXT:    s_load_dwordx8 s[4:11], s[14:15], 0x0
+; VI-NEXT:    s_load_dwordx8 s[12:19], s[14:15], 0x20
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_lshr_b64 s[10:11], s[10:11], s18
+; VI-NEXT:    s_lshr_b64 s[8:9], s[8:9], s16
 ; VI-NEXT:    s_lshr_b64 s[6:7], s[6:7], s14
 ; VI-NEXT:    s_lshr_b64 s[4:5], s[4:5], s12
-; VI-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
-; VI-NEXT:    s_lshr_b64 s[0:1], s[0:1], s8
+; VI-NEXT:    v_mov_b32_e32 v0, s8
+; VI-NEXT:    v_mov_b32_e32 v1, s9
+; VI-NEXT:    v_mov_b32_e32 v2, s10
+; VI-NEXT:    v_mov_b32_e32 v3, s11
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; VI-NEXT:    s_nop 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
 ; VI-NEXT:    v_mov_b32_e32 v3, s7
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
-; VI-NEXT:    s_nop 0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: lshr_v4i64:

diff --git a/llvm/test/CodeGen/ARM/fptosi-sat-scalar.ll b/llvm/test/CodeGen/ARM/fptosi-sat-scalar.ll
index bcbd88288c587..9544e395744a4 100644
--- a/llvm/test/CodeGen/ARM/fptosi-sat-scalar.ll
+++ b/llvm/test/CodeGen/ARM/fptosi-sat-scalar.ll
@@ -2500,54 +2500,54 @@ define i100 @test_signed_i100_f64(double %f) nounwind {
 ; FP16-NEXT:    mov r4, r1
 ; FP16-NEXT:    mov r5, r0
 ; FP16-NEXT:    bl __fixdfti
-; FP16-NEXT:    vldr d0, .LCPI18_0
-; FP16-NEXT:    vmov d1, r5, r4
-; FP16-NEXT:    vldr d2, .LCPI18_1
-; FP16-NEXT:    vcmp.f64 d1, d0
+; FP16-NEXT:    vldr d2, .LCPI18_0
+; FP16-NEXT:    vmov d0, r5, r4
+; FP16-NEXT:    vldr d1, .LCPI18_1
+; FP16-NEXT:    vcmp.f64 d0, d2
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
 ; FP16-NEXT:    it lt
 ; FP16-NEXT:    movlt r0, #0
-; FP16-NEXT:    vcmp.f64 d1, d2
+; FP16-NEXT:    vcmp.f64 d0, d1
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
 ; FP16-NEXT:    it gt
 ; FP16-NEXT:    movgt.w r0, #-1
-; FP16-NEXT:    vcmp.f64 d1, d1
+; FP16-NEXT:    vcmp.f64 d0, d0
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
 ; FP16-NEXT:    it vs
 ; FP16-NEXT:    movvs r0, #0
-; FP16-NEXT:    vcmp.f64 d1, d0
+; FP16-NEXT:    vcmp.f64 d0, d2
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
 ; FP16-NEXT:    it lt
 ; FP16-NEXT:    movlt r1, #0
-; FP16-NEXT:    vcmp.f64 d1, d2
+; FP16-NEXT:    vcmp.f64 d0, d1
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
 ; FP16-NEXT:    it gt
 ; FP16-NEXT:    movgt.w r1, #-1
-; FP16-NEXT:    vcmp.f64 d1, d1
+; FP16-NEXT:    vcmp.f64 d0, d0
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
 ; FP16-NEXT:    it vs
 ; FP16-NEXT:    movvs r1, #0
-; FP16-NEXT:    vcmp.f64 d1, d0
+; FP16-NEXT:    vcmp.f64 d0, d2
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
 ; FP16-NEXT:    it lt
 ; FP16-NEXT:    movlt r2, #0
-; FP16-NEXT:    vcmp.f64 d1, d2
+; FP16-NEXT:    vcmp.f64 d0, d1
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
 ; FP16-NEXT:    it gt
 ; FP16-NEXT:    movgt.w r2, #-1
-; FP16-NEXT:    vcmp.f64 d1, d1
+; FP16-NEXT:    vcmp.f64 d0, d0
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
 ; FP16-NEXT:    it vs
 ; FP16-NEXT:    movvs r2, #0
-; FP16-NEXT:    vcmp.f64 d1, d0
+; FP16-NEXT:    vcmp.f64 d0, d2
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
 ; FP16-NEXT:    it lt
 ; FP16-NEXT:    mvnlt r3, #7
-; FP16-NEXT:    vcmp.f64 d1, d2
+; FP16-NEXT:    vcmp.f64 d0, d1
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
 ; FP16-NEXT:    it gt
 ; FP16-NEXT:    movgt r3, #7
-; FP16-NEXT:    vcmp.f64 d1, d1
+; FP16-NEXT:    vcmp.f64 d0, d0
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
 ; FP16-NEXT:    it vs
 ; FP16-NEXT:    movvs r3, #0
@@ -2809,54 +2809,54 @@ define i128 @test_signed_i128_f64(double %f) nounwind {
 ; FP16-NEXT:    mov r4, r1
 ; FP16-NEXT:    mov r5, r0
 ; FP16-NEXT:    bl __fixdfti
-; FP16-NEXT:    vldr d0, .LCPI19_0
-; FP16-NEXT:    vmov d1, r5, r4
-; FP16-NEXT:    vldr d2, .LCPI19_1
-; FP16-NEXT:    vcmp.f64 d1, d0
+; FP16-NEXT:    vldr d2, .LCPI19_0
+; FP16-NEXT:    vmov d0, r5, r4
+; FP16-NEXT:    vldr d1, .LCPI19_1
+; FP16-NEXT:    vcmp.f64 d0, d2
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
 ; FP16-NEXT:    it lt
 ; FP16-NEXT:    movlt r0, #0
-; FP16-NEXT:    vcmp.f64 d1, d2
+; FP16-NEXT:    vcmp.f64 d0, d1
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
 ; FP16-NEXT:    it gt
 ; FP16-NEXT:    movgt.w r0, #-1
-; FP16-NEXT:    vcmp.f64 d1, d1
+; FP16-NEXT:    vcmp.f64 d0, d0
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
 ; FP16-NEXT:    it vs
 ; FP16-NEXT:    movvs r0, #0
-; FP16-NEXT:    vcmp.f64 d1, d0
+; FP16-NEXT:    vcmp.f64 d0, d2
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
 ; FP16-NEXT:    it lt
 ; FP16-NEXT:    movlt r1, #0
-; FP16-NEXT:    vcmp.f64 d1, d2
+; FP16-NEXT:    vcmp.f64 d0, d1
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
 ; FP16-NEXT:    it gt
 ; FP16-NEXT:    movgt.w r1, #-1
-; FP16-NEXT:    vcmp.f64 d1, d1
+; FP16-NEXT:    vcmp.f64 d0, d0
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
 ; FP16-NEXT:    it vs
 ; FP16-NEXT:    movvs r1, #0
-; FP16-NEXT:    vcmp.f64 d1, d0
+; FP16-NEXT:    vcmp.f64 d0, d2
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
 ; FP16-NEXT:    it lt
 ; FP16-NEXT:    movlt r2, #0
-; FP16-NEXT:    vcmp.f64 d1, d2
+; FP16-NEXT:    vcmp.f64 d0, d1
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
 ; FP16-NEXT:    it gt
 ; FP16-NEXT:    movgt.w r2, #-1
-; FP16-NEXT:    vcmp.f64 d1, d1
+; FP16-NEXT:    vcmp.f64 d0, d0
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
 ; FP16-NEXT:    it vs
 ; FP16-NEXT:    movvs r2, #0
-; FP16-NEXT:    vcmp.f64 d1, d0
+; FP16-NEXT:    vcmp.f64 d0, d2
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
 ; FP16-NEXT:    it lt
 ; FP16-NEXT:    movlt.w r3, #-2147483648
-; FP16-NEXT:    vcmp.f64 d1, d2
+; FP16-NEXT:    vcmp.f64 d0, d1
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
 ; FP16-NEXT:    it gt
 ; FP16-NEXT:    mvngt r3, #-2147483648
-; FP16-NEXT:    vcmp.f64 d1, d1
+; FP16-NEXT:    vcmp.f64 d0, d0
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
 ; FP16-NEXT:    it vs
 ; FP16-NEXT:    movvs r3, #0

diff --git a/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll
index 5ba72fcb427c6..aebec8c7dba04 100644
--- a/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll
@@ -370,40 +370,40 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind {
 ; ARM7:       @ %bb.0:
 ; ARM7-NEXT:    push {r4, r5, r6, r7, r11, lr}
 ; ARM7-NEXT:    vpush {d8, d9}
-; ARM7-NEXT:    mov r5, r0
+; ARM7-NEXT:    mov r6, r0
 ; ARM7-NEXT:    and r0, r3, #1
-; ARM7-NEXT:    mov r4, r1
+; ARM7-NEXT:    mov r5, r1
 ; ARM7-NEXT:    rsb r1, r0, #0
 ; ARM7-NEXT:    mov r0, r2
 ; ARM7-NEXT:    mov r2, #9
 ; ARM7-NEXT:    mov r3, #0
 ; ARM7-NEXT:    bl __moddi3
-; ARM7-NEXT:    mov r6, r0
-; ARM7-NEXT:    and r0, r4, #1
-; ARM7-NEXT:    mov r7, r1
+; ARM7-NEXT:    mov r7, r0
+; ARM7-NEXT:    and r0, r5, #1
+; ARM7-NEXT:    mov r4, r1
 ; ARM7-NEXT:    rsb r1, r0, #0
-; ARM7-NEXT:    mov r0, r5
+; ARM7-NEXT:    mov r0, r6
 ; ARM7-NEXT:    mov r2, #9
 ; ARM7-NEXT:    mov r3, #0
 ; ARM7-NEXT:    bl __moddi3
 ; ARM7-NEXT:    vmov.32 d8[0], r0
 ; ARM7-NEXT:    ldr r0, [sp, #44]
 ; ARM7-NEXT:    ldr r2, [sp, #40]
-; ARM7-NEXT:    mov r4, r1
+; ARM7-NEXT:    mov r5, r1
 ; ARM7-NEXT:    and r0, r0, #1
 ; ARM7-NEXT:    mvn r3, #0
 ; ARM7-NEXT:    rsb r1, r0, #0
-; ARM7-NEXT:    vmov.32 d9[0], r6
+; ARM7-NEXT:    vmov.32 d9[0], r7
 ; ARM7-NEXT:    mov r0, r2
 ; ARM7-NEXT:    mvn r2, #8
 ; ARM7-NEXT:    bl __moddi3
 ; ARM7-NEXT:    vmov.32 d16[0], r0
 ; ARM7-NEXT:    adr r0, .LCPI3_0
-; ARM7-NEXT:    vmov.32 d9[1], r7
+; ARM7-NEXT:    vmov.32 d9[1], r4
 ; ARM7-NEXT:    vld1.64 {d18, d19}, [r0:128]
 ; ARM7-NEXT:    adr r0, .LCPI3_1
 ; ARM7-NEXT:    vmov.32 d16[1], r1
-; ARM7-NEXT:    vmov.32 d8[1], r4
+; ARM7-NEXT:    vmov.32 d8[1], r5
 ; ARM7-NEXT:    vand q8, q8, q9
 ; ARM7-NEXT:    vld1.64 {d20, d21}, [r0:128]
 ; ARM7-NEXT:    adr r0, .LCPI3_2
@@ -446,40 +446,40 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind {
 ; ARM8:       @ %bb.0:
 ; ARM8-NEXT:    push {r4, r5, r6, r7, r11, lr}
 ; ARM8-NEXT:    vpush {d8, d9}
-; ARM8-NEXT:    mov r5, r0
+; ARM8-NEXT:    mov r6, r0
 ; ARM8-NEXT:    and r0, r3, #1
-; ARM8-NEXT:    mov r4, r1
+; ARM8-NEXT:    mov r5, r1
 ; ARM8-NEXT:    rsb r1, r0, #0
 ; ARM8-NEXT:    mov r0, r2
 ; ARM8-NEXT:    mov r2, #9
 ; ARM8-NEXT:    mov r3, #0
 ; ARM8-NEXT:    bl __moddi3
-; ARM8-NEXT:    mov r6, r0
-; ARM8-NEXT:    and r0, r4, #1
-; ARM8-NEXT:    mov r7, r1
+; ARM8-NEXT:    mov r7, r0
+; ARM8-NEXT:    and r0, r5, #1
+; ARM8-NEXT:    mov r4, r1
 ; ARM8-NEXT:    rsb r1, r0, #0
-; ARM8-NEXT:    mov r0, r5
+; ARM8-NEXT:    mov r0, r6
 ; ARM8-NEXT:    mov r2, #9
 ; ARM8-NEXT:    mov r3, #0
 ; ARM8-NEXT:    bl __moddi3
 ; ARM8-NEXT:    vmov.32 d8[0], r0
 ; ARM8-NEXT:    ldr r0, [sp, #44]
 ; ARM8-NEXT:    ldr r2, [sp, #40]
-; ARM8-NEXT:    mov r4, r1
+; ARM8-NEXT:    mov r5, r1
 ; ARM8-NEXT:    and r0, r0, #1
 ; ARM8-NEXT:    mvn r3, #0
 ; ARM8-NEXT:    rsb r1, r0, #0
-; ARM8-NEXT:    vmov.32 d9[0], r6
+; ARM8-NEXT:    vmov.32 d9[0], r7
 ; ARM8-NEXT:    mov r0, r2
 ; ARM8-NEXT:    mvn r2, #8
 ; ARM8-NEXT:    bl __moddi3
 ; ARM8-NEXT:    vmov.32 d16[0], r0
 ; ARM8-NEXT:    adr r0, .LCPI3_0
-; ARM8-NEXT:    vmov.32 d9[1], r7
+; ARM8-NEXT:    vmov.32 d9[1], r4
 ; ARM8-NEXT:    vld1.64 {d18, d19}, [r0:128]
 ; ARM8-NEXT:    adr r0, .LCPI3_1
 ; ARM8-NEXT:    vmov.32 d16[1], r1
-; ARM8-NEXT:    vmov.32 d8[1], r4
+; ARM8-NEXT:    vmov.32 d8[1], r5
 ; ARM8-NEXT:    vand q8, q8, q9
 ; ARM8-NEXT:    vld1.64 {d20, d21}, [r0:128]
 ; ARM8-NEXT:    adr r0, .LCPI3_2
@@ -522,40 +522,40 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind {
 ; NEON7:       @ %bb.0:
 ; NEON7-NEXT:    push {r4, r5, r6, r7, r11, lr}
 ; NEON7-NEXT:    vpush {d8, d9}
-; NEON7-NEXT:    mov r5, r0
+; NEON7-NEXT:    mov r6, r0
 ; NEON7-NEXT:    and r0, r3, #1
-; NEON7-NEXT:    mov r4, r1
+; NEON7-NEXT:    mov r5, r1
 ; NEON7-NEXT:    rsb r1, r0, #0
 ; NEON7-NEXT:    mov r0, r2
 ; NEON7-NEXT:    mov r2, #9
 ; NEON7-NEXT:    mov r3, #0
 ; NEON7-NEXT:    bl __moddi3
-; NEON7-NEXT:    mov r6, r0
-; NEON7-NEXT:    and r0, r4, #1
-; NEON7-NEXT:    mov r7, r1
+; NEON7-NEXT:    mov r7, r0
+; NEON7-NEXT:    and r0, r5, #1
+; NEON7-NEXT:    mov r4, r1
 ; NEON7-NEXT:    rsb r1, r0, #0
-; NEON7-NEXT:    mov r0, r5
+; NEON7-NEXT:    mov r0, r6
 ; NEON7-NEXT:    mov r2, #9
 ; NEON7-NEXT:    mov r3, #0
 ; NEON7-NEXT:    bl __moddi3
 ; NEON7-NEXT:    vmov.32 d8[0], r0
 ; NEON7-NEXT:    ldr r0, [sp, #44]
 ; NEON7-NEXT:    ldr r2, [sp, #40]
-; NEON7-NEXT:    mov r4, r1
+; NEON7-NEXT:    mov r5, r1
 ; NEON7-NEXT:    and r0, r0, #1
 ; NEON7-NEXT:    mvn r3, #0
 ; NEON7-NEXT:    rsb r1, r0, #0
-; NEON7-NEXT:    vmov.32 d9[0], r6
+; NEON7-NEXT:    vmov.32 d9[0], r7
 ; NEON7-NEXT:    mov r0, r2
 ; NEON7-NEXT:    mvn r2, #8
 ; NEON7-NEXT:    bl __moddi3
 ; NEON7-NEXT:    vmov.32 d16[0], r0
 ; NEON7-NEXT:    adr r0, .LCPI3_0
-; NEON7-NEXT:    vmov.32 d9[1], r7
+; NEON7-NEXT:    vmov.32 d9[1], r4
 ; NEON7-NEXT:    vld1.64 {d18, d19}, [r0:128]
 ; NEON7-NEXT:    adr r0, .LCPI3_1
 ; NEON7-NEXT:    vmov.32 d16[1], r1
-; NEON7-NEXT:    vmov.32 d8[1], r4
+; NEON7-NEXT:    vmov.32 d8[1], r5
 ; NEON7-NEXT:    vand q8, q8, q9
 ; NEON7-NEXT:    vld1.64 {d20, d21}, [r0:128]
 ; NEON7-NEXT:    adr r0, .LCPI3_2
@@ -598,40 +598,40 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind {
 ; NEON8:       @ %bb.0:
 ; NEON8-NEXT:    push {r4, r5, r6, r7, r11, lr}
 ; NEON8-NEXT:    vpush {d8, d9}
-; NEON8-NEXT:    mov r5, r0
+; NEON8-NEXT:    mov r6, r0
 ; NEON8-NEXT:    and r0, r3, #1
-; NEON8-NEXT:    mov r4, r1
+; NEON8-NEXT:    mov r5, r1
 ; NEON8-NEXT:    rsb r1, r0, #0
 ; NEON8-NEXT:    mov r0, r2
 ; NEON8-NEXT:    mov r2, #9
 ; NEON8-NEXT:    mov r3, #0
 ; NEON8-NEXT:    bl __moddi3
-; NEON8-NEXT:    mov r6, r0
-; NEON8-NEXT:    and r0, r4, #1
-; NEON8-NEXT:    mov r7, r1
+; NEON8-NEXT:    mov r7, r0
+; NEON8-NEXT:    and r0, r5, #1
+; NEON8-NEXT:    mov r4, r1
 ; NEON8-NEXT:    rsb r1, r0, #0
-; NEON8-NEXT:    mov r0, r5
+; NEON8-NEXT:    mov r0, r6
 ; NEON8-NEXT:    mov r2, #9
 ; NEON8-NEXT:    mov r3, #0
 ; NEON8-NEXT:    bl __moddi3
 ; NEON8-NEXT:    vmov.32 d8[0], r0
 ; NEON8-NEXT:    ldr r0, [sp, #44]
 ; NEON8-NEXT:    ldr r2, [sp, #40]
-; NEON8-NEXT:    mov r4, r1
+; NEON8-NEXT:    mov r5, r1
 ; NEON8-NEXT:    and r0, r0, #1
 ; NEON8-NEXT:    mvn r3, #0
 ; NEON8-NEXT:    rsb r1, r0, #0
-; NEON8-NEXT:    vmov.32 d9[0], r6
+; NEON8-NEXT:    vmov.32 d9[0], r7
 ; NEON8-NEXT:    mov r0, r2
 ; NEON8-NEXT:    mvn r2, #8
 ; NEON8-NEXT:    bl __moddi3
 ; NEON8-NEXT:    vmov.32 d16[0], r0
 ; NEON8-NEXT:    adr r0, .LCPI3_0
-; NEON8-NEXT:    vmov.32 d9[1], r7
+; NEON8-NEXT:    vmov.32 d9[1], r4
 ; NEON8-NEXT:    vld1.64 {d18, d19}, [r0:128]
 ; NEON8-NEXT:    adr r0, .LCPI3_1
 ; NEON8-NEXT:    vmov.32 d16[1], r1
-; NEON8-NEXT:    vmov.32 d8[1], r4
+; NEON8-NEXT:    vmov.32 d8[1], r5
 ; NEON8-NEXT:    vand q8, q8, q9
 ; NEON8-NEXT:    vld1.64 {d20, d21}, [r0:128]
 ; NEON8-NEXT:    adr r0, .LCPI3_2

diff --git a/llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll
index d488599a42b40..afd75940b4593 100644
--- a/llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll
@@ -9,7 +9,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; ARMV6-NEXT:    sub sp, sp, #28
 ; ARMV6-NEXT:    ldr r7, [sp, #72]
 ; ARMV6-NEXT:    mov r6, r0
-; ARMV6-NEXT:    str r0, [sp, #8]            @ 4-byte Spill
+; ARMV6-NEXT:    str r0, [sp, #8] @ 4-byte Spill
 ; ARMV6-NEXT:    ldr r4, [sp, #84]
 ; ARMV6-NEXT:    umull r1, r0, r2, r7
 ; ARMV6-NEXT:    mov lr, r7
@@ -17,16 +17,16 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; ARMV6-NEXT:    str r1, [r6]
 ; ARMV6-NEXT:    ldr r6, [sp, #80]
 ; ARMV6-NEXT:    umull r1, r7, r3, r6
-; ARMV6-NEXT:    str r7, [sp, #12]           @ 4-byte Spill
+; ARMV6-NEXT:    str r7, [sp, #12] @ 4-byte Spill
 ; ARMV6-NEXT:    add r1, r5, r1
 ; ARMV6-NEXT:    umull r7, r5, r6, r2
 ; ARMV6-NEXT:    mov r6, lr
-; ARMV6-NEXT:    str r7, [sp, #16]           @ 4-byte Spill
+; ARMV6-NEXT:    str r7, [sp, #16] @ 4-byte Spill
 ; ARMV6-NEXT:    mov r7, #0
 ; ARMV6-NEXT:    adds r1, r5, r1
-; ARMV6-NEXT:    str r1, [sp, #4]            @ 4-byte Spill
+; ARMV6-NEXT:    str r1, [sp, #4] @ 4-byte Spill
 ; ARMV6-NEXT:    adc r1, r7, #0
-; ARMV6-NEXT:    str r1, [sp, #24]           @ 4-byte Spill
+; ARMV6-NEXT:    str r1, [sp, #24] @ 4-byte Spill
 ; ARMV6-NEXT:    ldr r1, [sp, #64]
 ; ARMV6-NEXT:    ldr r7, [sp, #76]
 ; ARMV6-NEXT:    ldr r5, [sp, #64]
@@ -40,15 +40,15 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; ARMV6-NEXT:    adds r12, lr, r12
 ; ARMV6-NEXT:    umull r2, lr, r2, r7
 ; ARMV6-NEXT:    adc r6, r6, #0
-; ARMV6-NEXT:    str r6, [sp, #20]           @ 4-byte Spill
-; ARMV6-NEXT:    ldr r6, [sp, #16]           @ 4-byte Reload
+; ARMV6-NEXT:    str r6, [sp, #20] @ 4-byte Spill
+; ARMV6-NEXT:    ldr r6, [sp, #16] @ 4-byte Reload
 ; ARMV6-NEXT:    adds r11, r11, r6
-; ARMV6-NEXT:    ldr r6, [sp, #4]            @ 4-byte Reload
+; ARMV6-NEXT:    ldr r6, [sp, #4] @ 4-byte Reload
 ; ARMV6-NEXT:    adc r6, r12, r6
 ; ARMV6-NEXT:    mov r12, #0
 ; ARMV6-NEXT:    umlal r0, r12, r3, r5
-; ARMV6-NEXT:    ldr r5, [sp, #8]            @ 4-byte Reload
-; ARMV6-NEXT:    str r6, [sp, #16]           @ 4-byte Spill
+; ARMV6-NEXT:    ldr r5, [sp, #8] @ 4-byte Reload
+; ARMV6-NEXT:    str r6, [sp, #16] @ 4-byte Spill
 ; ARMV6-NEXT:    ldr r6, [sp, #64]
 ; ARMV6-NEXT:    adds r0, r2, r0
 ; ARMV6-NEXT:    str r0, [r5, #4]
@@ -62,7 +62,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; ARMV6-NEXT:    orrs r12, r6, r4
 ; ARMV6-NEXT:    movne r12, #1
 ; ARMV6-NEXT:    cmp r9, #0
-; ARMV6-NEXT:    ldr r6, [sp, #12]           @ 4-byte Reload
+; ARMV6-NEXT:    ldr r6, [sp, #12] @ 4-byte Reload
 ; ARMV6-NEXT:    movne r9, #1
 ; ARMV6-NEXT:    cmp r8, #0
 ; ARMV6-NEXT:    movne r8, #1
@@ -81,17 +81,17 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; ARMV6-NEXT:    adds r0, r0, r11
 ; ARMV6-NEXT:    str r0, [r5, #8]
 ; ARMV6-NEXT:    and r1, r1, r7
-; ARMV6-NEXT:    ldr r0, [sp, #16]           @ 4-byte Reload
+; ARMV6-NEXT:    ldr r0, [sp, #16] @ 4-byte Reload
 ; ARMV6-NEXT:    orr r1, r1, r8
 ; ARMV6-NEXT:    orr r1, r1, r9
 ; ARMV6-NEXT:    adcs r0, r2, r0
 ; ARMV6-NEXT:    str r0, [r5, #12]
 ; ARMV6-NEXT:    and r0, r4, r3
-; ARMV6-NEXT:    ldr r2, [sp, #24]           @ 4-byte Reload
+; ARMV6-NEXT:    ldr r2, [sp, #24] @ 4-byte Reload
 ; ARMV6-NEXT:    orr r0, r0, r10
 ; ARMV6-NEXT:    orr r0, r0, r6
 ; ARMV6-NEXT:    orr r0, r0, r2
-; ARMV6-NEXT:    ldr r2, [sp, #20]           @ 4-byte Reload
+; ARMV6-NEXT:    ldr r2, [sp, #20] @ 4-byte Reload
 ; ARMV6-NEXT:    orr r1, r1, r2
 ; ARMV6-NEXT:    and r2, lr, r12
 ; ARMV6-NEXT:    orr r1, r2, r1
@@ -115,51 +115,51 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; ARMV7-NEXT:    ldr r9, [sp, #76]
 ; ARMV7-NEXT:    umull r4, lr, r5, r1
 ; ARMV7-NEXT:    umull r0, r7, r2, r10
-; ARMV7-NEXT:    str r4, [sp, #24]           @ 4-byte Spill
+; ARMV7-NEXT:    str r4, [sp, #24] @ 4-byte Spill
 ; ARMV7-NEXT:    ldr r4, [sp, #88]
 ; ARMV7-NEXT:    umull r1, r6, r1, r10
-; ARMV7-NEXT:    str r0, [sp, #32]           @ 4-byte Spill
+; ARMV7-NEXT:    str r0, [sp, #32] @ 4-byte Spill
 ; ARMV7-NEXT:    umull r11, r0, r2, r5
-; ARMV7-NEXT:    str r6, [sp, #20]           @ 4-byte Spill
-; ARMV7-NEXT:    str r1, [sp, #28]           @ 4-byte Spill
+; ARMV7-NEXT:    str r6, [sp, #20] @ 4-byte Spill
+; ARMV7-NEXT:    str r1, [sp, #28] @ 4-byte Spill
 ; ARMV7-NEXT:    umull r6, r12, r3, r4
 ; ARMV7-NEXT:    ldr r1, [sp, #92]
-; ARMV7-NEXT:    str r0, [sp, #8]            @ 4-byte Spill
+; ARMV7-NEXT:    str r0, [sp, #8] @ 4-byte Spill
 ; ARMV7-NEXT:    mov r0, #0
 ; ARMV7-NEXT:    umlal r7, r0, r3, r10
-; ARMV7-NEXT:    str r6, [sp, #16]           @ 4-byte Spill
+; ARMV7-NEXT:    str r6, [sp, #16] @ 4-byte Spill
 ; ARMV7-NEXT:    umull r6, r1, r1, r2
 ; ARMV7-NEXT:    umull r2, r4, r4, r2
-; ARMV7-NEXT:    str r6, [sp, #4]            @ 4-byte Spill
-; ARMV7-NEXT:    str r2, [sp, #12]           @ 4-byte Spill
+; ARMV7-NEXT:    str r6, [sp, #4] @ 4-byte Spill
+; ARMV7-NEXT:    str r2, [sp, #12] @ 4-byte Spill
 ; ARMV7-NEXT:    adds r2, r11, r7
-; ARMV7-NEXT:    ldr r7, [sp, #8]            @ 4-byte Reload
+; ARMV7-NEXT:    ldr r7, [sp, #8] @ 4-byte Reload
 ; ARMV7-NEXT:    mov r11, #0
-; ARMV7-NEXT:    str r4, [sp]                @ 4-byte Spill
+; ARMV7-NEXT:    str r4, [sp] @ 4-byte Spill
 ; ARMV7-NEXT:    umull r6, r4, r9, r10
 ; ARMV7-NEXT:    adcs r9, r0, r7
-; ARMV7-NEXT:    ldr r0, [sp, #32]           @ 4-byte Reload
+; ARMV7-NEXT:    ldr r0, [sp, #32] @ 4-byte Reload
 ; ARMV7-NEXT:    adc r10, r11, #0
 ; ARMV7-NEXT:    stm r8, {r0, r2}
-; ARMV7-NEXT:    ldr r0, [sp, #24]           @ 4-byte Reload
+; ARMV7-NEXT:    ldr r0, [sp, #24] @ 4-byte Reload
 ; ARMV7-NEXT:    umlal r9, r10, r3, r5
-; ARMV7-NEXT:    ldr r2, [sp, #20]           @ 4-byte Reload
+; ARMV7-NEXT:    ldr r2, [sp, #20] @ 4-byte Reload
 ; ARMV7-NEXT:    add r0, r6, r0
-; ARMV7-NEXT:    adds r2, r2, r0
-; ARMV7-NEXT:    ldr r6, [sp, #4]            @ 4-byte Reload
-; ARMV7-NEXT:    adc r0, r11, #0
-; ARMV7-NEXT:    str r0, [sp, #32]           @ 4-byte Spill
-; ARMV7-NEXT:    ldr r0, [sp, #16]           @ 4-byte Reload
-; ARMV7-NEXT:    ldr r7, [sp, #28]           @ 4-byte Reload
-; ARMV7-NEXT:    add r0, r6, r0
-; ARMV7-NEXT:    ldr r6, [sp]                @ 4-byte Reload
-; ARMV7-NEXT:    adds r0, r6, r0
-; ARMV7-NEXT:    ldr r6, [sp, #12]           @ 4-byte Reload
+; ARMV7-NEXT:    adds r0, r2, r0
+; ARMV7-NEXT:    ldr r6, [sp, #4] @ 4-byte Reload
+; ARMV7-NEXT:    adc r2, r11, #0
+; ARMV7-NEXT:    str r2, [sp, #32] @ 4-byte Spill
+; ARMV7-NEXT:    ldr r2, [sp, #16] @ 4-byte Reload
+; ARMV7-NEXT:    ldr r7, [sp, #28] @ 4-byte Reload
+; ARMV7-NEXT:    add r2, r6, r2
+; ARMV7-NEXT:    ldr r6, [sp] @ 4-byte Reload
+; ARMV7-NEXT:    adds r2, r6, r2
+; ARMV7-NEXT:    ldr r6, [sp, #12] @ 4-byte Reload
 ; ARMV7-NEXT:    adc r11, r11, #0
 ; ARMV7-NEXT:    adds r7, r7, r6
 ; ARMV7-NEXT:    ldr r6, [sp, #92]
-; ARMV7-NEXT:    adc r0, r2, r0
-; ARMV7-NEXT:    str r0, [sp, #28]           @ 4-byte Spill
+; ARMV7-NEXT:    adc r0, r0, r2
+; ARMV7-NEXT:    str r0, [sp, #28] @ 4-byte Spill
 ; ARMV7-NEXT:    ldr r0, [sp, #92]
 ; ARMV7-NEXT:    cmp r3, #0
 ; ARMV7-NEXT:    movwne r3, #1
@@ -195,11 +195,11 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; ARMV7-NEXT:    adds r7, r9, r7
 ; ARMV7-NEXT:    str r7, [r8, #8]
 ; ARMV7-NEXT:    and r2, r2, r3
-; ARMV7-NEXT:    ldr r7, [sp, #28]           @ 4-byte Reload
+; ARMV7-NEXT:    ldr r7, [sp, #28] @ 4-byte Reload
 ; ARMV7-NEXT:    orr r0, r0, r11
 ; ARMV7-NEXT:    adcs r7, r10, r7
 ; ARMV7-NEXT:    str r7, [r8, #12]
-; ARMV7-NEXT:    ldr r7, [sp, #32]           @ 4-byte Reload
+; ARMV7-NEXT:    ldr r7, [sp, #32] @ 4-byte Reload
 ; ARMV7-NEXT:    orr r1, r1, r7
 ; ARMV7-NEXT:    orr r1, r2, r1
 ; ARMV7-NEXT:    orr r0, r1, r0

diff  --git a/llvm/test/CodeGen/Hexagon/reg-scavengebug-2.ll b/llvm/test/CodeGen/Hexagon/reg-scavengebug-2.ll
index 3eb0c5e74725e..4b2dd36e6c14f 100644
--- a/llvm/test/CodeGen/Hexagon/reg-scavengebug-2.ll
+++ b/llvm/test/CodeGen/Hexagon/reg-scavengebug-2.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -O3 -march=hexagon < %s | FileCheck %s
-; CHECK: v{{[0-9]+}} = vmem(r{{[0-9]+}}+#0)
+; CHECK: v{{[0-9]+}}.cur = vmem(r{{[0-9]+}}+#0)
 
 target triple = "hexagon"
 

diff  --git a/llvm/test/CodeGen/Mips/cconv/vector.ll b/llvm/test/CodeGen/Mips/cconv/vector.ll
index 2187a9b9bf6a0..b8c474a7c90a0 100644
--- a/llvm/test/CodeGen/Mips/cconv/vector.ll
+++ b/llvm/test/CodeGen/Mips/cconv/vector.ll
@@ -1027,85 +1027,85 @@ define <16 x i8> @i8_16(<16 x i8> %a, <16 x i8> %b) {
 ; MIPS64-NEXT:    sll $10, $10, 0
 ; MIPS64-NEXT:    addu $9, $10, $9
 ; MIPS64-NEXT:    addu $2, $8, $2
-; MIPS64-NEXT:    sll $1, $1, 8
+; MIPS64-NEXT:    sll $8, $1, 8
 ; MIPS64-NEXT:    andi $3, $3, 255
-; MIPS64-NEXT:    sll $8, $12, 8
+; MIPS64-NEXT:    sll $1, $12, 8
 ; MIPS64-NEXT:    sll $10, $11, 0
 ; MIPS64-NEXT:    dsrl $11, $5, 32
 ; MIPS64-NEXT:    sll $11, $11, 0
 ; MIPS64-NEXT:    addu $10, $11, $10
 ; MIPS64-NEXT:    andi $10, $10, 255
-; MIPS64-NEXT:    or $8, $10, $8
-; MIPS64-NEXT:    sll $10, $6, 0
-; MIPS64-NEXT:    or $1, $3, $1
+; MIPS64-NEXT:    or $10, $10, $1
+; MIPS64-NEXT:    sll $1, $6, 0
+; MIPS64-NEXT:    or $8, $3, $8
 ; MIPS64-NEXT:    sll $2, $2, 8
-; MIPS64-NEXT:    andi $3, $9, 255
-; MIPS64-NEXT:    dsrl $9, $6, 40
-; MIPS64-NEXT:    srl $11, $10, 24
+; MIPS64-NEXT:    andi $9, $9, 255
+; MIPS64-NEXT:    dsrl $11, $6, 40
+; MIPS64-NEXT:    srl $3, $1, 24
 ; MIPS64-NEXT:    sll $12, $4, 0
 ; MIPS64-NEXT:    srl $13, $12, 24
-; MIPS64-NEXT:    srl $14, $10, 16
+; MIPS64-NEXT:    srl $14, $1, 16
 ; MIPS64-NEXT:    srl $15, $12, 16
-; MIPS64-NEXT:    andi $8, $8, 65535
+; MIPS64-NEXT:    andi $10, $10, 65535
 ; MIPS64-NEXT:    addu $14, $15, $14
-; MIPS64-NEXT:    addu $11, $13, $11
-; MIPS64-NEXT:    sll $7, $7, 0
-; MIPS64-NEXT:    or $2, $3, $2
-; MIPS64-NEXT:    sll $1, $1, 16
-; MIPS64-NEXT:    sll $3, $9, 0
+; MIPS64-NEXT:    addu $13, $13, $3
+; MIPS64-NEXT:    sll $3, $7, 0
+; MIPS64-NEXT:    or $2, $9, $2
+; MIPS64-NEXT:    sll $7, $8, 16
+; MIPS64-NEXT:    sll $8, $11, 0
 ; MIPS64-NEXT:    dsrl $9, $4, 40
 ; MIPS64-NEXT:    sll $9, $9, 0
-; MIPS64-NEXT:    addu $3, $9, $3
+; MIPS64-NEXT:    addu $8, $9, $8
 ; MIPS64-NEXT:    dsrl $6, $6, 32
-; MIPS64-NEXT:    srl $9, $7, 24
+; MIPS64-NEXT:    srl $9, $3, 24
 ; MIPS64-NEXT:    sll $5, $5, 0
-; MIPS64-NEXT:    srl $13, $5, 24
-; MIPS64-NEXT:    or $1, $8, $1
-; MIPS64-NEXT:    addu $8, $13, $9
-; MIPS64-NEXT:    sll $9, $11, 8
+; MIPS64-NEXT:    srl $11, $5, 24
+; MIPS64-NEXT:    or $7, $10, $7
+; MIPS64-NEXT:    addu $9, $11, $9
+; MIPS64-NEXT:    sll $10, $13, 8
 ; MIPS64-NEXT:    andi $11, $14, 255
 ; MIPS64-NEXT:    sll $2, $2, 16
-; MIPS64-NEXT:    sll $3, $3, 8
+; MIPS64-NEXT:    sll $8, $8, 8
 ; MIPS64-NEXT:    sll $6, $6, 0
 ; MIPS64-NEXT:    dsrl $4, $4, 32
 ; MIPS64-NEXT:    sll $4, $4, 0
 ; MIPS64-NEXT:    addu $4, $4, $6
 ; MIPS64-NEXT:    andi $4, $4, 255
-; MIPS64-NEXT:    or $3, $4, $3
-; MIPS64-NEXT:    andi $3, $3, 65535
-; MIPS64-NEXT:    or $2, $3, $2
-; MIPS64-NEXT:    or $3, $11, $9
-; MIPS64-NEXT:    addu $4, $12, $10
-; MIPS64-NEXT:    sll $6, $8, 8
-; MIPS64-NEXT:    srl $8, $7, 16
-; MIPS64-NEXT:    srl $9, $5, 16
-; MIPS64-NEXT:    addu $8, $9, $8
-; MIPS64-NEXT:    andi $8, $8, 255
-; MIPS64-NEXT:    or $6, $8, $6
-; MIPS64-NEXT:    addu $8, $5, $7
+; MIPS64-NEXT:    or $4, $4, $8
+; MIPS64-NEXT:    andi $4, $4, 65535
+; MIPS64-NEXT:    or $2, $4, $2
+; MIPS64-NEXT:    or $4, $11, $10
+; MIPS64-NEXT:    addu $6, $12, $1
+; MIPS64-NEXT:    sll $8, $9, 8
+; MIPS64-NEXT:    srl $9, $3, 16
+; MIPS64-NEXT:    srl $10, $5, 16
+; MIPS64-NEXT:    addu $9, $10, $9
+; MIPS64-NEXT:    andi $9, $9, 255
+; MIPS64-NEXT:    or $8, $9, $8
+; MIPS64-NEXT:    addu $9, $5, $3
 ; MIPS64-NEXT:    dsll $2, $2, 32
-; MIPS64-NEXT:    sll $3, $3, 16
-; MIPS64-NEXT:    andi $4, $4, 255
-; MIPS64-NEXT:    srl $9, $10, 8
+; MIPS64-NEXT:    sll $4, $4, 16
+; MIPS64-NEXT:    andi $6, $6, 255
+; MIPS64-NEXT:    srl $1, $1, 8
 ; MIPS64-NEXT:    srl $10, $12, 8
-; MIPS64-NEXT:    addu $9, $10, $9
-; MIPS64-NEXT:    sll $9, $9, 8
-; MIPS64-NEXT:    or $4, $4, $9
-; MIPS64-NEXT:    andi $4, $4, 65535
-; MIPS64-NEXT:    or $3, $4, $3
-; MIPS64-NEXT:    dsll $3, $3, 32
-; MIPS64-NEXT:    dsrl $3, $3, 32
-; MIPS64-NEXT:    or $2, $3, $2
+; MIPS64-NEXT:    addu $1, $10, $1
+; MIPS64-NEXT:    sll $1, $1, 8
+; MIPS64-NEXT:    or $1, $6, $1
+; MIPS64-NEXT:    andi $1, $1, 65535
+; MIPS64-NEXT:    or $1, $1, $4
 ; MIPS64-NEXT:    dsll $1, $1, 32
-; MIPS64-NEXT:    sll $3, $6, 16
-; MIPS64-NEXT:    andi $4, $8, 255
-; MIPS64-NEXT:    srl $6, $7, 8
+; MIPS64-NEXT:    dsrl $1, $1, 32
+; MIPS64-NEXT:    or $2, $1, $2
+; MIPS64-NEXT:    dsll $1, $7, 32
+; MIPS64-NEXT:    sll $4, $8, 16
+; MIPS64-NEXT:    andi $6, $9, 255
+; MIPS64-NEXT:    srl $3, $3, 8
 ; MIPS64-NEXT:    srl $5, $5, 8
-; MIPS64-NEXT:    addu $5, $5, $6
-; MIPS64-NEXT:    sll $5, $5, 8
-; MIPS64-NEXT:    or $4, $4, $5
-; MIPS64-NEXT:    andi $4, $4, 65535
-; MIPS64-NEXT:    or $3, $4, $3
+; MIPS64-NEXT:    addu $3, $5, $3
+; MIPS64-NEXT:    sll $3, $3, 8
+; MIPS64-NEXT:    or $3, $6, $3
+; MIPS64-NEXT:    andi $3, $3, 65535
+; MIPS64-NEXT:    or $3, $3, $4
 ; MIPS64-NEXT:    dsll $3, $3, 32
 ; MIPS64-NEXT:    dsrl $3, $3, 32
 ; MIPS64-NEXT:    or $3, $3, $1

diff  --git a/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll b/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll
index 4ba8d45b50b5b..2c1483f9bedc8 100644
--- a/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll
+++ b/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll
@@ -647,23 +647,23 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
 ;
 ; P8BE-LABEL: combine_srem_sdiv:
 ; P8BE:       # %bb.0:
-; P8BE-NEXT:    mfvsrd r4, v2
-; P8BE-NEXT:    lis r3, -21386
+; P8BE-NEXT:    mfvsrd r5, v2
+; P8BE-NEXT:    lis r4, -21386
 ; P8BE-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; P8BE-NEXT:    addis r30, r2, .LCPI2_0@toc@ha
-; P8BE-NEXT:    ori r3, r3, 37253
-; P8BE-NEXT:    clrldi r5, r4, 48
-; P8BE-NEXT:    rldicl r6, r4, 48, 48
-; P8BE-NEXT:    rldicl r7, r4, 32, 48
-; P8BE-NEXT:    extsh r8, r5
+; P8BE-NEXT:    ori r4, r4, 37253
+; P8BE-NEXT:    clrldi r3, r5, 48
+; P8BE-NEXT:    rldicl r6, r5, 48, 48
+; P8BE-NEXT:    rldicl r7, r5, 32, 48
+; P8BE-NEXT:    extsh r8, r3
 ; P8BE-NEXT:    extsh r9, r6
 ; P8BE-NEXT:    extsh r10, r7
-; P8BE-NEXT:    mulhw r11, r8, r3
-; P8BE-NEXT:    mulhw r12, r9, r3
-; P8BE-NEXT:    rldicl r4, r4, 16, 48
-; P8BE-NEXT:    mulhw r0, r10, r3
-; P8BE-NEXT:    extsh r4, r4
-; P8BE-NEXT:    mulhw r3, r4, r3
+; P8BE-NEXT:    mulhw r11, r8, r4
+; P8BE-NEXT:    mulhw r12, r9, r4
+; P8BE-NEXT:    rldicl r5, r5, 16, 48
+; P8BE-NEXT:    mulhw r0, r10, r4
+; P8BE-NEXT:    extsh r5, r5
+; P8BE-NEXT:    mulhw r4, r5, r4
 ; P8BE-NEXT:    add r8, r11, r8
 ; P8BE-NEXT:    add r9, r12, r9
 ; P8BE-NEXT:    srwi r11, r8, 31
@@ -674,7 +674,7 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
 ; P8BE-NEXT:    srawi r12, r9, 6
 ; P8BE-NEXT:    srwi r9, r9, 31
 ; P8BE-NEXT:    add r8, r8, r11
-; P8BE-NEXT:    add r3, r3, r4
+; P8BE-NEXT:    add r4, r4, r5
 ; P8BE-NEXT:    lxvw4x v2, 0, r0
 ; P8BE-NEXT:    srawi r11, r10, 6
 ; P8BE-NEXT:    srwi r10, r10, 31
@@ -682,25 +682,25 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
 ; P8BE-NEXT:    mtvsrwz v3, r8
 ; P8BE-NEXT:    mulli r12, r8, 95
 ; P8BE-NEXT:    add r10, r11, r10
-; P8BE-NEXT:    srwi r11, r3, 31
+; P8BE-NEXT:    srwi r11, r4, 31
 ; P8BE-NEXT:    mtvsrwz v4, r9
-; P8BE-NEXT:    srawi r3, r3, 6
+; P8BE-NEXT:    srawi r4, r4, 6
 ; P8BE-NEXT:    mulli r8, r9, 95
 ; P8BE-NEXT:    mtvsrwz v5, r10
-; P8BE-NEXT:    add r3, r3, r11
+; P8BE-NEXT:    add r4, r4, r11
 ; P8BE-NEXT:    mulli r9, r10, 95
 ; P8BE-NEXT:    vperm v3, v4, v3, v2
-; P8BE-NEXT:    mulli r10, r3, 95
-; P8BE-NEXT:    sub r5, r5, r12
+; P8BE-NEXT:    mulli r10, r4, 95
+; P8BE-NEXT:    sub r3, r3, r12
 ; P8BE-NEXT:    sub r6, r6, r8
-; P8BE-NEXT:    mtvsrwz v4, r5
+; P8BE-NEXT:    mtvsrwz v4, r3
 ; P8BE-NEXT:    mtvsrwz v0, r6
-; P8BE-NEXT:    sub r5, r7, r9
-; P8BE-NEXT:    sub r4, r4, r10
-; P8BE-NEXT:    mtvsrwz v1, r5
-; P8BE-NEXT:    mtvsrwz v6, r4
+; P8BE-NEXT:    sub r3, r7, r9
+; P8BE-NEXT:    sub r5, r5, r10
+; P8BE-NEXT:    mtvsrwz v1, r3
+; P8BE-NEXT:    mtvsrwz v6, r5
 ; P8BE-NEXT:    vperm v4, v0, v4, v2
-; P8BE-NEXT:    mtvsrwz v0, r3
+; P8BE-NEXT:    mtvsrwz v0, r4
 ; P8BE-NEXT:    vperm v1, v6, v1, v2
 ; P8BE-NEXT:    vperm v2, v0, v5, v2
 ; P8BE-NEXT:    vmrghw v4, v1, v4

diff  --git a/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll b/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll
index d94101e6b8ca4..d7217372f8bac 100644
--- a/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll
@@ -625,21 +625,21 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
 ;
 ; P8BE-LABEL: combine_urem_udiv:
 ; P8BE:       # %bb.0:
-; P8BE-NEXT:    mfvsrd r4, v2
-; P8BE-NEXT:    lis r3, 22765
-; P8BE-NEXT:    ori r3, r3, 8969
-; P8BE-NEXT:    clrldi r5, r4, 48
-; P8BE-NEXT:    rldicl r6, r4, 48, 48
-; P8BE-NEXT:    clrlwi r8, r5, 16
+; P8BE-NEXT:    mfvsrd r5, v2
+; P8BE-NEXT:    lis r4, 22765
+; P8BE-NEXT:    ori r4, r4, 8969
+; P8BE-NEXT:    clrldi r3, r5, 48
+; P8BE-NEXT:    rldicl r6, r5, 48, 48
+; P8BE-NEXT:    clrlwi r8, r3, 16
 ; P8BE-NEXT:    clrlwi r9, r6, 16
-; P8BE-NEXT:    rldicl r7, r4, 32, 48
-; P8BE-NEXT:    rldicl r4, r4, 16, 48
-; P8BE-NEXT:    mulhwu r10, r8, r3
-; P8BE-NEXT:    mulhwu r12, r9, r3
+; P8BE-NEXT:    rldicl r7, r5, 32, 48
+; P8BE-NEXT:    rldicl r5, r5, 16, 48
+; P8BE-NEXT:    mulhwu r10, r8, r4
+; P8BE-NEXT:    mulhwu r12, r9, r4
 ; P8BE-NEXT:    clrlwi r11, r7, 16
-; P8BE-NEXT:    clrlwi r4, r4, 16
-; P8BE-NEXT:    mulhwu r0, r11, r3
-; P8BE-NEXT:    mulhwu r3, r4, r3
+; P8BE-NEXT:    clrlwi r5, r5, 16
+; P8BE-NEXT:    mulhwu r0, r11, r4
+; P8BE-NEXT:    mulhwu r4, r5, r4
 ; P8BE-NEXT:    sub r8, r8, r10
 ; P8BE-NEXT:    sub r9, r9, r12
 ; P8BE-NEXT:    srwi r8, r8, 1
@@ -647,7 +647,7 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
 ; P8BE-NEXT:    sub r11, r11, r0
 ; P8BE-NEXT:    add r8, r8, r10
 ; P8BE-NEXT:    add r9, r9, r12
-; P8BE-NEXT:    sub r12, r4, r3
+; P8BE-NEXT:    sub r12, r5, r4
 ; P8BE-NEXT:    addis r10, r2, .LCPI2_0@toc@ha
 ; P8BE-NEXT:    srwi r11, r11, 1
 ; P8BE-NEXT:    srwi r8, r8, 6
@@ -656,27 +656,27 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
 ; P8BE-NEXT:    addi r10, r10, .LCPI2_0@toc@l
 ; P8BE-NEXT:    add r11, r11, r0
 ; P8BE-NEXT:    mulli r0, r8, 95
-; P8BE-NEXT:    add r3, r12, r3
+; P8BE-NEXT:    add r4, r12, r4
 ; P8BE-NEXT:    mtvsrwz v3, r8
 ; P8BE-NEXT:    lxvw4x v2, 0, r10
 ; P8BE-NEXT:    srwi r10, r11, 6
 ; P8BE-NEXT:    mulli r8, r9, 95
-; P8BE-NEXT:    srwi r3, r3, 6
+; P8BE-NEXT:    srwi r4, r4, 6
 ; P8BE-NEXT:    mtvsrwz v4, r9
 ; P8BE-NEXT:    mulli r9, r10, 95
 ; P8BE-NEXT:    mtvsrwz v5, r10
-; P8BE-NEXT:    mulli r10, r3, 95
+; P8BE-NEXT:    mulli r10, r4, 95
 ; P8BE-NEXT:    vperm v3, v4, v3, v2
-; P8BE-NEXT:    sub r5, r5, r0
+; P8BE-NEXT:    sub r3, r3, r0
 ; P8BE-NEXT:    sub r6, r6, r8
-; P8BE-NEXT:    mtvsrwz v4, r5
+; P8BE-NEXT:    mtvsrwz v4, r3
 ; P8BE-NEXT:    mtvsrwz v0, r6
-; P8BE-NEXT:    sub r5, r7, r9
-; P8BE-NEXT:    sub r4, r4, r10
-; P8BE-NEXT:    mtvsrwz v1, r5
-; P8BE-NEXT:    mtvsrwz v6, r4
+; P8BE-NEXT:    sub r3, r7, r9
+; P8BE-NEXT:    sub r5, r5, r10
+; P8BE-NEXT:    mtvsrwz v1, r3
+; P8BE-NEXT:    mtvsrwz v6, r5
 ; P8BE-NEXT:    vperm v4, v0, v4, v2
-; P8BE-NEXT:    mtvsrwz v0, r3
+; P8BE-NEXT:    mtvsrwz v0, r4
 ; P8BE-NEXT:    vperm v1, v6, v1, v2
 ; P8BE-NEXT:    vperm v2, v0, v5, v2
 ; P8BE-NEXT:    vmrghw v4, v1, v4

diff  --git a/llvm/test/CodeGen/RISCV/rv32zbp.ll b/llvm/test/CodeGen/RISCV/rv32zbp.ll
index 1717526a608cd..32ef963abab8e 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbp.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbp.ll
@@ -2792,56 +2792,56 @@ define i64 @bitreverse_bswap_i64(i64 %a) {
 ; RV32I-NEXT:    addi a6, a2, -256
 ; RV32I-NEXT:    and a3, a3, a6
 ; RV32I-NEXT:    srli a4, a1, 24
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a4, a1, 8
+; RV32I-NEXT:    or a4, a3, a4
+; RV32I-NEXT:    slli a5, a1, 8
 ; RV32I-NEXT:    lui a7, 4080
-; RV32I-NEXT:    and a4, a4, a7
+; RV32I-NEXT:    and a5, a5, a7
 ; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, a5
 ; RV32I-NEXT:    or a1, a1, a4
-; RV32I-NEXT:    or a1, a1, a3
-; RV32I-NEXT:    srli a3, a1, 4
-; RV32I-NEXT:    lui a4, 61681
-; RV32I-NEXT:    addi t0, a4, -241
-; RV32I-NEXT:    and a3, a3, t0
+; RV32I-NEXT:    srli a4, a1, 4
+; RV32I-NEXT:    lui a5, 61681
+; RV32I-NEXT:    addi t0, a5, -241
+; RV32I-NEXT:    and a4, a4, t0
 ; RV32I-NEXT:    and a1, a1, t0
 ; RV32I-NEXT:    slli a1, a1, 4
-; RV32I-NEXT:    or a1, a3, a1
-; RV32I-NEXT:    srli a3, a1, 2
+; RV32I-NEXT:    or a1, a4, a1
+; RV32I-NEXT:    srli a4, a1, 2
 ; RV32I-NEXT:    lui a2, 209715
 ; RV32I-NEXT:    addi a2, a2, 819
-; RV32I-NEXT:    and a3, a3, a2
+; RV32I-NEXT:    and a4, a4, a2
 ; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    slli a1, a1, 2
-; RV32I-NEXT:    or a1, a3, a1
-; RV32I-NEXT:    srli a3, a1, 1
-; RV32I-NEXT:    lui a5, 349525
-; RV32I-NEXT:    addi a5, a5, 1365
-; RV32I-NEXT:    and a3, a3, a5
-; RV32I-NEXT:    and a1, a1, a5
+; RV32I-NEXT:    or a1, a4, a1
+; RV32I-NEXT:    srli a4, a1, 1
+; RV32I-NEXT:    lui a3, 349525
+; RV32I-NEXT:    addi a3, a3, 1365
+; RV32I-NEXT:    and a4, a4, a3
+; RV32I-NEXT:    and a1, a1, a3
 ; RV32I-NEXT:    slli a1, a1, 1
-; RV32I-NEXT:    or a1, a3, a1
-; RV32I-NEXT:    srli a3, a0, 8
-; RV32I-NEXT:    and a3, a3, a6
-; RV32I-NEXT:    srli a4, a0, 24
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a4, a0, 8
-; RV32I-NEXT:    and a4, a4, a7
+; RV32I-NEXT:    or a1, a4, a1
+; RV32I-NEXT:    srli a4, a0, 8
+; RV32I-NEXT:    and a4, a4, a6
+; RV32I-NEXT:    srli a5, a0, 24
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a5, a0, 8
+; RV32I-NEXT:    and a5, a5, a7
 ; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, a5
 ; RV32I-NEXT:    or a0, a0, a4
-; RV32I-NEXT:    or a0, a0, a3
-; RV32I-NEXT:    srli a3, a0, 4
-; RV32I-NEXT:    and a3, a3, t0
+; RV32I-NEXT:    srli a4, a0, 4
+; RV32I-NEXT:    and a4, a4, t0
 ; RV32I-NEXT:    and a0, a0, t0
 ; RV32I-NEXT:    slli a0, a0, 4
-; RV32I-NEXT:    or a0, a3, a0
-; RV32I-NEXT:    srli a3, a0, 2
-; RV32I-NEXT:    and a3, a3, a2
+; RV32I-NEXT:    or a0, a4, a0
+; RV32I-NEXT:    srli a4, a0, 2
+; RV32I-NEXT:    and a4, a4, a2
 ; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    slli a0, a0, 2
-; RV32I-NEXT:    or a0, a3, a0
+; RV32I-NEXT:    or a0, a4, a0
 ; RV32I-NEXT:    srli a2, a0, 1
-; RV32I-NEXT:    and a2, a2, a5
-; RV32I-NEXT:    and a0, a0, a5
+; RV32I-NEXT:    and a2, a2, a3
+; RV32I-NEXT:    and a0, a0, a3
 ; RV32I-NEXT:    slli a0, a0, 1
 ; RV32I-NEXT:    or a0, a2, a0
 ; RV32I-NEXT:    srli a2, a0, 8

diff  --git a/llvm/test/CodeGen/RISCV/rv64zbp.ll b/llvm/test/CodeGen/RISCV/rv64zbp.ll
index fba8371583a5c..96121858ff53c 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbp.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbp.ll
@@ -3182,14 +3182,14 @@ define i32 @bitreverse_bswap_i32(i32 %a) {
 define i64 @bitreverse_bswap_i64(i64 %a) {
 ; RV64I-LABEL: bitreverse_bswap_i64:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    srli a2, a0, 24
+; RV64I-NEXT:    srli a1, a0, 24
 ; RV64I-NEXT:    lui a6, 4080
-; RV64I-NEXT:    and a3, a2, a6
-; RV64I-NEXT:    srli a4, a0, 8
+; RV64I-NEXT:    and a1, a1, a6
+; RV64I-NEXT:    srli a3, a0, 8
 ; RV64I-NEXT:    addi a5, zero, 255
 ; RV64I-NEXT:    slli a7, a5, 24
-; RV64I-NEXT:    and a4, a4, a7
-; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    and a3, a3, a7
+; RV64I-NEXT:    or a3, a3, a1
 ; RV64I-NEXT:    srli a4, a0, 40
 ; RV64I-NEXT:    lui a1, 16
 ; RV64I-NEXT:    addiw a1, a1, -256
@@ -3197,9 +3197,9 @@ define i64 @bitreverse_bswap_i64(i64 %a) {
 ; RV64I-NEXT:    srli a2, a0, 56
 ; RV64I-NEXT:    or a2, a4, a2
 ; RV64I-NEXT:    or a2, a3, a2
-; RV64I-NEXT:    slli a3, a0, 8
+; RV64I-NEXT:    slli a4, a0, 8
 ; RV64I-NEXT:    slli t0, a5, 32
-; RV64I-NEXT:    and a3, a3, t0
+; RV64I-NEXT:    and a3, a4, t0
 ; RV64I-NEXT:    slli a4, a0, 24
 ; RV64I-NEXT:    slli t1, a5, 40
 ; RV64I-NEXT:    and a4, a4, t1

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
index a3180f0b4e317..bc51bf8340b90 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
@@ -1325,30 +1325,30 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX1-RV64-NEXT:    vsrl.vx v27, v26, t0
 ; LMULMAX1-RV64-NEXT:    addi t1, zero, 40
 ; LMULMAX1-RV64-NEXT:    vsrl.vx v28, v26, t1
-; LMULMAX1-RV64-NEXT:    lui a4, 16
-; LMULMAX1-RV64-NEXT:    addiw t2, a4, -256
+; LMULMAX1-RV64-NEXT:    lui a1, 16
+; LMULMAX1-RV64-NEXT:    addiw t2, a1, -256
 ; LMULMAX1-RV64-NEXT:    vand.vx v28, v28, t2
 ; LMULMAX1-RV64-NEXT:    vor.vv v27, v28, v27
 ; LMULMAX1-RV64-NEXT:    vsrl.vi v28, v26, 24
 ; LMULMAX1-RV64-NEXT:    lui a7, 4080
 ; LMULMAX1-RV64-NEXT:    vand.vx v28, v28, a7
 ; LMULMAX1-RV64-NEXT:    vsrl.vi v29, v26, 8
-; LMULMAX1-RV64-NEXT:    addi a1, zero, 255
-; LMULMAX1-RV64-NEXT:    slli t4, a1, 24
+; LMULMAX1-RV64-NEXT:    addi a3, zero, 255
+; LMULMAX1-RV64-NEXT:    slli t4, a3, 24
 ; LMULMAX1-RV64-NEXT:    vand.vx v29, v29, t4
 ; LMULMAX1-RV64-NEXT:    vor.vv v28, v29, v28
 ; LMULMAX1-RV64-NEXT:    vor.vv v27, v28, v27
 ; LMULMAX1-RV64-NEXT:    vsll.vi v28, v26, 8
-; LMULMAX1-RV64-NEXT:    slli a2, a1, 32
-; LMULMAX1-RV64-NEXT:    vand.vx v28, v28, a2
+; LMULMAX1-RV64-NEXT:    slli a5, a3, 32
+; LMULMAX1-RV64-NEXT:    vand.vx v28, v28, a5
 ; LMULMAX1-RV64-NEXT:    vsll.vi v29, v26, 24
-; LMULMAX1-RV64-NEXT:    slli a3, a1, 40
-; LMULMAX1-RV64-NEXT:    vand.vx v29, v29, a3
+; LMULMAX1-RV64-NEXT:    slli a2, a3, 40
+; LMULMAX1-RV64-NEXT:    vand.vx v29, v29, a2
 ; LMULMAX1-RV64-NEXT:    vor.vv v28, v29, v28
 ; LMULMAX1-RV64-NEXT:    vsll.vx v29, v26, t0
 ; LMULMAX1-RV64-NEXT:    vsll.vx v26, v26, t1
-; LMULMAX1-RV64-NEXT:    slli a1, a1, 48
-; LMULMAX1-RV64-NEXT:    vand.vx v26, v26, a1
+; LMULMAX1-RV64-NEXT:    slli a3, a3, 48
+; LMULMAX1-RV64-NEXT:    vand.vx v26, v26, a3
 ; LMULMAX1-RV64-NEXT:    vor.vv v26, v29, v26
 ; LMULMAX1-RV64-NEXT:    vor.vv v26, v26, v28
 ; LMULMAX1-RV64-NEXT:    vor.vv v26, v26, v27
@@ -1379,16 +1379,16 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX1-RV64-NEXT:    vsll.vi v26, v26, 2
 ; LMULMAX1-RV64-NEXT:    vor.vv v26, v27, v26
 ; LMULMAX1-RV64-NEXT:    vsrl.vi v27, v26, 1
-; LMULMAX1-RV64-NEXT:    lui a5, 21845
-; LMULMAX1-RV64-NEXT:    addiw a5, a5, 1365
-; LMULMAX1-RV64-NEXT:    slli a5, a5, 12
-; LMULMAX1-RV64-NEXT:    addi a5, a5, 1365
-; LMULMAX1-RV64-NEXT:    slli a5, a5, 12
-; LMULMAX1-RV64-NEXT:    addi a5, a5, 1365
-; LMULMAX1-RV64-NEXT:    slli a5, a5, 12
-; LMULMAX1-RV64-NEXT:    addi a5, a5, 1365
-; LMULMAX1-RV64-NEXT:    vand.vx v27, v27, a5
-; LMULMAX1-RV64-NEXT:    vand.vx v26, v26, a5
+; LMULMAX1-RV64-NEXT:    lui a1, 21845
+; LMULMAX1-RV64-NEXT:    addiw a1, a1, 1365
+; LMULMAX1-RV64-NEXT:    slli a1, a1, 12
+; LMULMAX1-RV64-NEXT:    addi a1, a1, 1365
+; LMULMAX1-RV64-NEXT:    slli a1, a1, 12
+; LMULMAX1-RV64-NEXT:    addi a1, a1, 1365
+; LMULMAX1-RV64-NEXT:    slli a1, a1, 12
+; LMULMAX1-RV64-NEXT:    addi a1, a1, 1365
+; LMULMAX1-RV64-NEXT:    vand.vx v27, v27, a1
+; LMULMAX1-RV64-NEXT:    vand.vx v26, v26, a1
 ; LMULMAX1-RV64-NEXT:    vadd.vv v26, v26, v26
 ; LMULMAX1-RV64-NEXT:    vor.vv v26, v27, v26
 ; LMULMAX1-RV64-NEXT:    vsrl.vx v27, v25, t0
@@ -1402,13 +1402,13 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX1-RV64-NEXT:    vor.vv v28, v29, v28
 ; LMULMAX1-RV64-NEXT:    vor.vv v27, v28, v27
 ; LMULMAX1-RV64-NEXT:    vsll.vi v28, v25, 8
-; LMULMAX1-RV64-NEXT:    vand.vx v28, v28, a2
+; LMULMAX1-RV64-NEXT:    vand.vx v28, v28, a5
 ; LMULMAX1-RV64-NEXT:    vsll.vi v29, v25, 24
-; LMULMAX1-RV64-NEXT:    vand.vx v29, v29, a3
+; LMULMAX1-RV64-NEXT:    vand.vx v29, v29, a2
 ; LMULMAX1-RV64-NEXT:    vor.vv v28, v29, v28
 ; LMULMAX1-RV64-NEXT:    vsll.vx v29, v25, t0
 ; LMULMAX1-RV64-NEXT:    vsll.vx v25, v25, t1
-; LMULMAX1-RV64-NEXT:    vand.vx v25, v25, a1
+; LMULMAX1-RV64-NEXT:    vand.vx v25, v25, a3
 ; LMULMAX1-RV64-NEXT:    vor.vv v25, v29, v25
 ; LMULMAX1-RV64-NEXT:    vor.vv v25, v25, v28
 ; LMULMAX1-RV64-NEXT:    vor.vv v25, v25, v27
@@ -1423,8 +1423,8 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX1-RV64-NEXT:    vsll.vi v25, v25, 2
 ; LMULMAX1-RV64-NEXT:    vor.vv v25, v27, v25
 ; LMULMAX1-RV64-NEXT:    vsrl.vi v27, v25, 1
-; LMULMAX1-RV64-NEXT:    vand.vx v27, v27, a5
-; LMULMAX1-RV64-NEXT:    vand.vx v25, v25, a5
+; LMULMAX1-RV64-NEXT:    vand.vx v27, v27, a1
+; LMULMAX1-RV64-NEXT:    vand.vx v25, v25, a1
 ; LMULMAX1-RV64-NEXT:    vadd.vv v25, v25, v25
 ; LMULMAX1-RV64-NEXT:    vor.vv v25, v27, v25
 ; LMULMAX1-RV64-NEXT:    vse64.v v25, (a0)

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll
index 4fb8ee85c6838..b1535a8bf8836 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll
@@ -2131,16 +2131,16 @@ define void @bswap_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX1-RV32-NEXT:    sw a1, 44(sp)
 ; LMULMAX1-RV32-NEXT:    addi a7, zero, 32
 ; LMULMAX1-RV32-NEXT:    vsrl.vx v26, v26, a7
-; LMULMAX1-RV32-NEXT:    vmv.x.s a4, v26
-; LMULMAX1-RV32-NEXT:    srli a5, a4, 8
+; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
+; LMULMAX1-RV32-NEXT:    srli a5, a1, 8
 ; LMULMAX1-RV32-NEXT:    and a5, a5, a2
-; LMULMAX1-RV32-NEXT:    srli a1, a4, 24
-; LMULMAX1-RV32-NEXT:    or a1, a5, a1
-; LMULMAX1-RV32-NEXT:    slli a5, a4, 8
+; LMULMAX1-RV32-NEXT:    srli a4, a1, 24
+; LMULMAX1-RV32-NEXT:    or a4, a5, a4
+; LMULMAX1-RV32-NEXT:    slli a5, a1, 8
 ; LMULMAX1-RV32-NEXT:    and a5, a5, a3
-; LMULMAX1-RV32-NEXT:    slli a4, a4, 24
-; LMULMAX1-RV32-NEXT:    or a4, a4, a5
-; LMULMAX1-RV32-NEXT:    or a1, a4, a1
+; LMULMAX1-RV32-NEXT:    slli a1, a1, 24
+; LMULMAX1-RV32-NEXT:    or a1, a1, a5
+; LMULMAX1-RV32-NEXT:    or a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    sw a1, 32(sp)
 ; LMULMAX1-RV32-NEXT:    vsrl.vx v26, v27, a7
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
index 60d4b967a5de2..1a0e8647a5c56 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
@@ -2114,80 +2114,80 @@ define void @cttz_v4i32(<4 x i32>* %x, <4 x i32>* %y) {
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v25
 ; LMULMAX2-RV32-NEXT:    addi a2, a1, -1
 ; LMULMAX2-RV32-NEXT:    not a1, a1
-; LMULMAX2-RV32-NEXT:    and a1, a1, a2
-; LMULMAX2-RV32-NEXT:    srli a2, a1, 1
-; LMULMAX2-RV32-NEXT:    lui a3, 349525
-; LMULMAX2-RV32-NEXT:    addi a6, a3, 1365
-; LMULMAX2-RV32-NEXT:    and a2, a2, a6
-; LMULMAX2-RV32-NEXT:    sub a1, a1, a2
+; LMULMAX2-RV32-NEXT:    and a2, a1, a2
+; LMULMAX2-RV32-NEXT:    srli a3, a2, 1
+; LMULMAX2-RV32-NEXT:    lui a1, 349525
+; LMULMAX2-RV32-NEXT:    addi a6, a1, 1365
+; LMULMAX2-RV32-NEXT:    and a3, a3, a6
+; LMULMAX2-RV32-NEXT:    sub a3, a2, a3
 ; LMULMAX2-RV32-NEXT:    lui a2, 209715
 ; LMULMAX2-RV32-NEXT:    addi a2, a2, 819
+; LMULMAX2-RV32-NEXT:    and a4, a3, a2
+; LMULMAX2-RV32-NEXT:    srli a3, a3, 2
+; LMULMAX2-RV32-NEXT:    and a3, a3, a2
+; LMULMAX2-RV32-NEXT:    add a3, a4, a3
+; LMULMAX2-RV32-NEXT:    srli a4, a3, 4
+; LMULMAX2-RV32-NEXT:    add a4, a3, a4
+; LMULMAX2-RV32-NEXT:    lui a3, 61681
+; LMULMAX2-RV32-NEXT:    addi a3, a3, -241
+; LMULMAX2-RV32-NEXT:    and a4, a4, a3
+; LMULMAX2-RV32-NEXT:    lui a5, 4112
+; LMULMAX2-RV32-NEXT:    addi a5, a5, 257
+; LMULMAX2-RV32-NEXT:    mul a4, a4, a5
+; LMULMAX2-RV32-NEXT:    srli a4, a4, 24
+; LMULMAX2-RV32-NEXT:    sw a4, 16(sp)
+; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, mu
+; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 3
+; LMULMAX2-RV32-NEXT:    vmv.x.s a4, v26
+; LMULMAX2-RV32-NEXT:    addi a1, a4, -1
+; LMULMAX2-RV32-NEXT:    not a4, a4
+; LMULMAX2-RV32-NEXT:    and a1, a4, a1
+; LMULMAX2-RV32-NEXT:    srli a4, a1, 1
+; LMULMAX2-RV32-NEXT:    and a4, a4, a6
+; LMULMAX2-RV32-NEXT:    sub a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    and a4, a1, a2
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 2
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a2
 ; LMULMAX2-RV32-NEXT:    add a1, a4, a1
 ; LMULMAX2-RV32-NEXT:    srli a4, a1, 4
 ; LMULMAX2-RV32-NEXT:    add a1, a1, a4
-; LMULMAX2-RV32-NEXT:    lui a4, 61681
-; LMULMAX2-RV32-NEXT:    addi a4, a4, -241
-; LMULMAX2-RV32-NEXT:    and a1, a1, a4
-; LMULMAX2-RV32-NEXT:    lui a5, 4112
-; LMULMAX2-RV32-NEXT:    addi a5, a5, 257
-; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
-; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX2-RV32-NEXT:    sw a1, 16(sp)
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, mu
-; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 3
-; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
-; LMULMAX2-RV32-NEXT:    addi a3, a1, -1
-; LMULMAX2-RV32-NEXT:    not a1, a1
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a3
-; LMULMAX2-RV32-NEXT:    srli a3, a1, 1
-; LMULMAX2-RV32-NEXT:    and a3, a3, a6
-; LMULMAX2-RV32-NEXT:    sub a1, a1, a3
-; LMULMAX2-RV32-NEXT:    and a3, a1, a2
-; LMULMAX2-RV32-NEXT:    srli a1, a1, 2
-; LMULMAX2-RV32-NEXT:    and a1, a1, a2
-; LMULMAX2-RV32-NEXT:    add a1, a3, a1
-; LMULMAX2-RV32-NEXT:    srli a3, a1, 4
-; LMULMAX2-RV32-NEXT:    add a1, a1, a3
-; LMULMAX2-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    sw a1, 28(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v26
-; LMULMAX2-RV32-NEXT:    addi a3, a1, -1
+; LMULMAX2-RV32-NEXT:    addi a4, a1, -1
 ; LMULMAX2-RV32-NEXT:    not a1, a1
-; LMULMAX2-RV32-NEXT:    and a1, a1, a3
-; LMULMAX2-RV32-NEXT:    srli a3, a1, 1
-; LMULMAX2-RV32-NEXT:    and a3, a3, a6
-; LMULMAX2-RV32-NEXT:    sub a1, a1, a3
-; LMULMAX2-RV32-NEXT:    and a3, a1, a2
+; LMULMAX2-RV32-NEXT:    and a1, a1, a4
+; LMULMAX2-RV32-NEXT:    srli a4, a1, 1
+; LMULMAX2-RV32-NEXT:    and a4, a4, a6
+; LMULMAX2-RV32-NEXT:    sub a1, a1, a4
+; LMULMAX2-RV32-NEXT:    and a4, a1, a2
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 2
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a2
-; LMULMAX2-RV32-NEXT:    add a1, a3, a1
-; LMULMAX2-RV32-NEXT:    srli a3, a1, 4
-; LMULMAX2-RV32-NEXT:    add a1, a1, a3
-; LMULMAX2-RV32-NEXT:    and a1, a1, a4
+; LMULMAX2-RV32-NEXT:    add a1, a4, a1
+; LMULMAX2-RV32-NEXT:    srli a4, a1, 4
+; LMULMAX2-RV32-NEXT:    add a1, a1, a4
+; LMULMAX2-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    sw a1, 24(sp)
 ; LMULMAX2-RV32-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX2-RV32-NEXT:    vmv.x.s a1, v25
-; LMULMAX2-RV32-NEXT:    addi a3, a1, -1
+; LMULMAX2-RV32-NEXT:    addi a4, a1, -1
 ; LMULMAX2-RV32-NEXT:    not a1, a1
-; LMULMAX2-RV32-NEXT:    and a1, a1, a3
-; LMULMAX2-RV32-NEXT:    srli a3, a1, 1
-; LMULMAX2-RV32-NEXT:    and a3, a3, a6
-; LMULMAX2-RV32-NEXT:    sub a1, a1, a3
-; LMULMAX2-RV32-NEXT:    and a3, a1, a2
+; LMULMAX2-RV32-NEXT:    and a1, a1, a4
+; LMULMAX2-RV32-NEXT:    srli a4, a1, 1
+; LMULMAX2-RV32-NEXT:    and a4, a4, a6
+; LMULMAX2-RV32-NEXT:    sub a1, a1, a4
+; LMULMAX2-RV32-NEXT:    and a4, a1, a2
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 2
 ; LMULMAX2-RV32-NEXT:    and a1, a1, a2
-; LMULMAX2-RV32-NEXT:    add a1, a3, a1
+; LMULMAX2-RV32-NEXT:    add a1, a4, a1
 ; LMULMAX2-RV32-NEXT:    srli a2, a1, 4
 ; LMULMAX2-RV32-NEXT:    add a1, a1, a2
-; LMULMAX2-RV32-NEXT:    and a1, a1, a4
+; LMULMAX2-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX2-RV32-NEXT:    sw a1, 20(sp)
@@ -2237,79 +2237,79 @@ define void @cttz_v4i32(<4 x i32>* %x, <4 x i32>* %y) {
 ; LMULMAX2-RV64-NEXT:    and a4, a4, a3
 ; LMULMAX2-RV64-NEXT:    add a4, a5, a4
 ; LMULMAX2-RV64-NEXT:    srli a5, a4, 4
-; LMULMAX2-RV64-NEXT:    add a4, a4, a5
-; LMULMAX2-RV64-NEXT:    lui a5, 3855
-; LMULMAX2-RV64-NEXT:    addiw a5, a5, 241
-; LMULMAX2-RV64-NEXT:    slli a5, a5, 12
-; LMULMAX2-RV64-NEXT:    addi a5, a5, -241
-; LMULMAX2-RV64-NEXT:    slli a5, a5, 12
-; LMULMAX2-RV64-NEXT:    addi a5, a5, 241
-; LMULMAX2-RV64-NEXT:    slli a5, a5, 12
-; LMULMAX2-RV64-NEXT:    addi a7, a5, -241
-; LMULMAX2-RV64-NEXT:    and a4, a4, a7
-; LMULMAX2-RV64-NEXT:    lui a2, 4112
-; LMULMAX2-RV64-NEXT:    addiw a2, a2, 257
-; LMULMAX2-RV64-NEXT:    slli a2, a2, 16
-; LMULMAX2-RV64-NEXT:    addi a2, a2, 257
-; LMULMAX2-RV64-NEXT:    slli a2, a2, 16
-; LMULMAX2-RV64-NEXT:    addi a2, a2, 257
-; LMULMAX2-RV64-NEXT:    mul a4, a4, a2
-; LMULMAX2-RV64-NEXT:    srli a4, a4, 56
-; LMULMAX2-RV64-NEXT:    sw a4, 28(sp)
+; LMULMAX2-RV64-NEXT:    add a5, a4, a5
+; LMULMAX2-RV64-NEXT:    lui a4, 3855
+; LMULMAX2-RV64-NEXT:    addiw a4, a4, 241
+; LMULMAX2-RV64-NEXT:    slli a4, a4, 12
+; LMULMAX2-RV64-NEXT:    addi a4, a4, -241
+; LMULMAX2-RV64-NEXT:    slli a4, a4, 12
+; LMULMAX2-RV64-NEXT:    addi a4, a4, 241
+; LMULMAX2-RV64-NEXT:    slli a4, a4, 12
+; LMULMAX2-RV64-NEXT:    addi a7, a4, -241
+; LMULMAX2-RV64-NEXT:    and a2, a5, a7
+; LMULMAX2-RV64-NEXT:    lui a5, 4112
+; LMULMAX2-RV64-NEXT:    addiw a5, a5, 257
+; LMULMAX2-RV64-NEXT:    slli a5, a5, 16
+; LMULMAX2-RV64-NEXT:    addi a5, a5, 257
+; LMULMAX2-RV64-NEXT:    slli a5, a5, 16
+; LMULMAX2-RV64-NEXT:    addi a5, a5, 257
+; LMULMAX2-RV64-NEXT:    mul a2, a2, a5
+; LMULMAX2-RV64-NEXT:    srli a2, a2, 56
+; LMULMAX2-RV64-NEXT:    sw a2, 28(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 2
-; LMULMAX2-RV64-NEXT:    vmv.x.s a4, v26
-; LMULMAX2-RV64-NEXT:    or a4, a4, a1
-; LMULMAX2-RV64-NEXT:    addi a5, a4, -1
-; LMULMAX2-RV64-NEXT:    not a4, a4
-; LMULMAX2-RV64-NEXT:    and a4, a4, a5
-; LMULMAX2-RV64-NEXT:    srli a5, a4, 1
-; LMULMAX2-RV64-NEXT:    and a5, a5, a6
-; LMULMAX2-RV64-NEXT:    sub a4, a4, a5
-; LMULMAX2-RV64-NEXT:    and a5, a4, a3
-; LMULMAX2-RV64-NEXT:    srli a4, a4, 2
-; LMULMAX2-RV64-NEXT:    and a4, a4, a3
-; LMULMAX2-RV64-NEXT:    add a4, a5, a4
-; LMULMAX2-RV64-NEXT:    srli a5, a4, 4
-; LMULMAX2-RV64-NEXT:    add a4, a4, a5
-; LMULMAX2-RV64-NEXT:    and a4, a4, a7
-; LMULMAX2-RV64-NEXT:    mul a4, a4, a2
-; LMULMAX2-RV64-NEXT:    srli a4, a4, 56
-; LMULMAX2-RV64-NEXT:    sw a4, 24(sp)
+; LMULMAX2-RV64-NEXT:    vmv.x.s a2, v26
+; LMULMAX2-RV64-NEXT:    or a2, a2, a1
+; LMULMAX2-RV64-NEXT:    addi a4, a2, -1
+; LMULMAX2-RV64-NEXT:    not a2, a2
+; LMULMAX2-RV64-NEXT:    and a2, a2, a4
+; LMULMAX2-RV64-NEXT:    srli a4, a2, 1
+; LMULMAX2-RV64-NEXT:    and a4, a4, a6
+; LMULMAX2-RV64-NEXT:    sub a2, a2, a4
+; LMULMAX2-RV64-NEXT:    and a4, a2, a3
+; LMULMAX2-RV64-NEXT:    srli a2, a2, 2
+; LMULMAX2-RV64-NEXT:    and a2, a2, a3
+; LMULMAX2-RV64-NEXT:    add a2, a4, a2
+; LMULMAX2-RV64-NEXT:    srli a4, a2, 4
+; LMULMAX2-RV64-NEXT:    add a2, a2, a4
+; LMULMAX2-RV64-NEXT:    and a2, a2, a7
+; LMULMAX2-RV64-NEXT:    mul a2, a2, a5
+; LMULMAX2-RV64-NEXT:    srli a2, a2, 56
+; LMULMAX2-RV64-NEXT:    sw a2, 24(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v26, v25, 1
-; LMULMAX2-RV64-NEXT:    vmv.x.s a4, v26
-; LMULMAX2-RV64-NEXT:    or a4, a4, a1
-; LMULMAX2-RV64-NEXT:    addi a5, a4, -1
-; LMULMAX2-RV64-NEXT:    not a4, a4
-; LMULMAX2-RV64-NEXT:    and a4, a4, a5
-; LMULMAX2-RV64-NEXT:    srli a5, a4, 1
-; LMULMAX2-RV64-NEXT:    and a5, a5, a6
-; LMULMAX2-RV64-NEXT:    sub a4, a4, a5
-; LMULMAX2-RV64-NEXT:    and a5, a4, a3
-; LMULMAX2-RV64-NEXT:    srli a4, a4, 2
-; LMULMAX2-RV64-NEXT:    and a4, a4, a3
-; LMULMAX2-RV64-NEXT:    add a4, a5, a4
-; LMULMAX2-RV64-NEXT:    srli a5, a4, 4
-; LMULMAX2-RV64-NEXT:    add a4, a4, a5
-; LMULMAX2-RV64-NEXT:    and a4, a4, a7
-; LMULMAX2-RV64-NEXT:    mul a4, a4, a2
-; LMULMAX2-RV64-NEXT:    srli a4, a4, 56
-; LMULMAX2-RV64-NEXT:    sw a4, 20(sp)
-; LMULMAX2-RV64-NEXT:    vmv.x.s a4, v25
-; LMULMAX2-RV64-NEXT:    or a1, a4, a1
-; LMULMAX2-RV64-NEXT:    addi a4, a1, -1
-; LMULMAX2-RV64-NEXT:    not a1, a1
-; LMULMAX2-RV64-NEXT:    and a1, a1, a4
-; LMULMAX2-RV64-NEXT:    srli a4, a1, 1
+; LMULMAX2-RV64-NEXT:    vmv.x.s a2, v26
+; LMULMAX2-RV64-NEXT:    or a2, a2, a1
+; LMULMAX2-RV64-NEXT:    addi a4, a2, -1
+; LMULMAX2-RV64-NEXT:    not a2, a2
+; LMULMAX2-RV64-NEXT:    and a2, a2, a4
+; LMULMAX2-RV64-NEXT:    srli a4, a2, 1
 ; LMULMAX2-RV64-NEXT:    and a4, a4, a6
-; LMULMAX2-RV64-NEXT:    sub a1, a1, a4
-; LMULMAX2-RV64-NEXT:    and a4, a1, a3
+; LMULMAX2-RV64-NEXT:    sub a2, a2, a4
+; LMULMAX2-RV64-NEXT:    and a4, a2, a3
+; LMULMAX2-RV64-NEXT:    srli a2, a2, 2
+; LMULMAX2-RV64-NEXT:    and a2, a2, a3
+; LMULMAX2-RV64-NEXT:    add a2, a4, a2
+; LMULMAX2-RV64-NEXT:    srli a4, a2, 4
+; LMULMAX2-RV64-NEXT:    add a2, a2, a4
+; LMULMAX2-RV64-NEXT:    and a2, a2, a7
+; LMULMAX2-RV64-NEXT:    mul a2, a2, a5
+; LMULMAX2-RV64-NEXT:    srli a2, a2, 56
+; LMULMAX2-RV64-NEXT:    sw a2, 20(sp)
+; LMULMAX2-RV64-NEXT:    vmv.x.s a2, v25
+; LMULMAX2-RV64-NEXT:    or a1, a2, a1
+; LMULMAX2-RV64-NEXT:    addi a2, a1, -1
+; LMULMAX2-RV64-NEXT:    not a1, a1
+; LMULMAX2-RV64-NEXT:    and a1, a1, a2
+; LMULMAX2-RV64-NEXT:    srli a2, a1, 1
+; LMULMAX2-RV64-NEXT:    and a2, a2, a6
+; LMULMAX2-RV64-NEXT:    sub a1, a1, a2
+; LMULMAX2-RV64-NEXT:    and a2, a1, a3
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 2
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a3
-; LMULMAX2-RV64-NEXT:    add a1, a4, a1
-; LMULMAX2-RV64-NEXT:    srli a3, a1, 4
-; LMULMAX2-RV64-NEXT:    add a1, a1, a3
+; LMULMAX2-RV64-NEXT:    add a1, a2, a1
+; LMULMAX2-RV64-NEXT:    srli a2, a1, 4
+; LMULMAX2-RV64-NEXT:    add a1, a1, a2
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a7
-; LMULMAX2-RV64-NEXT:    mul a1, a1, a2
+; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    sw a1, 16(sp)
 ; LMULMAX2-RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
@@ -2328,80 +2328,80 @@ define void @cttz_v4i32(<4 x i32>* %x, <4 x i32>* %y) {
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v25
 ; LMULMAX1-RV32-NEXT:    addi a2, a1, -1
 ; LMULMAX1-RV32-NEXT:    not a1, a1
-; LMULMAX1-RV32-NEXT:    and a1, a1, a2
-; LMULMAX1-RV32-NEXT:    srli a2, a1, 1
-; LMULMAX1-RV32-NEXT:    lui a3, 349525
-; LMULMAX1-RV32-NEXT:    addi a6, a3, 1365
-; LMULMAX1-RV32-NEXT:    and a2, a2, a6
-; LMULMAX1-RV32-NEXT:    sub a1, a1, a2
+; LMULMAX1-RV32-NEXT:    and a2, a1, a2
+; LMULMAX1-RV32-NEXT:    srli a3, a2, 1
+; LMULMAX1-RV32-NEXT:    lui a1, 349525
+; LMULMAX1-RV32-NEXT:    addi a6, a1, 1365
+; LMULMAX1-RV32-NEXT:    and a3, a3, a6
+; LMULMAX1-RV32-NEXT:    sub a3, a2, a3
 ; LMULMAX1-RV32-NEXT:    lui a2, 209715
 ; LMULMAX1-RV32-NEXT:    addi a2, a2, 819
+; LMULMAX1-RV32-NEXT:    and a4, a3, a2
+; LMULMAX1-RV32-NEXT:    srli a3, a3, 2
+; LMULMAX1-RV32-NEXT:    and a3, a3, a2
+; LMULMAX1-RV32-NEXT:    add a3, a4, a3
+; LMULMAX1-RV32-NEXT:    srli a4, a3, 4
+; LMULMAX1-RV32-NEXT:    add a4, a3, a4
+; LMULMAX1-RV32-NEXT:    lui a3, 61681
+; LMULMAX1-RV32-NEXT:    addi a3, a3, -241
+; LMULMAX1-RV32-NEXT:    and a4, a4, a3
+; LMULMAX1-RV32-NEXT:    lui a5, 4112
+; LMULMAX1-RV32-NEXT:    addi a5, a5, 257
+; LMULMAX1-RV32-NEXT:    mul a4, a4, a5
+; LMULMAX1-RV32-NEXT:    srli a4, a4, 24
+; LMULMAX1-RV32-NEXT:    sw a4, 16(sp)
+; LMULMAX1-RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, mu
+; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 3
+; LMULMAX1-RV32-NEXT:    vmv.x.s a4, v26
+; LMULMAX1-RV32-NEXT:    addi a1, a4, -1
+; LMULMAX1-RV32-NEXT:    not a4, a4
+; LMULMAX1-RV32-NEXT:    and a1, a4, a1
+; LMULMAX1-RV32-NEXT:    srli a4, a1, 1
+; LMULMAX1-RV32-NEXT:    and a4, a4, a6
+; LMULMAX1-RV32-NEXT:    sub a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    and a4, a1, a2
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 2
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a2
 ; LMULMAX1-RV32-NEXT:    add a1, a4, a1
 ; LMULMAX1-RV32-NEXT:    srli a4, a1, 4
 ; LMULMAX1-RV32-NEXT:    add a1, a1, a4
-; LMULMAX1-RV32-NEXT:    lui a4, 61681
-; LMULMAX1-RV32-NEXT:    addi a4, a4, -241
-; LMULMAX1-RV32-NEXT:    and a1, a1, a4
-; LMULMAX1-RV32-NEXT:    lui a5, 4112
-; LMULMAX1-RV32-NEXT:    addi a5, a5, 257
-; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
-; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
-; LMULMAX1-RV32-NEXT:    sw a1, 16(sp)
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, mu
-; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 3
-; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
-; LMULMAX1-RV32-NEXT:    addi a3, a1, -1
-; LMULMAX1-RV32-NEXT:    not a1, a1
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a3
-; LMULMAX1-RV32-NEXT:    srli a3, a1, 1
-; LMULMAX1-RV32-NEXT:    and a3, a3, a6
-; LMULMAX1-RV32-NEXT:    sub a1, a1, a3
-; LMULMAX1-RV32-NEXT:    and a3, a1, a2
-; LMULMAX1-RV32-NEXT:    srli a1, a1, 2
-; LMULMAX1-RV32-NEXT:    and a1, a1, a2
-; LMULMAX1-RV32-NEXT:    add a1, a3, a1
-; LMULMAX1-RV32-NEXT:    srli a3, a1, 4
-; LMULMAX1-RV32-NEXT:    add a1, a1, a3
-; LMULMAX1-RV32-NEXT:    and a1, a1, a4
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    sw a1, 28(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v26, v25, 2
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v26
-; LMULMAX1-RV32-NEXT:    addi a3, a1, -1
+; LMULMAX1-RV32-NEXT:    addi a4, a1, -1
 ; LMULMAX1-RV32-NEXT:    not a1, a1
-; LMULMAX1-RV32-NEXT:    and a1, a1, a3
-; LMULMAX1-RV32-NEXT:    srli a3, a1, 1
-; LMULMAX1-RV32-NEXT:    and a3, a3, a6
-; LMULMAX1-RV32-NEXT:    sub a1, a1, a3
-; LMULMAX1-RV32-NEXT:    and a3, a1, a2
+; LMULMAX1-RV32-NEXT:    and a1, a1, a4
+; LMULMAX1-RV32-NEXT:    srli a4, a1, 1
+; LMULMAX1-RV32-NEXT:    and a4, a4, a6
+; LMULMAX1-RV32-NEXT:    sub a1, a1, a4
+; LMULMAX1-RV32-NEXT:    and a4, a1, a2
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 2
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a2
-; LMULMAX1-RV32-NEXT:    add a1, a3, a1
-; LMULMAX1-RV32-NEXT:    srli a3, a1, 4
-; LMULMAX1-RV32-NEXT:    add a1, a1, a3
-; LMULMAX1-RV32-NEXT:    and a1, a1, a4
+; LMULMAX1-RV32-NEXT:    add a1, a4, a1
+; LMULMAX1-RV32-NEXT:    srli a4, a1, 4
+; LMULMAX1-RV32-NEXT:    add a1, a1, a4
+; LMULMAX1-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    sw a1, 24(sp)
 ; LMULMAX1-RV32-NEXT:    vslidedown.vi v25, v25, 1
 ; LMULMAX1-RV32-NEXT:    vmv.x.s a1, v25
-; LMULMAX1-RV32-NEXT:    addi a3, a1, -1
+; LMULMAX1-RV32-NEXT:    addi a4, a1, -1
 ; LMULMAX1-RV32-NEXT:    not a1, a1
-; LMULMAX1-RV32-NEXT:    and a1, a1, a3
-; LMULMAX1-RV32-NEXT:    srli a3, a1, 1
-; LMULMAX1-RV32-NEXT:    and a3, a3, a6
-; LMULMAX1-RV32-NEXT:    sub a1, a1, a3
-; LMULMAX1-RV32-NEXT:    and a3, a1, a2
+; LMULMAX1-RV32-NEXT:    and a1, a1, a4
+; LMULMAX1-RV32-NEXT:    srli a4, a1, 1
+; LMULMAX1-RV32-NEXT:    and a4, a4, a6
+; LMULMAX1-RV32-NEXT:    sub a1, a1, a4
+; LMULMAX1-RV32-NEXT:    and a4, a1, a2
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 2
 ; LMULMAX1-RV32-NEXT:    and a1, a1, a2
-; LMULMAX1-RV32-NEXT:    add a1, a3, a1
+; LMULMAX1-RV32-NEXT:    add a1, a4, a1
 ; LMULMAX1-RV32-NEXT:    srli a2, a1, 4
 ; LMULMAX1-RV32-NEXT:    add a1, a1, a2
-; LMULMAX1-RV32-NEXT:    and a1, a1, a4
+; LMULMAX1-RV32-NEXT:    and a1, a1, a3
 ; LMULMAX1-RV32-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV32-NEXT:    srli a1, a1, 24
 ; LMULMAX1-RV32-NEXT:    sw a1, 20(sp)
@@ -2451,79 +2451,79 @@ define void @cttz_v4i32(<4 x i32>* %x, <4 x i32>* %y) {
 ; LMULMAX1-RV64-NEXT:    and a4, a4, a3
 ; LMULMAX1-RV64-NEXT:    add a4, a5, a4
 ; LMULMAX1-RV64-NEXT:    srli a5, a4, 4
-; LMULMAX1-RV64-NEXT:    add a4, a4, a5
-; LMULMAX1-RV64-NEXT:    lui a5, 3855
-; LMULMAX1-RV64-NEXT:    addiw a5, a5, 241
-; LMULMAX1-RV64-NEXT:    slli a5, a5, 12
-; LMULMAX1-RV64-NEXT:    addi a5, a5, -241
-; LMULMAX1-RV64-NEXT:    slli a5, a5, 12
-; LMULMAX1-RV64-NEXT:    addi a5, a5, 241
-; LMULMAX1-RV64-NEXT:    slli a5, a5, 12
-; LMULMAX1-RV64-NEXT:    addi a7, a5, -241
-; LMULMAX1-RV64-NEXT:    and a4, a4, a7
-; LMULMAX1-RV64-NEXT:    lui a2, 4112
-; LMULMAX1-RV64-NEXT:    addiw a2, a2, 257
-; LMULMAX1-RV64-NEXT:    slli a2, a2, 16
-; LMULMAX1-RV64-NEXT:    addi a2, a2, 257
-; LMULMAX1-RV64-NEXT:    slli a2, a2, 16
-; LMULMAX1-RV64-NEXT:    addi a2, a2, 257
-; LMULMAX1-RV64-NEXT:    mul a4, a4, a2
-; LMULMAX1-RV64-NEXT:    srli a4, a4, 56
-; LMULMAX1-RV64-NEXT:    sw a4, 28(sp)
+; LMULMAX1-RV64-NEXT:    add a5, a4, a5
+; LMULMAX1-RV64-NEXT:    lui a4, 3855
+; LMULMAX1-RV64-NEXT:    addiw a4, a4, 241
+; LMULMAX1-RV64-NEXT:    slli a4, a4, 12
+; LMULMAX1-RV64-NEXT:    addi a4, a4, -241
+; LMULMAX1-RV64-NEXT:    slli a4, a4, 12
+; LMULMAX1-RV64-NEXT:    addi a4, a4, 241
+; LMULMAX1-RV64-NEXT:    slli a4, a4, 12
+; LMULMAX1-RV64-NEXT:    addi a7, a4, -241
+; LMULMAX1-RV64-NEXT:    and a2, a5, a7
+; LMULMAX1-RV64-NEXT:    lui a5, 4112
+; LMULMAX1-RV64-NEXT:    addiw a5, a5, 257
+; LMULMAX1-RV64-NEXT:    slli a5, a5, 16
+; LMULMAX1-RV64-NEXT:    addi a5, a5, 257
+; LMULMAX1-RV64-NEXT:    slli a5, a5, 16
+; LMULMAX1-RV64-NEXT:    addi a5, a5, 257
+; LMULMAX1-RV64-NEXT:    mul a2, a2, a5
+; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
+; LMULMAX1-RV64-NEXT:    sw a2, 28(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 2
-; LMULMAX1-RV64-NEXT:    vmv.x.s a4, v26
-; LMULMAX1-RV64-NEXT:    or a4, a4, a1
-; LMULMAX1-RV64-NEXT:    addi a5, a4, -1
-; LMULMAX1-RV64-NEXT:    not a4, a4
-; LMULMAX1-RV64-NEXT:    and a4, a4, a5
-; LMULMAX1-RV64-NEXT:    srli a5, a4, 1
-; LMULMAX1-RV64-NEXT:    and a5, a5, a6
-; LMULMAX1-RV64-NEXT:    sub a4, a4, a5
-; LMULMAX1-RV64-NEXT:    and a5, a4, a3
-; LMULMAX1-RV64-NEXT:    srli a4, a4, 2
-; LMULMAX1-RV64-NEXT:    and a4, a4, a3
-; LMULMAX1-RV64-NEXT:    add a4, a5, a4
-; LMULMAX1-RV64-NEXT:    srli a5, a4, 4
-; LMULMAX1-RV64-NEXT:    add a4, a4, a5
-; LMULMAX1-RV64-NEXT:    and a4, a4, a7
-; LMULMAX1-RV64-NEXT:    mul a4, a4, a2
-; LMULMAX1-RV64-NEXT:    srli a4, a4, 56
-; LMULMAX1-RV64-NEXT:    sw a4, 24(sp)
+; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v26
+; LMULMAX1-RV64-NEXT:    or a2, a2, a1
+; LMULMAX1-RV64-NEXT:    addi a4, a2, -1
+; LMULMAX1-RV64-NEXT:    not a2, a2
+; LMULMAX1-RV64-NEXT:    and a2, a2, a4
+; LMULMAX1-RV64-NEXT:    srli a4, a2, 1
+; LMULMAX1-RV64-NEXT:    and a4, a4, a6
+; LMULMAX1-RV64-NEXT:    sub a2, a2, a4
+; LMULMAX1-RV64-NEXT:    and a4, a2, a3
+; LMULMAX1-RV64-NEXT:    srli a2, a2, 2
+; LMULMAX1-RV64-NEXT:    and a2, a2, a3
+; LMULMAX1-RV64-NEXT:    add a2, a4, a2
+; LMULMAX1-RV64-NEXT:    srli a4, a2, 4
+; LMULMAX1-RV64-NEXT:    add a2, a2, a4
+; LMULMAX1-RV64-NEXT:    and a2, a2, a7
+; LMULMAX1-RV64-NEXT:    mul a2, a2, a5
+; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
+; LMULMAX1-RV64-NEXT:    sw a2, 24(sp)
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 1
-; LMULMAX1-RV64-NEXT:    vmv.x.s a4, v26
-; LMULMAX1-RV64-NEXT:    or a4, a4, a1
-; LMULMAX1-RV64-NEXT:    addi a5, a4, -1
-; LMULMAX1-RV64-NEXT:    not a4, a4
-; LMULMAX1-RV64-NEXT:    and a4, a4, a5
-; LMULMAX1-RV64-NEXT:    srli a5, a4, 1
-; LMULMAX1-RV64-NEXT:    and a5, a5, a6
-; LMULMAX1-RV64-NEXT:    sub a4, a4, a5
-; LMULMAX1-RV64-NEXT:    and a5, a4, a3
-; LMULMAX1-RV64-NEXT:    srli a4, a4, 2
-; LMULMAX1-RV64-NEXT:    and a4, a4, a3
-; LMULMAX1-RV64-NEXT:    add a4, a5, a4
-; LMULMAX1-RV64-NEXT:    srli a5, a4, 4
-; LMULMAX1-RV64-NEXT:    add a4, a4, a5
-; LMULMAX1-RV64-NEXT:    and a4, a4, a7
-; LMULMAX1-RV64-NEXT:    mul a4, a4, a2
-; LMULMAX1-RV64-NEXT:    srli a4, a4, 56
-; LMULMAX1-RV64-NEXT:    sw a4, 20(sp)
-; LMULMAX1-RV64-NEXT:    vmv.x.s a4, v25
-; LMULMAX1-RV64-NEXT:    or a1, a4, a1
-; LMULMAX1-RV64-NEXT:    addi a4, a1, -1
-; LMULMAX1-RV64-NEXT:    not a1, a1
-; LMULMAX1-RV64-NEXT:    and a1, a1, a4
-; LMULMAX1-RV64-NEXT:    srli a4, a1, 1
+; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v26
+; LMULMAX1-RV64-NEXT:    or a2, a2, a1
+; LMULMAX1-RV64-NEXT:    addi a4, a2, -1
+; LMULMAX1-RV64-NEXT:    not a2, a2
+; LMULMAX1-RV64-NEXT:    and a2, a2, a4
+; LMULMAX1-RV64-NEXT:    srli a4, a2, 1
 ; LMULMAX1-RV64-NEXT:    and a4, a4, a6
-; LMULMAX1-RV64-NEXT:    sub a1, a1, a4
-; LMULMAX1-RV64-NEXT:    and a4, a1, a3
+; LMULMAX1-RV64-NEXT:    sub a2, a2, a4
+; LMULMAX1-RV64-NEXT:    and a4, a2, a3
+; LMULMAX1-RV64-NEXT:    srli a2, a2, 2
+; LMULMAX1-RV64-NEXT:    and a2, a2, a3
+; LMULMAX1-RV64-NEXT:    add a2, a4, a2
+; LMULMAX1-RV64-NEXT:    srli a4, a2, 4
+; LMULMAX1-RV64-NEXT:    add a2, a2, a4
+; LMULMAX1-RV64-NEXT:    and a2, a2, a7
+; LMULMAX1-RV64-NEXT:    mul a2, a2, a5
+; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
+; LMULMAX1-RV64-NEXT:    sw a2, 20(sp)
+; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v25
+; LMULMAX1-RV64-NEXT:    or a1, a2, a1
+; LMULMAX1-RV64-NEXT:    addi a2, a1, -1
+; LMULMAX1-RV64-NEXT:    not a1, a1
+; LMULMAX1-RV64-NEXT:    and a1, a1, a2
+; LMULMAX1-RV64-NEXT:    srli a2, a1, 1
+; LMULMAX1-RV64-NEXT:    and a2, a2, a6
+; LMULMAX1-RV64-NEXT:    sub a1, a1, a2
+; LMULMAX1-RV64-NEXT:    and a2, a1, a3
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 2
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a3
-; LMULMAX1-RV64-NEXT:    add a1, a4, a1
-; LMULMAX1-RV64-NEXT:    srli a3, a1, 4
-; LMULMAX1-RV64-NEXT:    add a1, a1, a3
+; LMULMAX1-RV64-NEXT:    add a1, a2, a1
+; LMULMAX1-RV64-NEXT:    srli a2, a1, 4
+; LMULMAX1-RV64-NEXT:    add a1, a1, a2
 ; LMULMAX1-RV64-NEXT:    and a1, a1, a7
-; LMULMAX1-RV64-NEXT:    mul a1, a1, a2
+; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    sw a1, 16(sp)
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
@@ -7902,75 +7902,75 @@ define void @cttz_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX2-RV64-NEXT:    and a3, a3, a2
 ; LMULMAX2-RV64-NEXT:    add a3, a4, a3
 ; LMULMAX2-RV64-NEXT:    srli a4, a3, 4
-; LMULMAX2-RV64-NEXT:    add a3, a3, a4
-; LMULMAX2-RV64-NEXT:    lui a4, 3855
-; LMULMAX2-RV64-NEXT:    addiw a4, a4, 241
-; LMULMAX2-RV64-NEXT:    slli a4, a4, 12
-; LMULMAX2-RV64-NEXT:    addi a4, a4, -241
-; LMULMAX2-RV64-NEXT:    slli a4, a4, 12
-; LMULMAX2-RV64-NEXT:    addi a4, a4, 241
-; LMULMAX2-RV64-NEXT:    slli a4, a4, 12
-; LMULMAX2-RV64-NEXT:    addi a4, a4, -241
-; LMULMAX2-RV64-NEXT:    and a3, a3, a4
+; LMULMAX2-RV64-NEXT:    add a4, a3, a4
+; LMULMAX2-RV64-NEXT:    lui a3, 3855
+; LMULMAX2-RV64-NEXT:    addiw a3, a3, 241
+; LMULMAX2-RV64-NEXT:    slli a3, a3, 12
+; LMULMAX2-RV64-NEXT:    addi a3, a3, -241
+; LMULMAX2-RV64-NEXT:    slli a3, a3, 12
+; LMULMAX2-RV64-NEXT:    addi a3, a3, 241
+; LMULMAX2-RV64-NEXT:    slli a3, a3, 12
+; LMULMAX2-RV64-NEXT:    addi a3, a3, -241
+; LMULMAX2-RV64-NEXT:    and a4, a4, a3
 ; LMULMAX2-RV64-NEXT:    lui a5, 4112
 ; LMULMAX2-RV64-NEXT:    addiw a5, a5, 257
 ; LMULMAX2-RV64-NEXT:    slli a5, a5, 16
 ; LMULMAX2-RV64-NEXT:    addi a5, a5, 257
 ; LMULMAX2-RV64-NEXT:    slli a5, a5, 16
 ; LMULMAX2-RV64-NEXT:    addi a5, a5, 257
-; LMULMAX2-RV64-NEXT:    mul a3, a3, a5
-; LMULMAX2-RV64-NEXT:    srli a3, a3, 56
-; LMULMAX2-RV64-NEXT:    sd a3, 56(sp)
+; LMULMAX2-RV64-NEXT:    mul a4, a4, a5
+; LMULMAX2-RV64-NEXT:    srli a4, a4, 56
+; LMULMAX2-RV64-NEXT:    sd a4, 56(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 2
-; LMULMAX2-RV64-NEXT:    vmv.x.s a3, v28
-; LMULMAX2-RV64-NEXT:    addi a1, a3, -1
-; LMULMAX2-RV64-NEXT:    not a3, a3
-; LMULMAX2-RV64-NEXT:    and a1, a3, a1
-; LMULMAX2-RV64-NEXT:    srli a3, a1, 1
-; LMULMAX2-RV64-NEXT:    and a3, a3, a6
-; LMULMAX2-RV64-NEXT:    sub a1, a1, a3
-; LMULMAX2-RV64-NEXT:    and a3, a1, a2
+; LMULMAX2-RV64-NEXT:    vmv.x.s a4, v28
+; LMULMAX2-RV64-NEXT:    addi a1, a4, -1
+; LMULMAX2-RV64-NEXT:    not a4, a4
+; LMULMAX2-RV64-NEXT:    and a1, a4, a1
+; LMULMAX2-RV64-NEXT:    srli a4, a1, 1
+; LMULMAX2-RV64-NEXT:    and a4, a4, a6
+; LMULMAX2-RV64-NEXT:    sub a1, a1, a4
+; LMULMAX2-RV64-NEXT:    and a4, a1, a2
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 2
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a2
-; LMULMAX2-RV64-NEXT:    add a1, a3, a1
-; LMULMAX2-RV64-NEXT:    srli a3, a1, 4
-; LMULMAX2-RV64-NEXT:    add a1, a1, a3
-; LMULMAX2-RV64-NEXT:    and a1, a1, a4
+; LMULMAX2-RV64-NEXT:    add a1, a4, a1
+; LMULMAX2-RV64-NEXT:    srli a4, a1, 4
+; LMULMAX2-RV64-NEXT:    add a1, a1, a4
+; LMULMAX2-RV64-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    sd a1, 48(sp)
 ; LMULMAX2-RV64-NEXT:    vslidedown.vi v28, v26, 1
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v28
-; LMULMAX2-RV64-NEXT:    addi a3, a1, -1
+; LMULMAX2-RV64-NEXT:    addi a4, a1, -1
 ; LMULMAX2-RV64-NEXT:    not a1, a1
-; LMULMAX2-RV64-NEXT:    and a1, a1, a3
-; LMULMAX2-RV64-NEXT:    srli a3, a1, 1
-; LMULMAX2-RV64-NEXT:    and a3, a3, a6
-; LMULMAX2-RV64-NEXT:    sub a1, a1, a3
-; LMULMAX2-RV64-NEXT:    and a3, a1, a2
+; LMULMAX2-RV64-NEXT:    and a1, a1, a4
+; LMULMAX2-RV64-NEXT:    srli a4, a1, 1
+; LMULMAX2-RV64-NEXT:    and a4, a4, a6
+; LMULMAX2-RV64-NEXT:    sub a1, a1, a4
+; LMULMAX2-RV64-NEXT:    and a4, a1, a2
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 2
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a2
-; LMULMAX2-RV64-NEXT:    add a1, a3, a1
-; LMULMAX2-RV64-NEXT:    srli a3, a1, 4
-; LMULMAX2-RV64-NEXT:    add a1, a1, a3
-; LMULMAX2-RV64-NEXT:    and a1, a1, a4
+; LMULMAX2-RV64-NEXT:    add a1, a4, a1
+; LMULMAX2-RV64-NEXT:    srli a4, a1, 4
+; LMULMAX2-RV64-NEXT:    add a1, a1, a4
+; LMULMAX2-RV64-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    sd a1, 40(sp)
 ; LMULMAX2-RV64-NEXT:    vmv.x.s a1, v26
-; LMULMAX2-RV64-NEXT:    addi a3, a1, -1
+; LMULMAX2-RV64-NEXT:    addi a4, a1, -1
 ; LMULMAX2-RV64-NEXT:    not a1, a1
-; LMULMAX2-RV64-NEXT:    and a1, a1, a3
-; LMULMAX2-RV64-NEXT:    srli a3, a1, 1
-; LMULMAX2-RV64-NEXT:    and a3, a3, a6
-; LMULMAX2-RV64-NEXT:    sub a1, a1, a3
-; LMULMAX2-RV64-NEXT:    and a3, a1, a2
+; LMULMAX2-RV64-NEXT:    and a1, a1, a4
+; LMULMAX2-RV64-NEXT:    srli a4, a1, 1
+; LMULMAX2-RV64-NEXT:    and a4, a4, a6
+; LMULMAX2-RV64-NEXT:    sub a1, a1, a4
+; LMULMAX2-RV64-NEXT:    and a4, a1, a2
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 2
 ; LMULMAX2-RV64-NEXT:    and a1, a1, a2
-; LMULMAX2-RV64-NEXT:    add a1, a3, a1
+; LMULMAX2-RV64-NEXT:    add a1, a4, a1
 ; LMULMAX2-RV64-NEXT:    srli a2, a1, 4
 ; LMULMAX2-RV64-NEXT:    add a1, a1, a2
-; LMULMAX2-RV64-NEXT:    and a1, a1, a4
+; LMULMAX2-RV64-NEXT:    and a1, a1, a3
 ; LMULMAX2-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX2-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX2-RV64-NEXT:    sd a1, 32(sp)
@@ -8216,79 +8216,79 @@ define void @cttz_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX1-RV64-NEXT:    and a4, a4, a3
 ; LMULMAX1-RV64-NEXT:    add a4, a5, a4
 ; LMULMAX1-RV64-NEXT:    srli a5, a4, 4
-; LMULMAX1-RV64-NEXT:    add a4, a4, a5
-; LMULMAX1-RV64-NEXT:    lui a5, 3855
-; LMULMAX1-RV64-NEXT:    addiw a5, a5, 241
-; LMULMAX1-RV64-NEXT:    slli a5, a5, 12
-; LMULMAX1-RV64-NEXT:    addi a5, a5, -241
-; LMULMAX1-RV64-NEXT:    slli a5, a5, 12
-; LMULMAX1-RV64-NEXT:    addi a5, a5, 241
-; LMULMAX1-RV64-NEXT:    slli a5, a5, 12
-; LMULMAX1-RV64-NEXT:    addi a5, a5, -241
-; LMULMAX1-RV64-NEXT:    and a4, a4, a5
-; LMULMAX1-RV64-NEXT:    lui a1, 4112
-; LMULMAX1-RV64-NEXT:    addiw a1, a1, 257
-; LMULMAX1-RV64-NEXT:    slli a1, a1, 16
-; LMULMAX1-RV64-NEXT:    addi a1, a1, 257
-; LMULMAX1-RV64-NEXT:    slli a1, a1, 16
-; LMULMAX1-RV64-NEXT:    addi a1, a1, 257
-; LMULMAX1-RV64-NEXT:    mul a4, a4, a1
-; LMULMAX1-RV64-NEXT:    srli a4, a4, 56
+; LMULMAX1-RV64-NEXT:    add a5, a4, a5
+; LMULMAX1-RV64-NEXT:    lui a4, 3855
+; LMULMAX1-RV64-NEXT:    addiw a4, a4, 241
+; LMULMAX1-RV64-NEXT:    slli a4, a4, 12
+; LMULMAX1-RV64-NEXT:    addi a4, a4, -241
+; LMULMAX1-RV64-NEXT:    slli a4, a4, 12
+; LMULMAX1-RV64-NEXT:    addi a4, a4, 241
+; LMULMAX1-RV64-NEXT:    slli a4, a4, 12
+; LMULMAX1-RV64-NEXT:    addi a4, a4, -241
+; LMULMAX1-RV64-NEXT:    and a1, a5, a4
+; LMULMAX1-RV64-NEXT:    lui a5, 4112
+; LMULMAX1-RV64-NEXT:    addiw a5, a5, 257
+; LMULMAX1-RV64-NEXT:    slli a5, a5, 16
+; LMULMAX1-RV64-NEXT:    addi a5, a5, 257
+; LMULMAX1-RV64-NEXT:    slli a5, a5, 16
+; LMULMAX1-RV64-NEXT:    addi a5, a5, 257
+; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
+; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
-; LMULMAX1-RV64-NEXT:    vmv.v.x v27, a4
-; LMULMAX1-RV64-NEXT:    vmv.x.s a4, v26
-; LMULMAX1-RV64-NEXT:    addi a2, a4, -1
-; LMULMAX1-RV64-NEXT:    not a4, a4
-; LMULMAX1-RV64-NEXT:    and a2, a4, a2
-; LMULMAX1-RV64-NEXT:    srli a4, a2, 1
-; LMULMAX1-RV64-NEXT:    and a4, a4, a7
-; LMULMAX1-RV64-NEXT:    sub a2, a2, a4
-; LMULMAX1-RV64-NEXT:    and a4, a2, a3
-; LMULMAX1-RV64-NEXT:    srli a2, a2, 2
-; LMULMAX1-RV64-NEXT:    and a2, a2, a3
-; LMULMAX1-RV64-NEXT:    add a2, a4, a2
-; LMULMAX1-RV64-NEXT:    srli a4, a2, 4
-; LMULMAX1-RV64-NEXT:    add a2, a2, a4
-; LMULMAX1-RV64-NEXT:    and a2, a2, a5
-; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
-; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
+; LMULMAX1-RV64-NEXT:    vmv.v.x v27, a1
+; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
+; LMULMAX1-RV64-NEXT:    addi a2, a1, -1
+; LMULMAX1-RV64-NEXT:    not a1, a1
+; LMULMAX1-RV64-NEXT:    and a1, a1, a2
+; LMULMAX1-RV64-NEXT:    srli a2, a1, 1
+; LMULMAX1-RV64-NEXT:    and a2, a2, a7
+; LMULMAX1-RV64-NEXT:    sub a1, a1, a2
+; LMULMAX1-RV64-NEXT:    and a2, a1, a3
+; LMULMAX1-RV64-NEXT:    srli a1, a1, 2
+; LMULMAX1-RV64-NEXT:    and a1, a1, a3
+; LMULMAX1-RV64-NEXT:    add a1, a2, a1
+; LMULMAX1-RV64-NEXT:    srli a2, a1, 4
+; LMULMAX1-RV64-NEXT:    add a1, a1, a2
+; LMULMAX1-RV64-NEXT:    and a1, a1, a4
+; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
+; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    vsetvli zero, zero, e64, m1, tu, mu
-; LMULMAX1-RV64-NEXT:    vmv.s.x v27, a2
+; LMULMAX1-RV64-NEXT:    vmv.s.x v27, a1
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; LMULMAX1-RV64-NEXT:    vslidedown.vi v26, v25, 1
-; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v26
-; LMULMAX1-RV64-NEXT:    addi a4, a2, -1
-; LMULMAX1-RV64-NEXT:    not a2, a2
-; LMULMAX1-RV64-NEXT:    and a2, a2, a4
-; LMULMAX1-RV64-NEXT:    srli a4, a2, 1
-; LMULMAX1-RV64-NEXT:    and a4, a4, a7
-; LMULMAX1-RV64-NEXT:    sub a2, a2, a4
-; LMULMAX1-RV64-NEXT:    and a4, a2, a3
-; LMULMAX1-RV64-NEXT:    srli a2, a2, 2
-; LMULMAX1-RV64-NEXT:    and a2, a2, a3
-; LMULMAX1-RV64-NEXT:    add a2, a4, a2
-; LMULMAX1-RV64-NEXT:    srli a4, a2, 4
-; LMULMAX1-RV64-NEXT:    add a2, a2, a4
-; LMULMAX1-RV64-NEXT:    and a2, a2, a5
-; LMULMAX1-RV64-NEXT:    mul a2, a2, a1
-; LMULMAX1-RV64-NEXT:    srli a2, a2, 56
+; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v26
+; LMULMAX1-RV64-NEXT:    addi a2, a1, -1
+; LMULMAX1-RV64-NEXT:    not a1, a1
+; LMULMAX1-RV64-NEXT:    and a1, a1, a2
+; LMULMAX1-RV64-NEXT:    srli a2, a1, 1
+; LMULMAX1-RV64-NEXT:    and a2, a2, a7
+; LMULMAX1-RV64-NEXT:    sub a1, a1, a2
+; LMULMAX1-RV64-NEXT:    and a2, a1, a3
+; LMULMAX1-RV64-NEXT:    srli a1, a1, 2
+; LMULMAX1-RV64-NEXT:    and a1, a1, a3
+; LMULMAX1-RV64-NEXT:    add a1, a2, a1
+; LMULMAX1-RV64-NEXT:    srli a2, a1, 4
+; LMULMAX1-RV64-NEXT:    add a1, a1, a2
+; LMULMAX1-RV64-NEXT:    and a1, a1, a4
+; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
+; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
-; LMULMAX1-RV64-NEXT:    vmv.v.x v26, a2
-; LMULMAX1-RV64-NEXT:    vmv.x.s a2, v25
-; LMULMAX1-RV64-NEXT:    addi a4, a2, -1
-; LMULMAX1-RV64-NEXT:    not a2, a2
-; LMULMAX1-RV64-NEXT:    and a2, a2, a4
-; LMULMAX1-RV64-NEXT:    srli a4, a2, 1
-; LMULMAX1-RV64-NEXT:    and a4, a4, a7
-; LMULMAX1-RV64-NEXT:    sub a2, a2, a4
-; LMULMAX1-RV64-NEXT:    and a4, a2, a3
-; LMULMAX1-RV64-NEXT:    srli a2, a2, 2
-; LMULMAX1-RV64-NEXT:    and a2, a2, a3
-; LMULMAX1-RV64-NEXT:    add a2, a4, a2
-; LMULMAX1-RV64-NEXT:    srli a3, a2, 4
-; LMULMAX1-RV64-NEXT:    add a2, a2, a3
-; LMULMAX1-RV64-NEXT:    and a2, a2, a5
-; LMULMAX1-RV64-NEXT:    mul a1, a2, a1
+; LMULMAX1-RV64-NEXT:    vmv.v.x v26, a1
+; LMULMAX1-RV64-NEXT:    vmv.x.s a1, v25
+; LMULMAX1-RV64-NEXT:    addi a2, a1, -1
+; LMULMAX1-RV64-NEXT:    not a1, a1
+; LMULMAX1-RV64-NEXT:    and a1, a1, a2
+; LMULMAX1-RV64-NEXT:    srli a2, a1, 1
+; LMULMAX1-RV64-NEXT:    and a2, a2, a7
+; LMULMAX1-RV64-NEXT:    sub a1, a1, a2
+; LMULMAX1-RV64-NEXT:    and a2, a1, a3
+; LMULMAX1-RV64-NEXT:    srli a1, a1, 2
+; LMULMAX1-RV64-NEXT:    and a1, a1, a3
+; LMULMAX1-RV64-NEXT:    add a1, a2, a1
+; LMULMAX1-RV64-NEXT:    srli a2, a1, 4
+; LMULMAX1-RV64-NEXT:    add a1, a1, a2
+; LMULMAX1-RV64-NEXT:    and a1, a1, a4
+; LMULMAX1-RV64-NEXT:    mul a1, a1, a5
 ; LMULMAX1-RV64-NEXT:    srli a1, a1, 56
 ; LMULMAX1-RV64-NEXT:    vsetvli zero, zero, e64, m1, tu, mu
 ; LMULMAX1-RV64-NEXT:    vmv.s.x v26, a1

diff  --git a/llvm/test/CodeGen/RISCV/stack-store-check.ll b/llvm/test/CodeGen/RISCV/stack-store-check.ll
index d9770a94e9f09..7b2e3bc4846e8 100644
--- a/llvm/test/CodeGen/RISCV/stack-store-check.ll
+++ b/llvm/test/CodeGen/RISCV/stack-store-check.ll
@@ -32,12 +32,12 @@ define void @main() local_unnamed_addr nounwind {
 ; CHECK-NEXT:    lw s6, %lo(U)(a0)
 ; CHECK-NEXT:    lw s7, %lo(U+4)(a0)
 ; CHECK-NEXT:    lw s8, %lo(U+8)(a0)
-; CHECK-NEXT:    lw s0, %lo(U+12)(a0)
+; CHECK-NEXT:    lw s2, %lo(U+12)(a0)
 ; CHECK-NEXT:    sw zero, 612(sp)
 ; CHECK-NEXT:    sw zero, 608(sp)
 ; CHECK-NEXT:    sw zero, 604(sp)
 ; CHECK-NEXT:    sw zero, 600(sp)
-; CHECK-NEXT:    sw s0, 596(sp)
+; CHECK-NEXT:    sw s2, 596(sp)
 ; CHECK-NEXT:    sw s8, 592(sp)
 ; CHECK-NEXT:    sw s7, 588(sp)
 ; CHECK-NEXT:    addi a0, sp, 616
@@ -45,21 +45,21 @@ define void @main() local_unnamed_addr nounwind {
 ; CHECK-NEXT:    addi a2, sp, 584
 ; CHECK-NEXT:    sw s6, 584(sp)
 ; CHECK-NEXT:    call __subtf3@plt
-; CHECK-NEXT:    lw s3, 616(sp)
-; CHECK-NEXT:    lw s4, 620(sp)
-; CHECK-NEXT:    lw s9, 624(sp)
+; CHECK-NEXT:    lw s4, 616(sp)
+; CHECK-NEXT:    lw s5, 620(sp)
+; CHECK-NEXT:    lw s3, 624(sp)
 ; CHECK-NEXT:    lw s11, 628(sp)
-; CHECK-NEXT:    sw s0, 548(sp)
+; CHECK-NEXT:    sw s2, 548(sp)
 ; CHECK-NEXT:    sw s8, 544(sp)
 ; CHECK-NEXT:    sw s7, 540(sp)
 ; CHECK-NEXT:    sw s6, 536(sp)
 ; CHECK-NEXT:    sw s11, 564(sp)
-; CHECK-NEXT:    sw s9, 560(sp)
-; CHECK-NEXT:    sw s4, 556(sp)
+; CHECK-NEXT:    sw s3, 560(sp)
+; CHECK-NEXT:    sw s5, 556(sp)
 ; CHECK-NEXT:    addi a0, sp, 568
 ; CHECK-NEXT:    addi a1, sp, 552
 ; CHECK-NEXT:    addi a2, sp, 536
-; CHECK-NEXT:    sw s3, 552(sp)
+; CHECK-NEXT:    sw s4, 552(sp)
 ; CHECK-NEXT:    call __subtf3@plt
 ; CHECK-NEXT:    lw a0, 568(sp)
 ; CHECK-NEXT:    sw a0, 40(sp) # 4-byte Folded Spill
@@ -68,12 +68,12 @@ define void @main() local_unnamed_addr nounwind {
 ; CHECK-NEXT:    lw a0, 576(sp)
 ; CHECK-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
 ; CHECK-NEXT:    lw a0, 580(sp)
-; CHECK-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
+; CHECK-NEXT:    sw a0, 48(sp) # 4-byte Folded Spill
 ; CHECK-NEXT:    sw zero, 500(sp)
 ; CHECK-NEXT:    sw zero, 496(sp)
 ; CHECK-NEXT:    sw zero, 492(sp)
 ; CHECK-NEXT:    sw zero, 488(sp)
-; CHECK-NEXT:    sw s0, 516(sp)
+; CHECK-NEXT:    sw s2, 516(sp)
 ; CHECK-NEXT:    sw s8, 512(sp)
 ; CHECK-NEXT:    sw s7, 508(sp)
 ; CHECK-NEXT:    addi a0, sp, 520
@@ -81,31 +81,32 @@ define void @main() local_unnamed_addr nounwind {
 ; CHECK-NEXT:    addi a2, sp, 488
 ; CHECK-NEXT:    sw s6, 504(sp)
 ; CHECK-NEXT:    call __addtf3@plt
-; CHECK-NEXT:    lw s2, 520(sp)
+; CHECK-NEXT:    lw s9, 520(sp)
 ; CHECK-NEXT:    lw s10, 524(sp)
-; CHECK-NEXT:    lw s5, 528(sp)
+; CHECK-NEXT:    lw s0, 528(sp)
+; CHECK-NEXT:    sw s0, 20(sp) # 4-byte Folded Spill
 ; CHECK-NEXT:    lw s1, 532(sp)
-; CHECK-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
+; CHECK-NEXT:    sw s1, 16(sp) # 4-byte Folded Spill
 ; CHECK-NEXT:    lui a0, %hi(Y1)
 ; CHECK-NEXT:    lw a1, %lo(Y1)(a0)
-; CHECK-NEXT:    sw a1, 48(sp) # 4-byte Folded Spill
+; CHECK-NEXT:    sw a1, 52(sp) # 4-byte Folded Spill
 ; CHECK-NEXT:    lw a2, %lo(Y1+4)(a0)
-; CHECK-NEXT:    sw a2, 52(sp) # 4-byte Folded Spill
+; CHECK-NEXT:    sw a2, 12(sp) # 4-byte Folded Spill
 ; CHECK-NEXT:    lw a3, %lo(Y1+8)(a0)
-; CHECK-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
+; CHECK-NEXT:    sw a3, 8(sp) # 4-byte Folded Spill
 ; CHECK-NEXT:    lw a0, %lo(Y1+12)(a0)
-; CHECK-NEXT:    sw a0, 0(sp) # 4-byte Folded Spill
+; CHECK-NEXT:    sw a0, 4(sp) # 4-byte Folded Spill
 ; CHECK-NEXT:    sw a0, 308(sp)
 ; CHECK-NEXT:    sw a3, 304(sp)
 ; CHECK-NEXT:    sw a2, 300(sp)
 ; CHECK-NEXT:    sw a1, 296(sp)
 ; CHECK-NEXT:    sw s11, 324(sp)
-; CHECK-NEXT:    sw s9, 320(sp)
-; CHECK-NEXT:    sw s4, 316(sp)
+; CHECK-NEXT:    sw s3, 320(sp)
+; CHECK-NEXT:    sw s5, 316(sp)
 ; CHECK-NEXT:    addi a0, sp, 328
 ; CHECK-NEXT:    addi a1, sp, 312
 ; CHECK-NEXT:    addi a2, sp, 296
-; CHECK-NEXT:    sw s3, 312(sp)
+; CHECK-NEXT:    sw s4, 312(sp)
 ; CHECK-NEXT:    call __multf3@plt
 ; CHECK-NEXT:    lw a0, 328(sp)
 ; CHECK-NEXT:    sw a0, 44(sp) # 4-byte Folded Spill
@@ -113,19 +114,18 @@ define void @main() local_unnamed_addr nounwind {
 ; CHECK-NEXT:    sw a0, 36(sp) # 4-byte Folded Spill
 ; CHECK-NEXT:    lw a0, 336(sp)
 ; CHECK-NEXT:    sw a0, 28(sp) # 4-byte Folded Spill
-; CHECK-NEXT:    lw a0, 340(sp)
-; CHECK-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; CHECK-NEXT:    sw s0, 468(sp)
+; CHECK-NEXT:    lw s4, 340(sp)
+; CHECK-NEXT:    sw s2, 468(sp)
 ; CHECK-NEXT:    sw s8, 464(sp)
 ; CHECK-NEXT:    sw s7, 460(sp)
 ; CHECK-NEXT:    sw s6, 456(sp)
 ; CHECK-NEXT:    sw s1, 452(sp)
-; CHECK-NEXT:    sw s5, 448(sp)
+; CHECK-NEXT:    sw s0, 448(sp)
 ; CHECK-NEXT:    sw s10, 444(sp)
 ; CHECK-NEXT:    addi a0, sp, 472
 ; CHECK-NEXT:    addi a1, sp, 456
 ; CHECK-NEXT:    addi a2, sp, 440
-; CHECK-NEXT:    sw s2, 440(sp)
+; CHECK-NEXT:    sw s9, 440(sp)
 ; CHECK-NEXT:    call __addtf3@plt
 ; CHECK-NEXT:    lw a3, 472(sp)
 ; CHECK-NEXT:    lw a0, 476(sp)
@@ -152,43 +152,44 @@ define void @main() local_unnamed_addr nounwind {
 ; CHECK-NEXT:    sw a2, %lo(X+8)(a4)
 ; CHECK-NEXT:    sw a3, %lo(X+4)(a4)
 ; CHECK-NEXT:    sw a0, %lo(X)(a4)
-; CHECK-NEXT:    lw s8, 0(sp) # 4-byte Folded Reload
+; CHECK-NEXT:    lw s8, 4(sp) # 4-byte Folded Reload
 ; CHECK-NEXT:    sw s8, 212(sp)
-; CHECK-NEXT:    lw s7, 4(sp) # 4-byte Folded Reload
+; CHECK-NEXT:    lw s7, 8(sp) # 4-byte Folded Reload
 ; CHECK-NEXT:    sw s7, 208(sp)
+; CHECK-NEXT:    lw s11, 12(sp) # 4-byte Folded Reload
+; CHECK-NEXT:    sw s11, 204(sp)
 ; CHECK-NEXT:    lw a0, 52(sp) # 4-byte Folded Reload
-; CHECK-NEXT:    sw a0, 204(sp)
-; CHECK-NEXT:    lw a0, 48(sp) # 4-byte Folded Reload
 ; CHECK-NEXT:    sw a0, 200(sp)
-; CHECK-NEXT:    lw s6, 16(sp) # 4-byte Folded Reload
-; CHECK-NEXT:    sw s6, 228(sp)
-; CHECK-NEXT:    lw s4, 24(sp) # 4-byte Folded Reload
-; CHECK-NEXT:    sw s4, 224(sp)
-; CHECK-NEXT:    lw s0, 32(sp) # 4-byte Folded Reload
-; CHECK-NEXT:    sw s0, 220(sp)
+; CHECK-NEXT:    lw a0, 48(sp) # 4-byte Folded Reload
+; CHECK-NEXT:    sw a0, 228(sp)
+; CHECK-NEXT:    lw s3, 24(sp) # 4-byte Folded Reload
+; CHECK-NEXT:    sw s3, 224(sp)
+; CHECK-NEXT:    lw s2, 32(sp) # 4-byte Folded Reload
+; CHECK-NEXT:    sw s2, 220(sp)
 ; CHECK-NEXT:    addi a0, sp, 232
 ; CHECK-NEXT:    addi a1, sp, 216
 ; CHECK-NEXT:    addi a2, sp, 200
 ; CHECK-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
 ; CHECK-NEXT:    sw s1, 216(sp)
 ; CHECK-NEXT:    call __multf3@plt
-; CHECK-NEXT:    lw a0, 232(sp)
-; CHECK-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; CHECK-NEXT:    lw s3, 236(sp)
-; CHECK-NEXT:    lw s9, 240(sp)
-; CHECK-NEXT:    lw s11, 244(sp)
+; CHECK-NEXT:    lw s5, 232(sp)
+; CHECK-NEXT:    lw a0, 236(sp)
+; CHECK-NEXT:    sw a0, 0(sp) # 4-byte Folded Spill
+; CHECK-NEXT:    lw s6, 240(sp)
+; CHECK-NEXT:    lw s0, 244(sp)
 ; CHECK-NEXT:    sw zero, 356(sp)
 ; CHECK-NEXT:    sw zero, 352(sp)
 ; CHECK-NEXT:    sw zero, 348(sp)
 ; CHECK-NEXT:    sw zero, 344(sp)
-; CHECK-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
+; CHECK-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
 ; CHECK-NEXT:    sw a0, 372(sp)
-; CHECK-NEXT:    sw s5, 368(sp)
+; CHECK-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
+; CHECK-NEXT:    sw a0, 368(sp)
 ; CHECK-NEXT:    sw s10, 364(sp)
 ; CHECK-NEXT:    addi a0, sp, 376
 ; CHECK-NEXT:    addi a1, sp, 360
 ; CHECK-NEXT:    addi a2, sp, 344
-; CHECK-NEXT:    sw s2, 360(sp)
+; CHECK-NEXT:    sw s9, 360(sp)
 ; CHECK-NEXT:    call __multf3@plt
 ; CHECK-NEXT:    lw a0, 376(sp)
 ; CHECK-NEXT:    lw a1, 388(sp)
@@ -199,12 +200,12 @@ define void @main() local_unnamed_addr nounwind {
 ; CHECK-NEXT:    sw a2, %lo(S+8)(a4)
 ; CHECK-NEXT:    sw a3, %lo(S+4)(a4)
 ; CHECK-NEXT:    sw a0, %lo(S)(a4)
-; CHECK-NEXT:    sw s6, 260(sp)
-; CHECK-NEXT:    sw s4, 256(sp)
-; CHECK-NEXT:    sw s0, 252(sp)
+; CHECK-NEXT:    lw a0, 48(sp) # 4-byte Folded Reload
+; CHECK-NEXT:    sw a0, 260(sp)
+; CHECK-NEXT:    sw s3, 256(sp)
+; CHECK-NEXT:    sw s2, 252(sp)
 ; CHECK-NEXT:    sw s1, 248(sp)
-; CHECK-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; CHECK-NEXT:    sw a0, 276(sp)
+; CHECK-NEXT:    sw s4, 276(sp)
 ; CHECK-NEXT:    lw a0, 28(sp) # 4-byte Folded Reload
 ; CHECK-NEXT:    sw a0, 272(sp)
 ; CHECK-NEXT:    lw a0, 36(sp) # 4-byte Folded Reload
@@ -228,14 +229,14 @@ define void @main() local_unnamed_addr nounwind {
 ; CHECK-NEXT:    sw zero, 160(sp)
 ; CHECK-NEXT:    sw zero, 156(sp)
 ; CHECK-NEXT:    sw zero, 152(sp)
-; CHECK-NEXT:    sw s11, 180(sp)
-; CHECK-NEXT:    sw s9, 176(sp)
-; CHECK-NEXT:    sw s3, 172(sp)
+; CHECK-NEXT:    sw s0, 180(sp)
+; CHECK-NEXT:    sw s6, 176(sp)
+; CHECK-NEXT:    lw a0, 0(sp) # 4-byte Folded Reload
+; CHECK-NEXT:    sw a0, 172(sp)
 ; CHECK-NEXT:    addi a0, sp, 184
 ; CHECK-NEXT:    addi a1, sp, 168
 ; CHECK-NEXT:    addi a2, sp, 152
-; CHECK-NEXT:    lw a3, 12(sp) # 4-byte Folded Reload
-; CHECK-NEXT:    sw a3, 168(sp)
+; CHECK-NEXT:    sw s5, 168(sp)
 ; CHECK-NEXT:    call __addtf3@plt
 ; CHECK-NEXT:    lw a0, 184(sp)
 ; CHECK-NEXT:    lw a1, 196(sp)
@@ -252,12 +253,11 @@ define void @main() local_unnamed_addr nounwind {
 ; CHECK-NEXT:    sw zero, 104(sp)
 ; CHECK-NEXT:    sw s8, 132(sp)
 ; CHECK-NEXT:    sw s7, 128(sp)
-; CHECK-NEXT:    lw a0, 52(sp) # 4-byte Folded Reload
-; CHECK-NEXT:    sw a0, 124(sp)
+; CHECK-NEXT:    sw s11, 124(sp)
 ; CHECK-NEXT:    addi a0, sp, 136
 ; CHECK-NEXT:    addi a1, sp, 120
 ; CHECK-NEXT:    addi a2, sp, 104
-; CHECK-NEXT:    lw a3, 48(sp) # 4-byte Folded Reload
+; CHECK-NEXT:    lw a3, 52(sp) # 4-byte Folded Reload
 ; CHECK-NEXT:    sw a3, 120(sp)
 ; CHECK-NEXT:    call __multf3@plt
 ; CHECK-NEXT:    lw a3, 136(sp)

diff  --git a/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll b/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll
index 7e9904a735237..bc1a3964a54e5 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll
@@ -1028,12 +1028,12 @@ define arm_aapcs_vfpcc <6 x i32> @test_signed_v6f64_v6i32(<6 x double> %f) {
 ; CHECK-NEXT:    vldr d0, .LCPI13_0
 ; CHECK-NEXT:    vmov r9, r4, d5
 ; CHECK-NEXT:    vmov r2, r6, d0
-; CHECK-NEXT:    vmov.f32 s20, s8
-; CHECK-NEXT:    vmov.f32 s22, s6
+; CHECK-NEXT:    vmov.f32 s22, s8
+; CHECK-NEXT:    vmov.f32 s20, s6
 ; CHECK-NEXT:    vmov.f32 s18, s4
 ; CHECK-NEXT:    vmov.f32 s24, s2
-; CHECK-NEXT:    vmov.f32 s21, s9
-; CHECK-NEXT:    vmov.f32 s23, s7
+; CHECK-NEXT:    vmov.f32 s23, s9
+; CHECK-NEXT:    vmov.f32 s21, s7
 ; CHECK-NEXT:    vmov.f32 s19, s5
 ; CHECK-NEXT:    vmov.f32 s25, s3
 ; CHECK-NEXT:    str r2, [sp, #24] @ 4-byte Spill
@@ -1054,11 +1054,11 @@ define arm_aapcs_vfpcc <6 x i32> @test_signed_v6f64_v6i32(<6 x double> %f) {
 ; CHECK-NEXT:    mov r1, r4
 ; CHECK-NEXT:    bl __aeabi_d2lz
 ; CHECK-NEXT:    mov r10, r0
-; CHECK-NEXT:    vmov r8, r0, d11
+; CHECK-NEXT:    vmov r8, r0, d10
 ; CHECK-NEXT:    cmp.w r11, #0
 ; CHECK-NEXT:    mov r2, r9
 ; CHECK-NEXT:    mov r3, r4
-; CHECK-NEXT:    vmov r7, r5, d10
+; CHECK-NEXT:    vmov r7, r5, d11
 ; CHECK-NEXT:    str r0, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT:    vmov r1, r0, d12
 ; CHECK-NEXT:    strd r1, r0, [sp, #12] @ 8-byte Folded Spill

diff  --git a/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll b/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll
index 9c8176925870d..aa3b2d4f83bca 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll
@@ -691,11 +691,11 @@ define arm_aapcs_vfpcc <5 x i32> @test_unsigned_v5f64_v5i32(<5 x double> %f) {
 ; CHECK-NEXT:    vmov r5, r6, d4
 ; CHECK-NEXT:    str r0, [sp, #28] @ 4-byte Spill
 ; CHECK-NEXT:    vmov r2, r3, d0
-; CHECK-NEXT:    vmov.f32 s18, s6
-; CHECK-NEXT:    vmov.f32 s20, s4
+; CHECK-NEXT:    vmov.f32 s20, s6
+; CHECK-NEXT:    vmov.f32 s18, s4
 ; CHECK-NEXT:    vmov.f32 s22, s2
-; CHECK-NEXT:    vmov.f32 s19, s7
-; CHECK-NEXT:    vmov.f32 s21, s5
+; CHECK-NEXT:    vmov.f32 s21, s7
+; CHECK-NEXT:    vmov.f32 s19, s5
 ; CHECK-NEXT:    vmov.f32 s23, s3
 ; CHECK-NEXT:    mov r0, r5
 ; CHECK-NEXT:    mov r1, r6
@@ -716,11 +716,11 @@ define arm_aapcs_vfpcc <5 x i32> @test_unsigned_v5f64_v5i32(<5 x double> %f) {
 ; CHECK-NEXT:    bl __aeabi_d2ulz
 ; CHECK-NEXT:    vmov r8, r1, d11
 ; CHECK-NEXT:    cmp.w r11, #0
-; CHECK-NEXT:    vmov r6, r9, d9
+; CHECK-NEXT:    vmov r6, r9, d10
 ; CHECK-NEXT:    csel r0, r0, r11, ne
 ; CHECK-NEXT:    cmp.w r10, #0
 ; CHECK-NEXT:    str r1, [sp, #12] @ 4-byte Spill
-; CHECK-NEXT:    vmov r2, r1, d10
+; CHECK-NEXT:    vmov r2, r1, d9
 ; CHECK-NEXT:    strd r2, r1, [sp, #16] @ 8-byte Folded Spill
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    movne.w r0, #-1
@@ -859,13 +859,13 @@ define arm_aapcs_vfpcc <6 x i32> @test_unsigned_v6f64_v6i32(<6 x double> %f) {
 ; CHECK-NEXT:    vldr d0, .LCPI13_0
 ; CHECK-NEXT:    vmov r5, r6, d5
 ; CHECK-NEXT:    vmov r11, r3, d0
-; CHECK-NEXT:    vmov.f32 s18, s8
+; CHECK-NEXT:    vmov.f32 s22, s8
 ; CHECK-NEXT:    vmov.f32 s20, s6
-; CHECK-NEXT:    vmov.f32 s22, s4
+; CHECK-NEXT:    vmov.f32 s18, s4
 ; CHECK-NEXT:    vmov.f32 s24, s2
-; CHECK-NEXT:    vmov.f32 s19, s9
+; CHECK-NEXT:    vmov.f32 s23, s9
 ; CHECK-NEXT:    vmov.f32 s21, s7
-; CHECK-NEXT:    vmov.f32 s23, s5
+; CHECK-NEXT:    vmov.f32 s19, s5
 ; CHECK-NEXT:    vmov.f32 s25, s3
 ; CHECK-NEXT:    str r3, [sp, #36] @ 4-byte Spill
 ; CHECK-NEXT:    mov r0, r5
@@ -888,7 +888,7 @@ define arm_aapcs_vfpcc <6 x i32> @test_unsigned_v6f64_v6i32(<6 x double> %f) {
 ; CHECK-NEXT:    bl __aeabi_d2ulz
 ; CHECK-NEXT:    vmov r10, r1, d10
 ; CHECK-NEXT:    cmp.w r8, #0
-; CHECK-NEXT:    vmov r5, r6, d9
+; CHECK-NEXT:    vmov r5, r6, d11
 ; CHECK-NEXT:    csel r0, r0, r8, ne
 ; CHECK-NEXT:    cmp r7, #0
 ; CHECK-NEXT:    str r1, [sp, #20] @ 4-byte Spill
@@ -914,7 +914,7 @@ define arm_aapcs_vfpcc <6 x i32> @test_unsigned_v6f64_v6i32(<6 x double> %f) {
 ; CHECK-NEXT:    mov r0, r5
 ; CHECK-NEXT:    mov r1, r6
 ; CHECK-NEXT:    bl __aeabi_d2ulz
-; CHECK-NEXT:    vmov r2, r1, d11
+; CHECK-NEXT:    vmov r2, r1, d9
 ; CHECK-NEXT:    cmp r4, #0
 ; CHECK-NEXT:    csel r0, r0, r4, ne
 ; CHECK-NEXT:    cmp.w r11, #0

diff  --git a/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll b/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll
index d145b6a61737b..b3f7b7d961ad0 100644
--- a/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll
@@ -112,13 +112,13 @@ define arm_aapcs_vfpcc <2 x double> @add_float64_t(<2 x double> %src1, <2 x doub
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-NEXT:    vmov q4, q1
-; CHECK-NEXT:    vmov q5, q0
-; CHECK-NEXT:    vmov r0, r1, d9
-; CHECK-NEXT:    vmov r2, r3, d11
+; CHECK-NEXT:    vmov q5, q1
+; CHECK-NEXT:    vmov q4, q0
+; CHECK-NEXT:    vmov r0, r1, d11
+; CHECK-NEXT:    vmov r2, r3, d9
 ; CHECK-NEXT:    bl __aeabi_dadd
-; CHECK-NEXT:    vmov lr, r12, d8
-; CHECK-NEXT:    vmov r2, r3, d10
+; CHECK-NEXT:    vmov lr, r12, d10
+; CHECK-NEXT:    vmov r2, r3, d8
 ; CHECK-NEXT:    vmov d9, r0, r1
 ; CHECK-NEXT:    mov r0, lr
 ; CHECK-NEXT:    mov r1, r12
@@ -243,13 +243,13 @@ define arm_aapcs_vfpcc <2 x double> @sub_float64_t(<2 x double> %src1, <2 x doub
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-NEXT:    vmov q4, q1
-; CHECK-NEXT:    vmov q5, q0
-; CHECK-NEXT:    vmov r0, r1, d9
-; CHECK-NEXT:    vmov r2, r3, d11
+; CHECK-NEXT:    vmov q5, q1
+; CHECK-NEXT:    vmov q4, q0
+; CHECK-NEXT:    vmov r0, r1, d11
+; CHECK-NEXT:    vmov r2, r3, d9
 ; CHECK-NEXT:    bl __aeabi_dsub
-; CHECK-NEXT:    vmov lr, r12, d8
-; CHECK-NEXT:    vmov r2, r3, d10
+; CHECK-NEXT:    vmov lr, r12, d10
+; CHECK-NEXT:    vmov r2, r3, d8
 ; CHECK-NEXT:    vmov d9, r0, r1
 ; CHECK-NEXT:    mov r0, lr
 ; CHECK-NEXT:    mov r1, r12
@@ -376,13 +376,13 @@ define arm_aapcs_vfpcc <2 x double> @mul_float64_t(<2 x double> %src1, <2 x doub
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-NEXT:    vmov q4, q1
-; CHECK-NEXT:    vmov q5, q0
-; CHECK-NEXT:    vmov r0, r1, d9
-; CHECK-NEXT:    vmov r2, r3, d11
+; CHECK-NEXT:    vmov q5, q1
+; CHECK-NEXT:    vmov q4, q0
+; CHECK-NEXT:    vmov r0, r1, d11
+; CHECK-NEXT:    vmov r2, r3, d9
 ; CHECK-NEXT:    bl __aeabi_dmul
-; CHECK-NEXT:    vmov lr, r12, d8
-; CHECK-NEXT:    vmov r2, r3, d10
+; CHECK-NEXT:    vmov lr, r12, d10
+; CHECK-NEXT:    vmov r2, r3, d8
 ; CHECK-NEXT:    vmov d9, r0, r1
 ; CHECK-NEXT:    mov r0, lr
 ; CHECK-NEXT:    mov r1, r12

diff  --git a/llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll
index a0c3125cd7f8b..855ea9492f525 100644
--- a/llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll
@@ -68,39 +68,39 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind {
 ; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    mov r5, r0
+; CHECK-NEXT:    mov r6, r0
 ; CHECK-NEXT:    and r0, r3, #1
-; CHECK-NEXT:    mov r4, r1
+; CHECK-NEXT:    mov r5, r1
 ; CHECK-NEXT:    rsbs r1, r0, #0
 ; CHECK-NEXT:    mov r0, r2
 ; CHECK-NEXT:    movs r2, #9
 ; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    bl __aeabi_ldivmod
-; CHECK-NEXT:    and r0, r4, #1
-; CHECK-NEXT:    mov r6, r2
+; CHECK-NEXT:    and r0, r5, #1
+; CHECK-NEXT:    mov r7, r2
 ; CHECK-NEXT:    rsbs r1, r0, #0
-; CHECK-NEXT:    mov r7, r3
-; CHECK-NEXT:    mov r0, r5
+; CHECK-NEXT:    mov r4, r3
+; CHECK-NEXT:    mov r0, r6
 ; CHECK-NEXT:    movs r2, #9
 ; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    bl __aeabi_ldivmod
 ; CHECK-NEXT:    ldr r1, [sp, #44]
 ; CHECK-NEXT:    vmov.32 d8[0], r2
 ; CHECK-NEXT:    ldr r0, [sp, #40]
-; CHECK-NEXT:    mov r4, r3
+; CHECK-NEXT:    mov r5, r3
 ; CHECK-NEXT:    and r1, r1, #1
 ; CHECK-NEXT:    mvn r2, #8
 ; CHECK-NEXT:    rsbs r1, r1, #0
 ; CHECK-NEXT:    mov.w r3, #-1
-; CHECK-NEXT:    vmov.32 d9[0], r6
+; CHECK-NEXT:    vmov.32 d9[0], r7
 ; CHECK-NEXT:    bl __aeabi_ldivmod
 ; CHECK-NEXT:    vmov.32 d16[0], r2
 ; CHECK-NEXT:    adr r0, .LCPI3_0
-; CHECK-NEXT:    vmov.32 d9[1], r7
+; CHECK-NEXT:    vmov.32 d9[1], r4
 ; CHECK-NEXT:    vld1.64 {d18, d19}, [r0:128]
 ; CHECK-NEXT:    adr r0, .LCPI3_1
 ; CHECK-NEXT:    vmov.32 d16[1], r3
-; CHECK-NEXT:    vmov.32 d8[1], r4
+; CHECK-NEXT:    vmov.32 d8[1], r5
 ; CHECK-NEXT:    vand q8, q8, q9
 ; CHECK-NEXT:    vld1.64 {d20, d21}, [r0:128]
 ; CHECK-NEXT:    adr r0, .LCPI3_2

diff  --git a/llvm/test/CodeGen/X86/2007-10-12-SpillerUnfold1.ll b/llvm/test/CodeGen/X86/2007-10-12-SpillerUnfold1.ll
index 2cd09606e843a..3f89964ab03ba 100644
--- a/llvm/test/CodeGen/X86/2007-10-12-SpillerUnfold1.ll
+++ b/llvm/test/CodeGen/X86/2007-10-12-SpillerUnfold1.ll
@@ -4,32 +4,32 @@
 define fastcc void @fht(float* %fz, i16 signext  %n) {
 ; CHECK-LABEL: fht:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    xorps %xmm0, %xmm0
-; CHECK-NEXT:    xorps %xmm2, %xmm2
-; CHECK-NEXT:    subss %xmm1, %xmm2
-; CHECK-NEXT:    movaps %xmm1, %xmm3
-; CHECK-NEXT:    mulss %xmm0, %xmm3
-; CHECK-NEXT:    addss %xmm1, %xmm3
-; CHECK-NEXT:    movaps %xmm1, %xmm4
-; CHECK-NEXT:    subss %xmm3, %xmm4
-; CHECK-NEXT:    addss %xmm1, %xmm3
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    subss %xmm3, %xmm1
+; CHECK-NEXT:    movaps %xmm3, %xmm4
+; CHECK-NEXT:    mulss %xmm0, %xmm4
+; CHECK-NEXT:    addss %xmm3, %xmm4
+; CHECK-NEXT:    movaps %xmm3, %xmm2
+; CHECK-NEXT:    subss %xmm4, %xmm2
+; CHECK-NEXT:    addss %xmm3, %xmm4
 ; CHECK-NEXT:    xorps %xmm5, %xmm5
-; CHECK-NEXT:    subss %xmm2, %xmm5
-; CHECK-NEXT:    addss %xmm0, %xmm2
-; CHECK-NEXT:    mulss %xmm0, %xmm3
+; CHECK-NEXT:    subss %xmm1, %xmm5
+; CHECK-NEXT:    addss %xmm0, %xmm1
+; CHECK-NEXT:    mulss %xmm0, %xmm4
 ; CHECK-NEXT:    mulss %xmm0, %xmm5
-; CHECK-NEXT:    addss %xmm3, %xmm5
+; CHECK-NEXT:    addss %xmm4, %xmm5
 ; CHECK-NEXT:    addss %xmm0, %xmm5
 ; CHECK-NEXT:    movss %xmm5, 0
-; CHECK-NEXT:    movss %xmm1, (%ecx)
-; CHECK-NEXT:    addss %xmm0, %xmm1
-; CHECK-NEXT:    movss %xmm1, 0
+; CHECK-NEXT:    movss %xmm3, (%ecx)
+; CHECK-NEXT:    addss %xmm0, %xmm3
+; CHECK-NEXT:    movss %xmm3, 0
+; CHECK-NEXT:    mulss %xmm0, %xmm1
 ; CHECK-NEXT:    mulss %xmm0, %xmm2
-; CHECK-NEXT:    mulss %xmm0, %xmm4
-; CHECK-NEXT:    addss %xmm2, %xmm4
-; CHECK-NEXT:    addss %xmm0, %xmm4
-; CHECK-NEXT:    movss %xmm4, (%ecx)
+; CHECK-NEXT:    addss %xmm1, %xmm2
+; CHECK-NEXT:    addss %xmm0, %xmm2
+; CHECK-NEXT:    movss %xmm2, (%ecx)
 ; CHECK-NEXT:    retl
 entry:
 	br i1 true, label %bb171.preheader, label %bb431

diff  --git a/llvm/test/CodeGen/X86/2008-04-16-ReMatBug.ll b/llvm/test/CodeGen/X86/2008-04-16-ReMatBug.ll
index 7be73f638734b..8002d621cfd5e 100644
--- a/llvm/test/CodeGen/X86/2008-04-16-ReMatBug.ll
+++ b/llvm/test/CodeGen/X86/2008-04-16-ReMatBug.ll
@@ -29,20 +29,20 @@ define i16 @SQLDriversW(i8* %henv, i16 zeroext  %fDir, i32* %szDrvDesc, i16 sign
 ; CHECK-NEXT:    movw $0, 40(%edi)
 ; CHECK-NEXT:    movb $1, %al
 ; CHECK-NEXT:    testb %al, %al
-; CHECK-NEXT:    leal (,%ecx,4), %eax
-; CHECK-NEXT:    leal (,%ebx,4), %ecx
+; CHECK-NEXT:    leal (,%ecx,4), %ecx
+; CHECK-NEXT:    leal (,%ebx,4), %edx
 ; CHECK-NEXT:    subl $12, %esp
-; CHECK-NEXT:    movzwl %bp, %edx
-; CHECK-NEXT:    cwtl
+; CHECK-NEXT:    movzwl %bp, %eax
 ; CHECK-NEXT:    movswl %cx, %ecx
+; CHECK-NEXT:    movswl %dx, %edx
 ; CHECK-NEXT:    pushl $87
 ; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
-; CHECK-NEXT:    pushl %eax
-; CHECK-NEXT:    pushl $0
-; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    pushl %ecx
 ; CHECK-NEXT:    pushl $0
+; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    pushl %edx
+; CHECK-NEXT:    pushl $0
+; CHECK-NEXT:    pushl %eax
 ; CHECK-NEXT:    pushl %edi
 ; CHECK-NEXT:    calll _SQLDrivers_Internal
 ; CHECK-NEXT:    addl $48, %esp

diff  --git a/llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll b/llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll
index a8e433eac1320..6f8661f8a2551 100644
--- a/llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll
+++ b/llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll
@@ -337,20 +337,20 @@ define i64 @t5_cse(i64 %val, i64 %shamt, i64*%dst) nounwind {
 ; X32-BMI2-NEXT:    pushl %ebx
 ; X32-BMI2-NEXT:    pushl %edi
 ; X32-BMI2-NEXT:    pushl %esi
-; X32-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X32-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X32-BMI2-NEXT:    movl %eax, %ebx
-; X32-BMI2-NEXT:    addl $32, %ebx
-; X32-BMI2-NEXT:    adcl $0, %edi
-; X32-BMI2-NEXT:    movl %ebx, (%ecx)
-; X32-BMI2-NEXT:    movl %edi, 4(%ecx)
+; X32-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-BMI2-NEXT:    movl %ebx, %edi
+; X32-BMI2-NEXT:    addl $32, %edi
+; X32-BMI2-NEXT:    adcl $0, %esi
+; X32-BMI2-NEXT:    movl %edi, (%ecx)
+; X32-BMI2-NEXT:    movl %esi, 4(%ecx)
 ; X32-BMI2-NEXT:    movb $32, %cl
-; X32-BMI2-NEXT:    subb %al, %cl
-; X32-BMI2-NEXT:    shldl %cl, %esi, %edx
-; X32-BMI2-NEXT:    shlxl %ecx, %esi, %eax
+; X32-BMI2-NEXT:    subb %bl, %cl
+; X32-BMI2-NEXT:    shldl %cl, %eax, %edx
+; X32-BMI2-NEXT:    shlxl %ecx, %eax, %eax
 ; X32-BMI2-NEXT:    testb $32, %cl
 ; X32-BMI2-NEXT:    je .LBB5_2
 ; X32-BMI2-NEXT:  # %bb.1:

diff  --git a/llvm/test/CodeGen/X86/abs.ll b/llvm/test/CodeGen/X86/abs.ll
index 0bbc785034a21..001fc3b78ac16 100644
--- a/llvm/test/CodeGen/X86/abs.ll
+++ b/llvm/test/CodeGen/X86/abs.ll
@@ -278,25 +278,25 @@ define <4 x i32> @test_v4i32(<4 x i32> %a) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    negl %edx
+; X86-NEXT:    cmovll %ebx, %edx
 ; X86-NEXT:    movl %edi, %ebx
 ; X86-NEXT:    negl %ebx
 ; X86-NEXT:    cmovll %edi, %ebx
 ; X86-NEXT:    movl %esi, %edi
 ; X86-NEXT:    negl %edi
 ; X86-NEXT:    cmovll %esi, %edi
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %ecx, %esi
 ; X86-NEXT:    negl %esi
-; X86-NEXT:    cmovll %edx, %esi
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    negl %edx
-; X86-NEXT:    cmovll %ecx, %edx
-; X86-NEXT:    movl %edx, 12(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
-; X86-NEXT:    movl %edi, 4(%eax)
-; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    cmovll %ecx, %esi
+; X86-NEXT:    movl %esi, 12(%eax)
+; X86-NEXT:    movl %edi, 8(%eax)
+; X86-NEXT:    movl %ebx, 4(%eax)
+; X86-NEXT:    movl %edx, (%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -339,31 +339,31 @@ define <8 x i32> @test_v8i32(<8 x i32> %a) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    negl %ecx
 ; X86-NEXT:    cmovll %edx, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:    movl %esi, %ecx
 ; X86-NEXT:    negl %ecx
-; X86-NEXT:    cmovll %ebp, %ecx
+; X86-NEXT:    cmovll %esi, %ecx
 ; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    negl %ebp
-; X86-NEXT:    cmovll %ebx, %ebp
-; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    negl %esi
+; X86-NEXT:    cmovll %ebx, %esi
+; X86-NEXT:    movl %ebp, %ebx
 ; X86-NEXT:    negl %ebx
-; X86-NEXT:    cmovll %edi, %ebx
-; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    cmovll %ebp, %ebx
+; X86-NEXT:    movl %edi, %ebp
+; X86-NEXT:    negl %ebp
+; X86-NEXT:    cmovll %edi, %ebp
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    negl %edi
-; X86-NEXT:    cmovll %esi, %edi
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    negl %esi
-; X86-NEXT:    cmovll %eax, %esi
+; X86-NEXT:    cmovll %eax, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    negl %eax
@@ -375,10 +375,10 @@ define <8 x i32> @test_v8i32(<8 x i32> %a) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %ecx, 28(%edx)
 ; X86-NEXT:    movl %eax, 24(%edx)
-; X86-NEXT:    movl %esi, 20(%edx)
-; X86-NEXT:    movl %edi, 16(%edx)
+; X86-NEXT:    movl %edi, 20(%edx)
+; X86-NEXT:    movl %ebp, 16(%edx)
 ; X86-NEXT:    movl %ebx, 12(%edx)
-; X86-NEXT:    movl %ebp, 8(%edx)
+; X86-NEXT:    movl %esi, 8(%edx)
 ; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, 4(%edx)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -415,31 +415,31 @@ define <8 x i16> @test_v8i16(<8 x i16> %a) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    negw %cx
 ; X86-NEXT:    cmovlw %dx, %cx
 ; X86-NEXT:    movw %cx, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
-; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:    movl %esi, %ecx
 ; X86-NEXT:    negw %cx
-; X86-NEXT:    cmovlw %bp, %cx
+; X86-NEXT:    cmovlw %si, %cx
 ; X86-NEXT:    movw %cx, (%esp) # 2-byte Spill
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    negw %bp
-; X86-NEXT:    cmovlw %bx, %bp
-; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    negw %si
+; X86-NEXT:    cmovlw %bx, %si
+; X86-NEXT:    movl %ebp, %ebx
 ; X86-NEXT:    negw %bx
-; X86-NEXT:    cmovlw %di, %bx
-; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    cmovlw %bp, %bx
+; X86-NEXT:    movl %edi, %ebp
+; X86-NEXT:    negw %bp
+; X86-NEXT:    cmovlw %di, %bp
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    negw %di
-; X86-NEXT:    cmovlw %si, %di
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    negw %si
-; X86-NEXT:    cmovlw %ax, %si
+; X86-NEXT:    cmovlw %ax, %di
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    negw %ax
@@ -451,10 +451,10 @@ define <8 x i16> @test_v8i16(<8 x i16> %a) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movw %cx, 14(%edx)
 ; X86-NEXT:    movw %ax, 12(%edx)
-; X86-NEXT:    movw %si, 10(%edx)
-; X86-NEXT:    movw %di, 8(%edx)
+; X86-NEXT:    movw %di, 10(%edx)
+; X86-NEXT:    movw %bp, 8(%edx)
 ; X86-NEXT:    movw %bx, 6(%edx)
-; X86-NEXT:    movw %bp, 4(%edx)
+; X86-NEXT:    movw %si, 4(%edx)
 ; X86-NEXT:    movzwl (%esp), %eax # 2-byte Folded Reload
 ; X86-NEXT:    movw %ax, 2(%edx)
 ; X86-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload

diff  --git a/llvm/test/CodeGen/X86/avx512-calling-conv.ll b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
index b6efdd7c6989a..4c04b07355795 100644
--- a/llvm/test/CodeGen/X86/avx512-calling-conv.ll
+++ b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
@@ -1564,11 +1564,11 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; KNL_X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; KNL_X32-NEXT:    kmovw %k1, %ebx
 ; KNL_X32-NEXT:    kshiftrw $1, %k0, %k1
-; KNL_X32-NEXT:    kmovw %k1, %esi
+; KNL_X32-NEXT:    kmovw %k1, %ebp
 ; KNL_X32-NEXT:    kshiftrw $2, %k0, %k1
-; KNL_X32-NEXT:    kmovw %k1, %edi
+; KNL_X32-NEXT:    kmovw %k1, %esi
 ; KNL_X32-NEXT:    kshiftrw $3, %k0, %k1
-; KNL_X32-NEXT:    kmovw %k1, %ebp
+; KNL_X32-NEXT:    kmovw %k1, %edi
 ; KNL_X32-NEXT:    kshiftrw $4, %k0, %k1
 ; KNL_X32-NEXT:    kmovw %k1, %edx
 ; KNL_X32-NEXT:    kshiftrw $5, %k0, %k1
@@ -1578,66 +1578,66 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; KNL_X32-NEXT:    movb %bl, 2(%eax)
 ; KNL_X32-NEXT:    kmovw %k0, %ebx
 ; KNL_X32-NEXT:    andl $1, %ebx
-; KNL_X32-NEXT:    andl $1, %esi
-; KNL_X32-NEXT:    leal (%ebx,%esi,2), %esi
-; KNL_X32-NEXT:    kmovw %k1, %ebx
+; KNL_X32-NEXT:    andl $1, %ebp
+; KNL_X32-NEXT:    leal (%ebx,%ebp,2), %ebx
+; KNL_X32-NEXT:    kmovw %k1, %ebp
 ; KNL_X32-NEXT:    kshiftrw $7, %k0, %k1
+; KNL_X32-NEXT:    andl $1, %esi
+; KNL_X32-NEXT:    leal (%ebx,%esi,4), %ebx
+; KNL_X32-NEXT:    kmovw %k1, %esi
+; KNL_X32-NEXT:    kshiftrw $8, %k0, %k1
 ; KNL_X32-NEXT:    andl $1, %edi
-; KNL_X32-NEXT:    leal (%esi,%edi,4), %esi
+; KNL_X32-NEXT:    leal (%ebx,%edi,8), %ebx
 ; KNL_X32-NEXT:    kmovw %k1, %edi
-; KNL_X32-NEXT:    kshiftrw $8, %k0, %k1
-; KNL_X32-NEXT:    andl $1, %ebp
-; KNL_X32-NEXT:    leal (%esi,%ebp,8), %esi
-; KNL_X32-NEXT:    kmovw %k1, %ebp
 ; KNL_X32-NEXT:    kshiftrw $9, %k0, %k1
 ; KNL_X32-NEXT:    andl $1, %edx
 ; KNL_X32-NEXT:    shll $4, %edx
-; KNL_X32-NEXT:    orl %esi, %edx
-; KNL_X32-NEXT:    kmovw %k1, %esi
+; KNL_X32-NEXT:    orl %ebx, %edx
+; KNL_X32-NEXT:    kmovw %k1, %ebx
 ; KNL_X32-NEXT:    kshiftrw $10, %k0, %k1
 ; KNL_X32-NEXT:    andl $1, %ecx
 ; KNL_X32-NEXT:    shll $5, %ecx
 ; KNL_X32-NEXT:    orl %edx, %ecx
 ; KNL_X32-NEXT:    kmovw %k1, %edx
 ; KNL_X32-NEXT:    kshiftrw $11, %k0, %k1
-; KNL_X32-NEXT:    andl $1, %ebx
-; KNL_X32-NEXT:    shll $6, %ebx
-; KNL_X32-NEXT:    andl $1, %edi
-; KNL_X32-NEXT:    shll $7, %edi
-; KNL_X32-NEXT:    orl %ebx, %edi
-; KNL_X32-NEXT:    kmovw %k1, %ebx
-; KNL_X32-NEXT:    kshiftrw $12, %k0, %k1
 ; KNL_X32-NEXT:    andl $1, %ebp
-; KNL_X32-NEXT:    shll $8, %ebp
-; KNL_X32-NEXT:    orl %edi, %ebp
-; KNL_X32-NEXT:    kmovw %k1, %edi
-; KNL_X32-NEXT:    kshiftrw $13, %k0, %k1
+; KNL_X32-NEXT:    shll $6, %ebp
 ; KNL_X32-NEXT:    andl $1, %esi
-; KNL_X32-NEXT:    shll $9, %esi
+; KNL_X32-NEXT:    shll $7, %esi
 ; KNL_X32-NEXT:    orl %ebp, %esi
 ; KNL_X32-NEXT:    kmovw %k1, %ebp
+; KNL_X32-NEXT:    kshiftrw $12, %k0, %k1
+; KNL_X32-NEXT:    andl $1, %edi
+; KNL_X32-NEXT:    shll $8, %edi
+; KNL_X32-NEXT:    orl %esi, %edi
+; KNL_X32-NEXT:    kmovw %k1, %esi
+; KNL_X32-NEXT:    kshiftrw $13, %k0, %k1
+; KNL_X32-NEXT:    andl $1, %ebx
+; KNL_X32-NEXT:    shll $9, %ebx
+; KNL_X32-NEXT:    orl %edi, %ebx
+; KNL_X32-NEXT:    kmovw %k1, %edi
 ; KNL_X32-NEXT:    kshiftrw $14, %k0, %k1
 ; KNL_X32-NEXT:    andl $1, %edx
 ; KNL_X32-NEXT:    shll $10, %edx
-; KNL_X32-NEXT:    orl %esi, %edx
-; KNL_X32-NEXT:    kmovw %k1, %esi
+; KNL_X32-NEXT:    orl %ebx, %edx
+; KNL_X32-NEXT:    kmovw %k1, %ebx
 ; KNL_X32-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL_X32-NEXT:    orl %ecx, %edx
 ; KNL_X32-NEXT:    kmovw %k0, %ecx
-; KNL_X32-NEXT:    andl $1, %ebx
-; KNL_X32-NEXT:    shll $11, %ebx
-; KNL_X32-NEXT:    andl $1, %edi
-; KNL_X32-NEXT:    shll $12, %edi
-; KNL_X32-NEXT:    orl %ebx, %edi
 ; KNL_X32-NEXT:    andl $1, %ebp
-; KNL_X32-NEXT:    shll $13, %ebp
-; KNL_X32-NEXT:    orl %edi, %ebp
+; KNL_X32-NEXT:    shll $11, %ebp
 ; KNL_X32-NEXT:    andl $1, %esi
-; KNL_X32-NEXT:    shll $14, %esi
+; KNL_X32-NEXT:    shll $12, %esi
 ; KNL_X32-NEXT:    orl %ebp, %esi
+; KNL_X32-NEXT:    andl $1, %edi
+; KNL_X32-NEXT:    shll $13, %edi
+; KNL_X32-NEXT:    orl %esi, %edi
+; KNL_X32-NEXT:    andl $1, %ebx
+; KNL_X32-NEXT:    shll $14, %ebx
+; KNL_X32-NEXT:    orl %edi, %ebx
 ; KNL_X32-NEXT:    andl $1, %ecx
 ; KNL_X32-NEXT:    shll $15, %ecx
-; KNL_X32-NEXT:    orl %esi, %ecx
+; KNL_X32-NEXT:    orl %ebx, %ecx
 ; KNL_X32-NEXT:    orl %edx, %ecx
 ; KNL_X32-NEXT:    movw %cx, (%eax)
 ; KNL_X32-NEXT:    addl $20, %esp

diff  --git a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll
index 605e0af8e62a7..5e53795bf1b97 100644
--- a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll
+++ b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll
@@ -947,35 +947,35 @@ define dso_local x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %
 ; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    subl %ecx, %ebx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT:    movl %esi, %ebp
-; X32-NEXT:    subl {{[0-9]+}}(%esp), %ebp
-; X32-NEXT:    imull %ebp, %ebx
-; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    subl %edi, %ebp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    subl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    imull %eax, %ebx
+; X32-NEXT:    movl %edx, %eax
+; X32-NEXT:    subl %edi, %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    imull %ebp, %ecx
+; X32-NEXT:    imull %eax, %ecx
 ; X32-NEXT:    addl %ecx, %ebx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, %ebp
-; X32-NEXT:    subl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT:    movl %edi, %esi
+; X32-NEXT:    subl {{[0-9]+}}(%esp), %esi
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    subl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    imull %ebp, %eax
+; X32-NEXT:    imull %esi, %eax
 ; X32-NEXT:    addl %eax, %ebx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT:    movl (%esp), %ebp # 4-byte Reload
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    addl {{[0-9]+}}(%esp), %edi
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %esi
-; X32-NEXT:    imull %eax, %esi
+; X32-NEXT:    addl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT:    imull %eax, %ebp
 ; X32-NEXT:    addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    imull %ebp, %edx
-; X32-NEXT:    addl %esi, %edx
+; X32-NEXT:    imull %esi, %edx
+; X32-NEXT:    addl %ebp, %edx
 ; X32-NEXT:    addl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    imull %edi, %ecx
 ; X32-NEXT:    addl %edx, %ecx

diff  --git a/llvm/test/CodeGen/X86/avx512-select.ll b/llvm/test/CodeGen/X86/avx512-select.ll
index c64a097c3b26d..6653ee0ad5573 100644
--- a/llvm/test/CodeGen/X86/avx512-select.ll
+++ b/llvm/test/CodeGen/X86/avx512-select.ll
@@ -556,22 +556,22 @@ define void @vselect_v1i1(<1 x i1>* %w, <1 x i1>* %x, <1 x i1>* %y) nounwind {
 ; X86-AVX512F-LABEL: vselect_v1i1:
 ; X86-AVX512F:       # %bb.0:
 ; X86-AVX512F-NEXT:    pushl %esi
-; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX512F-NEXT:    movzbl (%edx), %esi
+; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512F-NEXT:    movzbl (%eax), %esi
 ; X86-AVX512F-NEXT:    kmovw %esi, %k0
+; X86-AVX512F-NEXT:    movzbl (%edx), %edx
+; X86-AVX512F-NEXT:    kmovw %edx, %k1
 ; X86-AVX512F-NEXT:    movzbl (%ecx), %ecx
-; X86-AVX512F-NEXT:    kmovw %ecx, %k1
-; X86-AVX512F-NEXT:    movzbl (%eax), %eax
-; X86-AVX512F-NEXT:    kmovw %eax, %k2
+; X86-AVX512F-NEXT:    kmovw %ecx, %k2
 ; X86-AVX512F-NEXT:    kandnw %k1, %k2, %k1
 ; X86-AVX512F-NEXT:    kandw %k2, %k0, %k0
 ; X86-AVX512F-NEXT:    korw %k1, %k0, %k0
 ; X86-AVX512F-NEXT:    kshiftlw $15, %k0, %k0
 ; X86-AVX512F-NEXT:    kshiftrw $15, %k0, %k0
-; X86-AVX512F-NEXT:    kmovw %k0, %eax
-; X86-AVX512F-NEXT:    movb %al, (%edx)
+; X86-AVX512F-NEXT:    kmovw %k0, %ecx
+; X86-AVX512F-NEXT:    movb %cl, (%eax)
 ; X86-AVX512F-NEXT:    popl %esi
 ; X86-AVX512F-NEXT:    retl
 ;
@@ -595,22 +595,22 @@ define void @vselect_v1i1(<1 x i1>* %w, <1 x i1>* %x, <1 x i1>* %y) nounwind {
 ; X86-AVX512BW-LABEL: vselect_v1i1:
 ; X86-AVX512BW:       # %bb.0:
 ; X86-AVX512BW-NEXT:    pushl %esi
-; X86-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX512BW-NEXT:    movzbl (%edx), %esi
+; X86-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512BW-NEXT:    movzbl (%eax), %esi
 ; X86-AVX512BW-NEXT:    kmovd %esi, %k0
+; X86-AVX512BW-NEXT:    movzbl (%edx), %edx
+; X86-AVX512BW-NEXT:    kmovd %edx, %k1
 ; X86-AVX512BW-NEXT:    movzbl (%ecx), %ecx
-; X86-AVX512BW-NEXT:    kmovd %ecx, %k1
-; X86-AVX512BW-NEXT:    movzbl (%eax), %eax
-; X86-AVX512BW-NEXT:    kmovd %eax, %k2
+; X86-AVX512BW-NEXT:    kmovd %ecx, %k2
 ; X86-AVX512BW-NEXT:    kandnw %k1, %k2, %k1
 ; X86-AVX512BW-NEXT:    kandw %k2, %k0, %k0
 ; X86-AVX512BW-NEXT:    korw %k1, %k0, %k0
 ; X86-AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
 ; X86-AVX512BW-NEXT:    kshiftrw $15, %k0, %k0
-; X86-AVX512BW-NEXT:    kmovd %k0, %eax
-; X86-AVX512BW-NEXT:    movb %al, (%edx)
+; X86-AVX512BW-NEXT:    kmovd %k0, %ecx
+; X86-AVX512BW-NEXT:    movb %cl, (%eax)
 ; X86-AVX512BW-NEXT:    popl %esi
 ; X86-AVX512BW-NEXT:    retl
 ;

diff  --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
index d47e7bee65b96..723eae9fb2f44 100644
--- a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
@@ -1856,32 +1856,32 @@ define i64 @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) nounwind {
 ; X86-NEXT:    pushl %esi # encoding: [0x56]
 ; X86-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x74,0xc1]
 ; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
-; X86-NEXT:    kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1]
-; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9]
+; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
 ; X86-NEXT:    vpcmpgtb %zmm0, %zmm1, %k0 # encoding: [0x62,0xf1,0x75,0x48,0x64,0xc0]
 ; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
-; X86-NEXT:    kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1]
+; X86-NEXT:    kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1]
 ; X86-NEXT:    kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
-; X86-NEXT:    addl %ecx, %esi # encoding: [0x01,0xce]
-; X86-NEXT:    adcl %eax, %edx # encoding: [0x11,0xc2]
+; X86-NEXT:    addl %edx, %esi # encoding: [0x01,0xd6]
+; X86-NEXT:    adcl %ecx, %eax # encoding: [0x11,0xc8]
 ; X86-NEXT:    vpcmpleb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x02]
 ; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
-; X86-NEXT:    kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1]
-; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; X86-NEXT:    addl %esi, %ecx # encoding: [0x01,0xf1]
-; X86-NEXT:    adcl %edx, %eax # encoding: [0x11,0xd0]
+; X86-NEXT:    kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9]
+; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
+; X86-NEXT:    addl %esi, %edx # encoding: [0x01,0xf2]
+; X86-NEXT:    adcl %eax, %ecx # encoding: [0x11,0xc1]
 ; X86-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x04]
 ; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
-; X86-NEXT:    kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1]
+; X86-NEXT:    kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1]
 ; X86-NEXT:    kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
-; X86-NEXT:    addl %ecx, %esi # encoding: [0x01,0xce]
-; X86-NEXT:    adcl %eax, %edx # encoding: [0x11,0xc2]
+; X86-NEXT:    addl %edx, %esi # encoding: [0x01,0xd6]
+; X86-NEXT:    adcl %ecx, %eax # encoding: [0x11,0xc8]
 ; X86-NEXT:    vpcmpnltb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x05]
 ; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
 ; X86-NEXT:    kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9]
 ; X86-NEXT:    kmovd %k0, %edi # encoding: [0xc5,0xfb,0x93,0xf8]
 ; X86-NEXT:    addl %esi, %edi # encoding: [0x01,0xf7]
-; X86-NEXT:    adcl %edx, %ecx # encoding: [0x11,0xd1]
+; X86-NEXT:    adcl %eax, %ecx # encoding: [0x11,0xc1]
 ; X86-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x64,0xc1]
 ; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
 ; X86-NEXT:    kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1]
@@ -1942,32 +1942,32 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) nounwin
 ; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x0c]
 ; X86-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x74,0xc1]
 ; X86-NEXT:    kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20]
-; X86-NEXT:    kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2]
-; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca]
+; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
 ; X86-NEXT:    vpcmpgtb %zmm0, %zmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x64,0xc0]
 ; X86-NEXT:    kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20]
-; X86-NEXT:    kmovd %k2, %edx # encoding: [0xc5,0xfb,0x93,0xd2]
+; X86-NEXT:    kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2]
 ; X86-NEXT:    kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
-; X86-NEXT:    addl %ecx, %esi # encoding: [0x01,0xce]
-; X86-NEXT:    adcl %eax, %edx # encoding: [0x11,0xc2]
+; X86-NEXT:    addl %edx, %esi # encoding: [0x01,0xd6]
+; X86-NEXT:    adcl %ecx, %eax # encoding: [0x11,0xc8]
 ; X86-NEXT:    vpcmpleb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x02]
 ; X86-NEXT:    kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20]
-; X86-NEXT:    kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2]
-; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; X86-NEXT:    addl %esi, %ecx # encoding: [0x01,0xf1]
-; X86-NEXT:    adcl %edx, %eax # encoding: [0x11,0xd0]
+; X86-NEXT:    kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca]
+; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
+; X86-NEXT:    addl %esi, %edx # encoding: [0x01,0xf2]
+; X86-NEXT:    adcl %eax, %ecx # encoding: [0x11,0xc1]
 ; X86-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04]
 ; X86-NEXT:    kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20]
-; X86-NEXT:    kmovd %k2, %edx # encoding: [0xc5,0xfb,0x93,0xd2]
+; X86-NEXT:    kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2]
 ; X86-NEXT:    kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
-; X86-NEXT:    addl %ecx, %esi # encoding: [0x01,0xce]
-; X86-NEXT:    adcl %eax, %edx # encoding: [0x11,0xc2]
+; X86-NEXT:    addl %edx, %esi # encoding: [0x01,0xd6]
+; X86-NEXT:    adcl %ecx, %eax # encoding: [0x11,0xc8]
 ; X86-NEXT:    vpcmpnltb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x05]
 ; X86-NEXT:    kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20]
 ; X86-NEXT:    kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca]
 ; X86-NEXT:    kmovd %k0, %edi # encoding: [0xc5,0xfb,0x93,0xf8]
 ; X86-NEXT:    addl %esi, %edi # encoding: [0x01,0xf7]
-; X86-NEXT:    adcl %edx, %ecx # encoding: [0x11,0xd1]
+; X86-NEXT:    adcl %eax, %ecx # encoding: [0x11,0xc1]
 ; X86-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x64,0xc1]
 ; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
 ; X86-NEXT:    kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1]
@@ -2031,32 +2031,32 @@ define i64 @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) nounwind {
 ; X86-NEXT:    pushl %esi # encoding: [0x56]
 ; X86-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x74,0xc1]
 ; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
-; X86-NEXT:    kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1]
-; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9]
+; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
 ; X86-NEXT:    vpcmpltub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x01]
 ; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
-; X86-NEXT:    kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1]
+; X86-NEXT:    kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1]
 ; X86-NEXT:    kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
-; X86-NEXT:    addl %ecx, %esi # encoding: [0x01,0xce]
-; X86-NEXT:    adcl %eax, %edx # encoding: [0x11,0xc2]
+; X86-NEXT:    addl %edx, %esi # encoding: [0x01,0xd6]
+; X86-NEXT:    adcl %ecx, %eax # encoding: [0x11,0xc8]
 ; X86-NEXT:    vpcmpleub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x02]
 ; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
-; X86-NEXT:    kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1]
-; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; X86-NEXT:    addl %esi, %ecx # encoding: [0x01,0xf1]
-; X86-NEXT:    adcl %edx, %eax # encoding: [0x11,0xd0]
+; X86-NEXT:    kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9]
+; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
+; X86-NEXT:    addl %esi, %edx # encoding: [0x01,0xf2]
+; X86-NEXT:    adcl %eax, %ecx # encoding: [0x11,0xc1]
 ; X86-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x04]
 ; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
-; X86-NEXT:    kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1]
+; X86-NEXT:    kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1]
 ; X86-NEXT:    kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
-; X86-NEXT:    addl %ecx, %esi # encoding: [0x01,0xce]
-; X86-NEXT:    adcl %eax, %edx # encoding: [0x11,0xc2]
+; X86-NEXT:    addl %edx, %esi # encoding: [0x01,0xd6]
+; X86-NEXT:    adcl %ecx, %eax # encoding: [0x11,0xc8]
 ; X86-NEXT:    vpcmpnltub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x05]
 ; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
 ; X86-NEXT:    kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9]
 ; X86-NEXT:    kmovd %k0, %edi # encoding: [0xc5,0xfb,0x93,0xf8]
 ; X86-NEXT:    addl %esi, %edi # encoding: [0x01,0xf7]
-; X86-NEXT:    adcl %edx, %ecx # encoding: [0x11,0xd1]
+; X86-NEXT:    adcl %eax, %ecx # encoding: [0x11,0xc1]
 ; X86-NEXT:    vpcmpnleub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x06]
 ; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
 ; X86-NEXT:    kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1]
@@ -2117,32 +2117,32 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
 ; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x0c]
 ; X86-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x74,0xc1]
 ; X86-NEXT:    kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20]
-; X86-NEXT:    kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2]
-; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca]
+; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
 ; X86-NEXT:    vpcmpltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x01]
 ; X86-NEXT:    kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20]
-; X86-NEXT:    kmovd %k2, %edx # encoding: [0xc5,0xfb,0x93,0xd2]
+; X86-NEXT:    kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2]
 ; X86-NEXT:    kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
-; X86-NEXT:    addl %ecx, %esi # encoding: [0x01,0xce]
-; X86-NEXT:    adcl %eax, %edx # encoding: [0x11,0xc2]
+; X86-NEXT:    addl %edx, %esi # encoding: [0x01,0xd6]
+; X86-NEXT:    adcl %ecx, %eax # encoding: [0x11,0xc8]
 ; X86-NEXT:    vpcmpleub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x02]
 ; X86-NEXT:    kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20]
-; X86-NEXT:    kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2]
-; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; X86-NEXT:    addl %esi, %ecx # encoding: [0x01,0xf1]
-; X86-NEXT:    adcl %edx, %eax # encoding: [0x11,0xd0]
+; X86-NEXT:    kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca]
+; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
+; X86-NEXT:    addl %esi, %edx # encoding: [0x01,0xf2]
+; X86-NEXT:    adcl %eax, %ecx # encoding: [0x11,0xc1]
 ; X86-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04]
 ; X86-NEXT:    kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20]
-; X86-NEXT:    kmovd %k2, %edx # encoding: [0xc5,0xfb,0x93,0xd2]
+; X86-NEXT:    kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2]
 ; X86-NEXT:    kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
-; X86-NEXT:    addl %ecx, %esi # encoding: [0x01,0xce]
-; X86-NEXT:    adcl %eax, %edx # encoding: [0x11,0xc2]
+; X86-NEXT:    addl %edx, %esi # encoding: [0x01,0xd6]
+; X86-NEXT:    adcl %ecx, %eax # encoding: [0x11,0xc8]
 ; X86-NEXT:    vpcmpnltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x05]
 ; X86-NEXT:    kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20]
 ; X86-NEXT:    kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca]
 ; X86-NEXT:    kmovd %k0, %edi # encoding: [0xc5,0xfb,0x93,0xf8]
 ; X86-NEXT:    addl %esi, %edi # encoding: [0x01,0xf7]
-; X86-NEXT:    adcl %edx, %ecx # encoding: [0x11,0xd1]
+; X86-NEXT:    adcl %eax, %ecx # encoding: [0x11,0xc1]
 ; X86-NEXT:    vpcmpnleub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x06]
 ; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
 ; X86-NEXT:    kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1]

diff  --git a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
index 714dcd7de929d..f9b50b0e2b298 100644
--- a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
@@ -4810,9 +4810,9 @@ define <8 x i32> @test_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
 ; X86-NEXT:    vpcmpeqb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1]
 ; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
 ; X86-NEXT:    vpcmpgtb %ymm0, %ymm1, %k0 # encoding: [0x62,0xf1,0x75,0x28,0x64,0xc0]
-; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; X86-NEXT:    vpcmpleb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x02]
 ; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
+; X86-NEXT:    vpcmpleb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x02]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
 ; X86-NEXT:    vpcmpneqb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x04]
 ; X86-NEXT:    kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
 ; X86-NEXT:    vpcmpnltb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x05]
@@ -4825,11 +4825,11 @@ define <8 x i32> @test_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
 ; X86-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9]
 ; X86-NEXT:    vpblendd $8, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x08]
 ; X86-NEXT:    # xmm0 = xmm0[0,1,2],xmm1[3]
-; X86-NEXT:    vmovd %ecx, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc9]
+; X86-NEXT:    vmovd %edx, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xca]
 ; X86-NEXT:    vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0]
 ; X86-NEXT:    vpunpckldq %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x62,0xc9]
 ; X86-NEXT:    # xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-NEXT:    vmovd %edx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd2]
+; X86-NEXT:    vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1]
 ; X86-NEXT:    vpunpcklqdq %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x6c,0xca]
 ; X86-NEXT:    # xmm1 = xmm1[0],xmm2[0]
 ; X86-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01]
@@ -5004,9 +5004,9 @@ define <8 x i32> @test_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
 ; X86-NEXT:    vpcmpeqb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1]
 ; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
 ; X86-NEXT:    vpcmpltub %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x01]
-; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; X86-NEXT:    vpcmpleub %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x02]
 ; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
+; X86-NEXT:    vpcmpleub %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x02]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
 ; X86-NEXT:    vpcmpneqb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x04]
 ; X86-NEXT:    kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
 ; X86-NEXT:    vpcmpnltub %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x05]
@@ -5019,11 +5019,11 @@ define <8 x i32> @test_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
 ; X86-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9]
 ; X86-NEXT:    vpblendd $8, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x08]
 ; X86-NEXT:    # xmm0 = xmm0[0,1,2],xmm1[3]
-; X86-NEXT:    vmovd %ecx, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc9]
+; X86-NEXT:    vmovd %edx, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xca]
 ; X86-NEXT:    vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0]
 ; X86-NEXT:    vpunpckldq %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x62,0xc9]
 ; X86-NEXT:    # xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-NEXT:    vmovd %edx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd2]
+; X86-NEXT:    vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1]
 ; X86-NEXT:    vpunpcklqdq %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x6c,0xca]
 ; X86-NEXT:    # xmm1 = xmm1[0],xmm2[0]
 ; X86-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01]

diff  --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll
index cfdbbce7f1f56..33f03b99de1ed 100644
--- a/llvm/test/CodeGen/X86/bitreverse.ll
+++ b/llvm/test/CodeGen/X86/bitreverse.ll
@@ -653,8 +653,7 @@ define i528 @large_promotion(i528 %A) nounwind {
 ; X86-NEXT:    andl $1431655765, %ebx # imm = 0x55555555
 ; X86-NEXT:    shrl %edi
 ; X86-NEXT:    andl $1431655765, %edi # imm = 0x55555555
-; X86-NEXT:    leal (%edi,%ebx,2), %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    leal (%edi,%ebx,2), %ebx
 ; X86-NEXT:    bswapl %esi
 ; X86-NEXT:    movl %esi, %edi
 ; X86-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
@@ -671,7 +670,8 @@ define i528 @large_promotion(i528 %A) nounwind {
 ; X86-NEXT:    andl $1431655765, %edi # imm = 0x55555555
 ; X86-NEXT:    shrl %esi
 ; X86-NEXT:    andl $1431655765, %esi # imm = 0x55555555
-; X86-NEXT:    leal (%esi,%edi,2), %ebx
+; X86-NEXT:    leal (%esi,%edi,2), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    bswapl %edx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
@@ -934,13 +934,13 @@ define i528 @large_promotion(i528 %A) nounwind {
 ; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
 ; X86-NEXT:    leal (%eax,%ecx,2), %edx
 ; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X86-NEXT:    shrdl $16, %ebx, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shrdl $16, %eax, %esi
-; X86-NEXT:    shrdl $16, %ebx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shrdl $16, %ecx, %ebx
+; X86-NEXT:    shrdl $16, %eax, %ebx
 ; X86-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shrdl $16, %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    shrdl $16, %eax, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -998,9 +998,9 @@ define i528 @large_promotion(i528 %A) nounwind {
 ; X86-NEXT:    movl %ecx, 16(%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 8(%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X86-NEXT:    movl %ecx, 4(%eax)
 ; X86-NEXT:    movl %esi, (%eax)
 ; X86-NEXT:    shrl $16, %edx

diff  --git a/llvm/test/CodeGen/X86/bool-vector.ll b/llvm/test/CodeGen/X86/bool-vector.ll
index 8a1fc46e3a5c5..abac07032d83d 100644
--- a/llvm/test/CodeGen/X86/bool-vector.ll
+++ b/llvm/test/CodeGen/X86/bool-vector.ll
@@ -10,18 +10,18 @@ define i32 @PR15215_bad(<4 x i32> %input) {
 ; X86-LABEL: PR15215_bad:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %ah
 ; X86-NEXT:    addb %ah, %ah
-; X86-NEXT:    andb $1, %dl
-; X86-NEXT:    orb %ah, %dl
-; X86-NEXT:    shlb $2, %dl
-; X86-NEXT:    addb %cl, %cl
+; X86-NEXT:    andb $1, %cl
+; X86-NEXT:    orb %ah, %cl
+; X86-NEXT:    shlb $2, %cl
+; X86-NEXT:    addb %dl, %dl
 ; X86-NEXT:    andb $1, %al
-; X86-NEXT:    orb %cl, %al
-; X86-NEXT:    andb $3, %al
 ; X86-NEXT:    orb %dl, %al
+; X86-NEXT:    andb $3, %al
+; X86-NEXT:    orb %cl, %al
 ; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    andl $15, %eax
 ; X86-NEXT:    retl

diff  --git a/llvm/test/CodeGen/X86/bswap.ll b/llvm/test/CodeGen/X86/bswap.ll
index ad07122b5b7b1..6af70c614c8e7 100644
--- a/llvm/test/CodeGen/X86/bswap.ll
+++ b/llvm/test/CodeGen/X86/bswap.ll
@@ -277,14 +277,10 @@ define i528 @large_promotion(i528 %A) nounwind {
 ; CHECK-NEXT:    bswapl %ebp
 ; CHECK-NEXT:    shrdl $16, %ebp, %ebx
 ; CHECK-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    bswapl %ecx
-; CHECK-NEXT:    shrdl $16, %ecx, %ebp
-; CHECK-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    bswapl %eax
-; CHECK-NEXT:    shrdl $16, %eax, %ecx
-; CHECK-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    shrdl $16, %eax, %ebp
+; CHECK-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    bswapl %ecx
 ; CHECK-NEXT:    shrdl $16, %ecx, %eax
@@ -293,10 +289,14 @@ define i528 @large_promotion(i528 %A) nounwind {
 ; CHECK-NEXT:    bswapl %eax
 ; CHECK-NEXT:    shrdl $16, %eax, %ecx
 ; CHECK-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    bswapl %ecx
+; CHECK-NEXT:    shrdl $16, %ecx, %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; CHECK-NEXT:    bswapl %ebp
-; CHECK-NEXT:    shrdl $16, %ebp, %eax
-; CHECK-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; CHECK-NEXT:    shrdl $16, %ebp, %ecx
+; CHECK-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; CHECK-NEXT:    bswapl %ebx
 ; CHECK-NEXT:    shrdl $16, %ebx, %ebp

diff  --git a/llvm/test/CodeGen/X86/build-vector-128.ll b/llvm/test/CodeGen/X86/build-vector-128.ll
index 84ebabc927c16..7b03005f08056 100644
--- a/llvm/test/CodeGen/X86/build-vector-128.ll
+++ b/llvm/test/CodeGen/X86/build-vector-128.ll
@@ -252,29 +252,29 @@ define <16 x i8> @test_buildvector_v16i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
 ; SSE2-32-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
 ; SSE2-32-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; SSE2-32-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-32-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-32-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-32-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE2-32-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
 ; SSE2-32-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-32-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; SSE2-32-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
 ; SSE2-32-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE2-32-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSE2-32-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-32-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE2-32-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; SSE2-32-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE2-32-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; SSE2-32-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-32-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; SSE2-32-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-32-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-32-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-32-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-32-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-32-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-32-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-32-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-32-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
 ; SSE2-32-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
 ; SSE2-32-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE2-32-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE2-32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-32-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-32-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; SSE2-32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-32-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-32-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE2-32-NEXT:    retl
 ;
 ; SSE2-64-LABEL: test_buildvector_v16i8:

diff  --git a/llvm/test/CodeGen/X86/clear-highbits.ll b/llvm/test/CodeGen/X86/clear-highbits.ll
index b382054bfff3a..358db2754f091 100644
--- a/llvm/test/CodeGen/X86/clear-highbits.ll
+++ b/llvm/test/CodeGen/X86/clear-highbits.ll
@@ -1128,35 +1128,32 @@ define i64 @oneusei64_d(i64 %val, i64 %numhighbits, i64* %escape) nounwind {
 ;
 ; X86-BMI1-LABEL: oneusei64_d:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %ebp
 ; X86-BMI1-NEXT:    pushl %ebx
 ; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
 ; X86-BMI1-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI1-NEXT:    movl %eax, %edi
-; X86-BMI1-NEXT:    shll %cl, %edi
-; X86-BMI1-NEXT:    shldl %cl, %eax, %esi
+; X86-BMI1-NEXT:    movl %edx, %eax
+; X86-BMI1-NEXT:    shll %cl, %eax
+; X86-BMI1-NEXT:    shldl %cl, %edx, %esi
 ; X86-BMI1-NEXT:    testb $32, %cl
-; X86-BMI1-NEXT:    cmovnel %edi, %esi
-; X86-BMI1-NEXT:    movl %esi, %ebx
-; X86-BMI1-NEXT:    shrl %cl, %ebx
+; X86-BMI1-NEXT:    cmovnel %eax, %esi
+; X86-BMI1-NEXT:    movl %esi, %edi
+; X86-BMI1-NEXT:    shrl %cl, %edi
 ; X86-BMI1-NEXT:    xorl %edx, %edx
 ; X86-BMI1-NEXT:    testb $32, %cl
-; X86-BMI1-NEXT:    cmovnel %edx, %edi
-; X86-BMI1-NEXT:    cmovel %ebx, %edx
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-BMI1-NEXT:    movl %edi, (%ebp)
-; X86-BMI1-NEXT:    movl %edi, %eax
+; X86-BMI1-NEXT:    cmovnel %edx, %eax
+; X86-BMI1-NEXT:    cmovel %edi, %edx
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI1-NEXT:    movl %eax, (%ebx)
 ; X86-BMI1-NEXT:    shrdl %cl, %esi, %eax
 ; X86-BMI1-NEXT:    testb $32, %cl
-; X86-BMI1-NEXT:    movl %esi, 4(%ebp)
-; X86-BMI1-NEXT:    cmovnel %ebx, %eax
+; X86-BMI1-NEXT:    movl %esi, 4(%ebx)
+; X86-BMI1-NEXT:    cmovnel %edi, %eax
 ; X86-BMI1-NEXT:    popl %esi
 ; X86-BMI1-NEXT:    popl %edi
 ; X86-BMI1-NEXT:    popl %ebx
-; X86-BMI1-NEXT:    popl %ebp
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: oneusei64_d:

diff  --git a/llvm/test/CodeGen/X86/combine-sbb.ll b/llvm/test/CodeGen/X86/combine-sbb.ll
index efb69d8924b9c..2811b7f86e994 100644
--- a/llvm/test/CodeGen/X86/combine-sbb.ll
+++ b/llvm/test/CodeGen/X86/combine-sbb.ll
@@ -63,23 +63,23 @@ define void @PR25858_i64(%WideUInt64* sret(%WideUInt64), %WideUInt64*, %WideUInt
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl (%edx), %esi
-; X86-NEXT:    movl 4(%edx), %edi
-; X86-NEXT:    subl (%ecx), %esi
-; X86-NEXT:    sbbl 4(%ecx), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl (%edi), %ecx
+; X86-NEXT:    movl 4(%edi), %edx
+; X86-NEXT:    subl (%esi), %ecx
+; X86-NEXT:    sbbl 4(%esi), %edx
 ; X86-NEXT:    setb %bl
-; X86-NEXT:    movl 12(%edx), %ebp
-; X86-NEXT:    movl 8(%edx), %edx
-; X86-NEXT:    subl 8(%ecx), %edx
-; X86-NEXT:    sbbl 12(%ecx), %ebp
-; X86-NEXT:    movzbl %bl, %ecx
-; X86-NEXT:    subl %ecx, %edx
+; X86-NEXT:    movl 12(%edi), %ebp
+; X86-NEXT:    movl 8(%edi), %edi
+; X86-NEXT:    subl 8(%esi), %edi
+; X86-NEXT:    sbbl 12(%esi), %ebp
+; X86-NEXT:    movzbl %bl, %esi
+; X86-NEXT:    subl %esi, %edi
 ; X86-NEXT:    sbbl $0, %ebp
-; X86-NEXT:    movl %esi, (%eax)
-; X86-NEXT:    movl %edi, 4(%eax)
-; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl %edi, 8(%eax)
 ; X86-NEXT:    movl %ebp, 12(%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi

diff  --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index 1a8962f91b887..ef83f268e0d6f 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -124,10 +124,10 @@ define i64 @scalar_i64(i64 %x, i64 %y, i64* %divdst) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    calll __divdi3
@@ -136,10 +136,10 @@ define i64 @scalar_i64(i64 %x, i64 %y, i64* %divdst) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %ecx, 4(%edx)
 ; X86-NEXT:    movl %eax, (%edx)
-; X86-NEXT:    imull %eax, %ebx
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %ebx, %edx
-; X86-NEXT:    imull %ebp, %ecx
+; X86-NEXT:    imull %eax, %ebp
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    addl %ebp, %edx
+; X86-NEXT:    imull %ebx, %ecx
 ; X86-NEXT:    addl %edx, %ecx
 ; X86-NEXT:    subl %eax, %esi
 ; X86-NEXT:    sbbl %ecx, %edi
@@ -178,15 +178,13 @@ define i128 @scalar_i128(i128 %x, i128 %y, i128* %divdst) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $48, %esp
+; X86-NEXT:    subl $40, %esp
 ; X86-NEXT:    movl 44(%ebp), %edi
-; X86-NEXT:    movl 28(%ebp), %ecx
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pushl 40(%ebp)
 ; X86-NEXT:    pushl 36(%ebp)
 ; X86-NEXT:    pushl 32(%ebp)
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    pushl 28(%ebp)
 ; X86-NEXT:    pushl 24(%ebp)
 ; X86-NEXT:    pushl 20(%ebp)
 ; X86-NEXT:    pushl 16(%ebp)
@@ -194,18 +192,18 @@ define i128 @scalar_i128(i128 %x, i128 %y, i128* %divdst) nounwind {
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll __divti3
 ; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %edi, %edx
 ; X86-NEXT:    movl %ecx, 12(%edi)
 ; X86-NEXT:    movl %esi, 8(%edi)
 ; X86-NEXT:    movl %eax, 4(%edi)
-; X86-NEXT:    movl %edx, (%edi)
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    imull %ebx, %ecx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl %ebx, (%edx)
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    imull %eax, %ecx
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %ecx, %edx
@@ -213,40 +211,38 @@ define i128 @scalar_i128(i128 %x, i128 %y, i128* %divdst) nounwind {
 ; X86-NEXT:    addl %edx, %esi
 ; X86-NEXT:    movl 36(%ebp), %eax
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    imull %ebx, %ecx
-; X86-NEXT:    mull %edi
+; X86-NEXT:    imull %edi, %ecx
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl 40(%ebp), %edi
-; X86-NEXT:    imull %eax, %edi
-; X86-NEXT:    addl %edx, %edi
+; X86-NEXT:    movl 40(%ebp), %eax
+; X86-NEXT:    imull %ebx, %eax
+; X86-NEXT:    addl %edx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl %esi, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    movl 28(%ebp), %ecx
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl 28(%ebp), %ecx
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl (%esp), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull 32(%ebp)
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    adcl %esi, %edi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    adcl %ecx, %ebx
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull 32(%ebp)
-; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movzbl %cl, %ecx
 ; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
@@ -254,7 +250,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, i128* %divdst) nounwind {
 ; X86-NEXT:    movl 12(%ebp), %ecx
 ; X86-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl 16(%ebp), %esi
-; X86-NEXT:    sbbl %ebx, %esi
+; X86-NEXT:    sbbl (%esp), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl 20(%ebp), %edi
 ; X86-NEXT:    sbbl %eax, %edi
 ; X86-NEXT:    movl 24(%ebp), %ebx
@@ -386,35 +382,35 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, <16 x i8>* %divdst)
 ; X86-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
 ; X86-NEXT:    movd %edx, %xmm4
 ; X86-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; X86-NEXT:    movd %esi, %xmm2
+; X86-NEXT:    movd %esi, %xmm7
 ; X86-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
-; X86-NEXT:    movd %edi, %xmm5
-; X86-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; X86-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
+; X86-NEXT:    movd %edi, %xmm2
+; X86-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
+; X86-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
 ; X86-NEXT:    movd %ebx, %xmm4
 ; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movd %ecx, %xmm6
+; X86-NEXT:    movd %ecx, %xmm5
 ; X86-NEXT:    movl 8(%ebp), %ecx
-; X86-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; X86-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
 ; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    movd %eax, %xmm5
-; X86-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
-; X86-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; X86-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
-; X86-NEXT:    movdqa %xmm5, %xmm2
-; X86-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; X86-NEXT:    movdqa %xmm2, (%ecx)
+; X86-NEXT:    movd %eax, %xmm2
+; X86-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
+; X86-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; X86-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1]
+; X86-NEXT:    movdqa %xmm2, %xmm4
+; X86-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
+; X86-NEXT:    movdqa %xmm4, (%ecx)
 ; X86-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-NEXT:    movdqa %xmm1, %xmm2
-; X86-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; X86-NEXT:    pmullw %xmm3, %xmm2
+; X86-NEXT:    movdqa %xmm1, %xmm4
+; X86-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; X86-NEXT:    pmullw %xmm3, %xmm4
 ; X86-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; X86-NEXT:    pand %xmm3, %xmm2
-; X86-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-NEXT:    pand %xmm3, %xmm4
+; X86-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X86-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-NEXT:    pmullw %xmm5, %xmm1
+; X86-NEXT:    pmullw %xmm2, %xmm1
 ; X86-NEXT:    pand %xmm3, %xmm1
-; X86-NEXT:    packuswb %xmm2, %xmm1
+; X86-NEXT:    packuswb %xmm4, %xmm1
 ; X86-NEXT:    psubb %xmm1, %xmm0
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
@@ -585,22 +581,22 @@ define <8 x i16> @vector_i128_i16(<8 x i16> %x, <8 x i16> %y, <8 x i16>* %divdst
 ; X86-NEXT:    cwtd
 ; X86-NEXT:    idivw %si
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
-; X86-NEXT:    movd %eax, %xmm3
+; X86-NEXT:    movd %eax, %xmm4
 ; X86-NEXT:    pextrw $2, %xmm0, %eax
 ; X86-NEXT:    pextrw $2, %xmm1, %esi
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    cwtd
 ; X86-NEXT:    idivw %si
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
-; X86-NEXT:    movd %eax, %xmm4
-; X86-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; X86-NEXT:    movd %eax, %xmm3
+; X86-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
 ; X86-NEXT:    pextrw $1, %xmm0, %eax
 ; X86-NEXT:    pextrw $1, %xmm1, %esi
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    cwtd
 ; X86-NEXT:    idivw %si
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
-; X86-NEXT:    movd %eax, %xmm3
+; X86-NEXT:    movd %eax, %xmm4
 ; X86-NEXT:    movd %xmm0, %eax
 ; X86-NEXT:    movd %xmm1, %esi
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -608,8 +604,8 @@ define <8 x i16> @vector_i128_i16(<8 x i16> %x, <8 x i16> %y, <8 x i16>* %divdst
 ; X86-NEXT:    idivw %si
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    movd %eax, %xmm5
-; X86-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; X86-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; X86-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; X86-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
 ; X86-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0]
 ; X86-NEXT:    movdqa %xmm5, (%ecx)
 ; X86-NEXT:    pmullw %xmm1, %xmm5
@@ -704,20 +700,20 @@ define <4 x i32> @vector_i128_i32(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %divdst
 ; X86-NEXT:    movd %xmm2, %esi
 ; X86-NEXT:    cltd
 ; X86-NEXT:    idivl %esi
-; X86-NEXT:    movd %eax, %xmm2
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT:    movd %xmm3, %eax
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X86-NEXT:    movd %xmm3, %esi
+; X86-NEXT:    movd %eax, %xmm3
+; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X86-NEXT:    movd %xmm2, %eax
+; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X86-NEXT:    movd %xmm2, %esi
 ; X86-NEXT:    cltd
 ; X86-NEXT:    idivl %esi
-; X86-NEXT:    movd %eax, %xmm3
-; X86-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; X86-NEXT:    movd %eax, %xmm2
+; X86-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; X86-NEXT:    movd %xmm0, %eax
 ; X86-NEXT:    movd %xmm1, %esi
 ; X86-NEXT:    cltd
 ; X86-NEXT:    idivl %esi
-; X86-NEXT:    movd %eax, %xmm2
+; X86-NEXT:    movd %eax, %xmm3
 ; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
 ; X86-NEXT:    movd %xmm4, %eax
 ; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
@@ -725,17 +721,17 @@ define <4 x i32> @vector_i128_i32(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %divdst
 ; X86-NEXT:    cltd
 ; X86-NEXT:    idivl %esi
 ; X86-NEXT:    movd %eax, %xmm4
-; X86-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; X86-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; X86-NEXT:    movdqa %xmm2, (%ecx)
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; X86-NEXT:    pmuludq %xmm1, %xmm2
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X86-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; X86-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; X86-NEXT:    movdqa %xmm3, (%ecx)
+; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-NEXT:    pmuludq %xmm1, %xmm3
+; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
 ; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X86-NEXT:    pmuludq %xmm3, %xmm1
+; X86-NEXT:    pmuludq %xmm2, %xmm1
 ; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X86-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-NEXT:    psubd %xmm2, %xmm0
+; X86-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; X86-NEXT:    psubd %xmm3, %xmm0
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
@@ -817,25 +813,25 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, <2 x i64>* %divdst
 ; X86-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; X86-NEXT:    movdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X86-NEXT:    calll __divdi3
-; X86-NEXT:    movd %edx, %xmm0
-; X86-NEXT:    movd %eax, %xmm1
-; X86-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X86-NEXT:    movd %edx, %xmm1
+; X86-NEXT:    movd %eax, %xmm3
+; X86-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
 ; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X86-NEXT:    movdqa %xmm1, (%esi)
-; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm3 # 16-byte Reload
-; X86-NEXT:    movdqa %xmm3, %xmm0
-; X86-NEXT:    psrlq $32, %xmm0
-; X86-NEXT:    pmuludq %xmm1, %xmm0
-; X86-NEXT:    movdqa %xmm1, %xmm2
+; X86-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; X86-NEXT:    movdqa %xmm3, (%esi)
+; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    movdqa %xmm0, %xmm1
+; X86-NEXT:    psrlq $32, %xmm1
+; X86-NEXT:    pmuludq %xmm3, %xmm1
+; X86-NEXT:    movdqa %xmm3, %xmm2
 ; X86-NEXT:    psrlq $32, %xmm2
-; X86-NEXT:    pmuludq %xmm3, %xmm2
-; X86-NEXT:    paddq %xmm0, %xmm2
+; X86-NEXT:    pmuludq %xmm0, %xmm2
+; X86-NEXT:    paddq %xmm1, %xmm2
 ; X86-NEXT:    psllq $32, %xmm2
-; X86-NEXT:    pmuludq %xmm3, %xmm1
-; X86-NEXT:    paddq %xmm2, %xmm1
+; X86-NEXT:    pmuludq %xmm0, %xmm3
+; X86-NEXT:    paddq %xmm2, %xmm3
 ; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT:    psubq %xmm1, %xmm0
+; X86-NEXT:    psubq %xmm3, %xmm0
 ; X86-NEXT:    addl $64, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl

diff  --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
index 42d7965a1516c..e42a95115eca7 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
@@ -124,10 +124,10 @@ define i64 @scalar_i64(i64 %x, i64 %y, i64* %divdst) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    calll __udivdi3
@@ -136,10 +136,10 @@ define i64 @scalar_i64(i64 %x, i64 %y, i64* %divdst) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %ecx, 4(%edx)
 ; X86-NEXT:    movl %eax, (%edx)
-; X86-NEXT:    imull %eax, %ebx
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %ebx, %edx
-; X86-NEXT:    imull %ebp, %ecx
+; X86-NEXT:    imull %eax, %ebp
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    addl %ebp, %edx
+; X86-NEXT:    imull %ebx, %ecx
 ; X86-NEXT:    addl %edx, %ecx
 ; X86-NEXT:    subl %eax, %esi
 ; X86-NEXT:    sbbl %ecx, %edi
@@ -178,15 +178,13 @@ define i128 @scalar_i128(i128 %x, i128 %y, i128* %divdst) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $48, %esp
+; X86-NEXT:    subl $40, %esp
 ; X86-NEXT:    movl 44(%ebp), %edi
-; X86-NEXT:    movl 28(%ebp), %ecx
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pushl 40(%ebp)
 ; X86-NEXT:    pushl 36(%ebp)
 ; X86-NEXT:    pushl 32(%ebp)
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    pushl 28(%ebp)
 ; X86-NEXT:    pushl 24(%ebp)
 ; X86-NEXT:    pushl 20(%ebp)
 ; X86-NEXT:    pushl 16(%ebp)
@@ -194,18 +192,18 @@ define i128 @scalar_i128(i128 %x, i128 %y, i128* %divdst) nounwind {
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll __udivti3
 ; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %edi, %edx
 ; X86-NEXT:    movl %ecx, 12(%edi)
 ; X86-NEXT:    movl %esi, 8(%edi)
 ; X86-NEXT:    movl %eax, 4(%edi)
-; X86-NEXT:    movl %edx, (%edi)
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    imull %ebx, %ecx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl %ebx, (%edx)
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    imull %eax, %ecx
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %ecx, %edx
@@ -213,40 +211,38 @@ define i128 @scalar_i128(i128 %x, i128 %y, i128* %divdst) nounwind {
 ; X86-NEXT:    addl %edx, %esi
 ; X86-NEXT:    movl 36(%ebp), %eax
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    imull %ebx, %ecx
-; X86-NEXT:    mull %edi
+; X86-NEXT:    imull %edi, %ecx
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl 40(%ebp), %edi
-; X86-NEXT:    imull %eax, %edi
-; X86-NEXT:    addl %edx, %edi
+; X86-NEXT:    movl 40(%ebp), %eax
+; X86-NEXT:    imull %ebx, %eax
+; X86-NEXT:    addl %edx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl %esi, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    movl 28(%ebp), %ecx
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl 28(%ebp), %ecx
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl (%esp), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull 32(%ebp)
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    adcl %esi, %edi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    adcl %ecx, %ebx
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull 32(%ebp)
-; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movzbl %cl, %ecx
 ; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
@@ -254,7 +250,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, i128* %divdst) nounwind {
 ; X86-NEXT:    movl 12(%ebp), %ecx
 ; X86-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl 16(%ebp), %esi
-; X86-NEXT:    sbbl %ebx, %esi
+; X86-NEXT:    sbbl (%esp), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl 20(%ebp), %edi
 ; X86-NEXT:    sbbl %eax, %edi
 ; X86-NEXT:    movl 24(%ebp), %ebx
@@ -386,35 +382,35 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, <16 x i8>* %divdst)
 ; X86-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
 ; X86-NEXT:    movd %edx, %xmm4
 ; X86-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; X86-NEXT:    movd %esi, %xmm2
+; X86-NEXT:    movd %esi, %xmm7
 ; X86-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
-; X86-NEXT:    movd %edi, %xmm5
-; X86-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; X86-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
+; X86-NEXT:    movd %edi, %xmm2
+; X86-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
+; X86-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
 ; X86-NEXT:    movd %ebx, %xmm4
 ; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movd %ecx, %xmm6
+; X86-NEXT:    movd %ecx, %xmm5
 ; X86-NEXT:    movl 8(%ebp), %ecx
-; X86-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; X86-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
 ; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    movd %eax, %xmm5
-; X86-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
-; X86-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; X86-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
-; X86-NEXT:    movdqa %xmm5, %xmm2
-; X86-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; X86-NEXT:    movdqa %xmm2, (%ecx)
+; X86-NEXT:    movd %eax, %xmm2
+; X86-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
+; X86-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; X86-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1]
+; X86-NEXT:    movdqa %xmm2, %xmm4
+; X86-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
+; X86-NEXT:    movdqa %xmm4, (%ecx)
 ; X86-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-NEXT:    movdqa %xmm1, %xmm2
-; X86-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; X86-NEXT:    pmullw %xmm3, %xmm2
+; X86-NEXT:    movdqa %xmm1, %xmm4
+; X86-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; X86-NEXT:    pmullw %xmm3, %xmm4
 ; X86-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; X86-NEXT:    pand %xmm3, %xmm2
-; X86-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-NEXT:    pand %xmm3, %xmm4
+; X86-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X86-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-NEXT:    pmullw %xmm5, %xmm1
+; X86-NEXT:    pmullw %xmm2, %xmm1
 ; X86-NEXT:    pand %xmm3, %xmm1
-; X86-NEXT:    packuswb %xmm2, %xmm1
+; X86-NEXT:    packuswb %xmm4, %xmm1
 ; X86-NEXT:    psubb %xmm1, %xmm0
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
@@ -585,22 +581,22 @@ define <8 x i16> @vector_i128_i16(<8 x i16> %x, <8 x i16> %y, <8 x i16>* %divdst
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    divw %si
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
-; X86-NEXT:    movd %eax, %xmm3
+; X86-NEXT:    movd %eax, %xmm4
 ; X86-NEXT:    pextrw $2, %xmm0, %eax
 ; X86-NEXT:    pextrw $2, %xmm1, %esi
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    divw %si
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
-; X86-NEXT:    movd %eax, %xmm4
-; X86-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; X86-NEXT:    movd %eax, %xmm3
+; X86-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
 ; X86-NEXT:    pextrw $1, %xmm0, %eax
 ; X86-NEXT:    pextrw $1, %xmm1, %esi
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    divw %si
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
-; X86-NEXT:    movd %eax, %xmm3
+; X86-NEXT:    movd %eax, %xmm4
 ; X86-NEXT:    movd %xmm0, %eax
 ; X86-NEXT:    movd %xmm1, %esi
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -608,8 +604,8 @@ define <8 x i16> @vector_i128_i16(<8 x i16> %x, <8 x i16> %y, <8 x i16>* %divdst
 ; X86-NEXT:    divw %si
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    movd %eax, %xmm5
-; X86-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; X86-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; X86-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; X86-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
 ; X86-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0]
 ; X86-NEXT:    movdqa %xmm5, (%ecx)
 ; X86-NEXT:    pmullw %xmm1, %xmm5
@@ -704,20 +700,20 @@ define <4 x i32> @vector_i128_i32(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %divdst
 ; X86-NEXT:    movd %xmm2, %esi
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    divl %esi
-; X86-NEXT:    movd %eax, %xmm2
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT:    movd %xmm3, %eax
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X86-NEXT:    movd %xmm3, %esi
+; X86-NEXT:    movd %eax, %xmm3
+; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X86-NEXT:    movd %xmm2, %eax
+; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X86-NEXT:    movd %xmm2, %esi
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    divl %esi
-; X86-NEXT:    movd %eax, %xmm3
-; X86-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; X86-NEXT:    movd %eax, %xmm2
+; X86-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; X86-NEXT:    movd %xmm0, %eax
 ; X86-NEXT:    movd %xmm1, %esi
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    divl %esi
-; X86-NEXT:    movd %eax, %xmm2
+; X86-NEXT:    movd %eax, %xmm3
 ; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
 ; X86-NEXT:    movd %xmm4, %eax
 ; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
@@ -725,17 +721,17 @@ define <4 x i32> @vector_i128_i32(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %divdst
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    divl %esi
 ; X86-NEXT:    movd %eax, %xmm4
-; X86-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; X86-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; X86-NEXT:    movdqa %xmm2, (%ecx)
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; X86-NEXT:    pmuludq %xmm1, %xmm2
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X86-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; X86-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; X86-NEXT:    movdqa %xmm3, (%ecx)
+; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-NEXT:    pmuludq %xmm1, %xmm3
+; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
 ; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X86-NEXT:    pmuludq %xmm3, %xmm1
+; X86-NEXT:    pmuludq %xmm2, %xmm1
 ; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X86-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-NEXT:    psubd %xmm2, %xmm0
+; X86-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; X86-NEXT:    psubd %xmm3, %xmm0
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
@@ -817,25 +813,25 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, <2 x i64>* %divdst
 ; X86-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; X86-NEXT:    movdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X86-NEXT:    calll __udivdi3
-; X86-NEXT:    movd %edx, %xmm0
-; X86-NEXT:    movd %eax, %xmm1
-; X86-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X86-NEXT:    movd %edx, %xmm1
+; X86-NEXT:    movd %eax, %xmm3
+; X86-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
 ; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X86-NEXT:    movdqa %xmm1, (%esi)
-; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm3 # 16-byte Reload
-; X86-NEXT:    movdqa %xmm3, %xmm0
-; X86-NEXT:    psrlq $32, %xmm0
-; X86-NEXT:    pmuludq %xmm1, %xmm0
-; X86-NEXT:    movdqa %xmm1, %xmm2
+; X86-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; X86-NEXT:    movdqa %xmm3, (%esi)
+; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    movdqa %xmm0, %xmm1
+; X86-NEXT:    psrlq $32, %xmm1
+; X86-NEXT:    pmuludq %xmm3, %xmm1
+; X86-NEXT:    movdqa %xmm3, %xmm2
 ; X86-NEXT:    psrlq $32, %xmm2
-; X86-NEXT:    pmuludq %xmm3, %xmm2
-; X86-NEXT:    paddq %xmm0, %xmm2
+; X86-NEXT:    pmuludq %xmm0, %xmm2
+; X86-NEXT:    paddq %xmm1, %xmm2
 ; X86-NEXT:    psllq $32, %xmm2
-; X86-NEXT:    pmuludq %xmm3, %xmm1
-; X86-NEXT:    paddq %xmm2, %xmm1
+; X86-NEXT:    pmuludq %xmm0, %xmm3
+; X86-NEXT:    paddq %xmm2, %xmm3
 ; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT:    psubq %xmm1, %xmm0
+; X86-NEXT:    psubq %xmm3, %xmm0
 ; X86-NEXT:    addl $64, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl

diff  --git a/llvm/test/CodeGen/X86/fp128-cast.ll b/llvm/test/CodeGen/X86/fp128-cast.ll
index 2ad7a0210f87c..b2c7828f6e8ea 100644
--- a/llvm/test/CodeGen/X86/fp128-cast.ll
+++ b/llvm/test/CodeGen/X86/fp128-cast.ll
@@ -1139,19 +1139,19 @@ define dso_local i32 @TestBits128(fp128 %ld) nounwind {
 ; X32-NEXT:    subl $20, %esp
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X32-NEXT:    subl $12, %esp
-; X32-NEXT:    leal {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    pushl %edi
 ; X32-NEXT:    pushl %esi
-; X32-NEXT:    pushl %edx
 ; X32-NEXT:    pushl %ecx
 ; X32-NEXT:    pushl %eax
+; X32-NEXT:    pushl %edi
 ; X32-NEXT:    pushl %esi
-; X32-NEXT:    pushl %edx
 ; X32-NEXT:    pushl %ecx
 ; X32-NEXT:    pushl %eax
-; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %edx
 ; X32-NEXT:    calll __multf3
 ; X32-NEXT:    addl $44, %esp
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx

diff  --git a/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll b/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll
index 7a8c3a78f9c32..bcfac18d6f4f4 100644
--- a/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll
+++ b/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll
@@ -971,21 +971,21 @@ define i128 @test_signed_i128_f32(float %f) nounwind {
 ; X86-SSE-NEXT:    cmovbl %ecx, %edx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-SSE-NEXT:    cmovbl %ecx, %edi
-; X86-SSE-NEXT:    movl $-2147483648, %ebx # imm = 0x80000000
-; X86-SSE-NEXT:    cmovael {{[0-9]+}}(%esp), %ebx
+; X86-SSE-NEXT:    movl $-2147483648, %ebp # imm = 0x80000000
+; X86-SSE-NEXT:    cmovael {{[0-9]+}}(%esp), %ebp
 ; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    movl $2147483647, %ebp # imm = 0x7FFFFFFF
-; X86-SSE-NEXT:    cmovbel %ebx, %ebp
-; X86-SSE-NEXT:    movl $-1, %ebx
-; X86-SSE-NEXT:    cmoval %ebx, %edi
-; X86-SSE-NEXT:    cmoval %ebx, %edx
-; X86-SSE-NEXT:    cmoval %ebx, %eax
+; X86-SSE-NEXT:    movl $2147483647, %ebx # imm = 0x7FFFFFFF
+; X86-SSE-NEXT:    cmovbel %ebp, %ebx
+; X86-SSE-NEXT:    movl $-1, %ebp
+; X86-SSE-NEXT:    cmoval %ebp, %edi
+; X86-SSE-NEXT:    cmoval %ebp, %edx
+; X86-SSE-NEXT:    cmoval %ebp, %eax
 ; X86-SSE-NEXT:    ucomiss %xmm0, %xmm0
 ; X86-SSE-NEXT:    cmovpl %ecx, %eax
 ; X86-SSE-NEXT:    cmovpl %ecx, %edx
 ; X86-SSE-NEXT:    cmovpl %ecx, %edi
-; X86-SSE-NEXT:    cmovpl %ecx, %ebp
-; X86-SSE-NEXT:    movl %ebp, 12(%esi)
+; X86-SSE-NEXT:    cmovpl %ecx, %ebx
+; X86-SSE-NEXT:    movl %ebx, 12(%esi)
 ; X86-SSE-NEXT:    movl %edi, 8(%esi)
 ; X86-SSE-NEXT:    movl %edx, 4(%esi)
 ; X86-SSE-NEXT:    movl %eax, (%esi)
@@ -1985,21 +1985,21 @@ define i128 @test_signed_i128_f64(double %f) nounwind {
 ; X86-SSE-NEXT:    cmovbl %ecx, %edx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-SSE-NEXT:    cmovbl %ecx, %edi
-; X86-SSE-NEXT:    movl $-2147483648, %ebx # imm = 0x80000000
-; X86-SSE-NEXT:    cmovael {{[0-9]+}}(%esp), %ebx
+; X86-SSE-NEXT:    movl $-2147483648, %ebp # imm = 0x80000000
+; X86-SSE-NEXT:    cmovael {{[0-9]+}}(%esp), %ebp
 ; X86-SSE-NEXT:    ucomisd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    movl $2147483647, %ebp # imm = 0x7FFFFFFF
-; X86-SSE-NEXT:    cmovbel %ebx, %ebp
-; X86-SSE-NEXT:    movl $-1, %ebx
-; X86-SSE-NEXT:    cmoval %ebx, %edi
-; X86-SSE-NEXT:    cmoval %ebx, %edx
-; X86-SSE-NEXT:    cmoval %ebx, %eax
+; X86-SSE-NEXT:    movl $2147483647, %ebx # imm = 0x7FFFFFFF
+; X86-SSE-NEXT:    cmovbel %ebp, %ebx
+; X86-SSE-NEXT:    movl $-1, %ebp
+; X86-SSE-NEXT:    cmoval %ebp, %edi
+; X86-SSE-NEXT:    cmoval %ebp, %edx
+; X86-SSE-NEXT:    cmoval %ebp, %eax
 ; X86-SSE-NEXT:    ucomisd %xmm0, %xmm0
 ; X86-SSE-NEXT:    cmovpl %ecx, %eax
 ; X86-SSE-NEXT:    cmovpl %ecx, %edx
 ; X86-SSE-NEXT:    cmovpl %ecx, %edi
-; X86-SSE-NEXT:    cmovpl %ecx, %ebp
-; X86-SSE-NEXT:    movl %ebp, 12(%esi)
+; X86-SSE-NEXT:    cmovpl %ecx, %ebx
+; X86-SSE-NEXT:    movl %ebx, 12(%esi)
 ; X86-SSE-NEXT:    movl %edi, 8(%esi)
 ; X86-SSE-NEXT:    movl %edx, 4(%esi)
 ; X86-SSE-NEXT:    movl %eax, (%esi)
@@ -3115,21 +3115,21 @@ define i128 @test_signed_i128_f16(half %f) nounwind {
 ; X86-SSE-NEXT:    cmovbl %ecx, %edx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-SSE-NEXT:    cmovbl %ecx, %edi
-; X86-SSE-NEXT:    movl $-2147483648, %ebx # imm = 0x80000000
-; X86-SSE-NEXT:    cmovael {{[0-9]+}}(%esp), %ebx
+; X86-SSE-NEXT:    movl $-2147483648, %ebp # imm = 0x80000000
+; X86-SSE-NEXT:    cmovael {{[0-9]+}}(%esp), %ebp
 ; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    movl $2147483647, %ebp # imm = 0x7FFFFFFF
-; X86-SSE-NEXT:    cmovbel %ebx, %ebp
-; X86-SSE-NEXT:    movl $-1, %ebx
-; X86-SSE-NEXT:    cmoval %ebx, %edi
-; X86-SSE-NEXT:    cmoval %ebx, %edx
-; X86-SSE-NEXT:    cmoval %ebx, %eax
+; X86-SSE-NEXT:    movl $2147483647, %ebx # imm = 0x7FFFFFFF
+; X86-SSE-NEXT:    cmovbel %ebp, %ebx
+; X86-SSE-NEXT:    movl $-1, %ebp
+; X86-SSE-NEXT:    cmoval %ebp, %edi
+; X86-SSE-NEXT:    cmoval %ebp, %edx
+; X86-SSE-NEXT:    cmoval %ebp, %eax
 ; X86-SSE-NEXT:    ucomiss %xmm0, %xmm0
 ; X86-SSE-NEXT:    cmovpl %ecx, %eax
 ; X86-SSE-NEXT:    cmovpl %ecx, %edx
 ; X86-SSE-NEXT:    cmovpl %ecx, %edi
-; X86-SSE-NEXT:    cmovpl %ecx, %ebp
-; X86-SSE-NEXT:    movl %ebp, 12(%esi)
+; X86-SSE-NEXT:    cmovpl %ecx, %ebx
+; X86-SSE-NEXT:    movl %ebx, 12(%esi)
 ; X86-SSE-NEXT:    movl %edi, 8(%esi)
 ; X86-SSE-NEXT:    movl %edx, 4(%esi)
 ; X86-SSE-NEXT:    movl %eax, (%esi)
@@ -4459,24 +4459,24 @@ define i128 @test_signed_i128_f80(x86_fp80 %f) nounwind {
 ; X86-SSE-NEXT:    cmovbl %ecx, %edx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-SSE-NEXT:    cmovbl %ecx, %edi
-; X86-SSE-NEXT:    movl $-2147483648, %ebx # imm = 0x80000000
-; X86-SSE-NEXT:    cmovael {{[0-9]+}}(%esp), %ebx
+; X86-SSE-NEXT:    movl $-2147483648, %ebp # imm = 0x80000000
+; X86-SSE-NEXT:    cmovael {{[0-9]+}}(%esp), %ebp
 ; X86-SSE-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-SSE-NEXT:    fxch %st(1)
 ; X86-SSE-NEXT:    fucomi %st(1), %st
 ; X86-SSE-NEXT:    fstp %st(1)
-; X86-SSE-NEXT:    movl $2147483647, %ebp # imm = 0x7FFFFFFF
-; X86-SSE-NEXT:    cmovbel %ebx, %ebp
-; X86-SSE-NEXT:    movl $-1, %ebx
-; X86-SSE-NEXT:    cmoval %ebx, %edi
-; X86-SSE-NEXT:    cmoval %ebx, %edx
-; X86-SSE-NEXT:    cmoval %ebx, %eax
+; X86-SSE-NEXT:    movl $2147483647, %ebx # imm = 0x7FFFFFFF
+; X86-SSE-NEXT:    cmovbel %ebp, %ebx
+; X86-SSE-NEXT:    movl $-1, %ebp
+; X86-SSE-NEXT:    cmoval %ebp, %edi
+; X86-SSE-NEXT:    cmoval %ebp, %edx
+; X86-SSE-NEXT:    cmoval %ebp, %eax
 ; X86-SSE-NEXT:    fucompi %st(0), %st
 ; X86-SSE-NEXT:    cmovpl %ecx, %eax
 ; X86-SSE-NEXT:    cmovpl %ecx, %edx
 ; X86-SSE-NEXT:    cmovpl %ecx, %edi
-; X86-SSE-NEXT:    cmovpl %ecx, %ebp
-; X86-SSE-NEXT:    movl %ebp, 12(%esi)
+; X86-SSE-NEXT:    cmovpl %ecx, %ebx
+; X86-SSE-NEXT:    movl %ebx, 12(%esi)
 ; X86-SSE-NEXT:    movl %edi, 8(%esi)
 ; X86-SSE-NEXT:    movl %edx, 4(%esi)
 ; X86-SSE-NEXT:    movl %eax, (%esi)

diff --git a/llvm/test/CodeGen/X86/fshr.ll b/llvm/test/CodeGen/X86/fshr.ll
index 243c02328ead0..ccc2737588a4c 100644
--- a/llvm/test/CodeGen/X86/fshr.ll
+++ b/llvm/test/CodeGen/X86/fshr.ll
@@ -865,8 +865,8 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind {
 ;
 ; X64-SLOW-LABEL: var_shift_i128:
 ; X64-SLOW:       # %bb.0:
-; X64-SLOW-NEXT:    movq %rcx, %r10
-; X64-SLOW-NEXT:    movq %rdx, %r9
+; X64-SLOW-NEXT:    movq %rcx, %r9
+; X64-SLOW-NEXT:    movq %rdx, %r10
 ; X64-SLOW-NEXT:    movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
 ; X64-SLOW-NEXT:    andq %rdi, %rax
 ; X64-SLOW-NEXT:    movl %r8d, %ecx
@@ -880,17 +880,17 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind {
 ; X64-SLOW-NEXT:    shlq %cl, %rdx
 ; X64-SLOW-NEXT:    orq %rax, %rdx
 ; X64-SLOW-NEXT:    movl %r8d, %ecx
-; X64-SLOW-NEXT:    shrq %cl, %r9
-; X64-SLOW-NEXT:    leaq (%r10,%r10), %rsi
+; X64-SLOW-NEXT:    shrq %cl, %r10
+; X64-SLOW-NEXT:    leaq (%r9,%r9), %rsi
 ; X64-SLOW-NEXT:    movl %r11d, %ecx
 ; X64-SLOW-NEXT:    shlq %cl, %rsi
-; X64-SLOW-NEXT:    orq %r9, %rsi
+; X64-SLOW-NEXT:    orq %r10, %rsi
 ; X64-SLOW-NEXT:    movl %r8d, %ecx
-; X64-SLOW-NEXT:    shrq %cl, %r10
+; X64-SLOW-NEXT:    shrq %cl, %r9
 ; X64-SLOW-NEXT:    xorl %eax, %eax
 ; X64-SLOW-NEXT:    testb $64, %r8b
-; X64-SLOW-NEXT:    cmovneq %r10, %rsi
-; X64-SLOW-NEXT:    cmovneq %rax, %r10
+; X64-SLOW-NEXT:    cmovneq %r9, %rsi
+; X64-SLOW-NEXT:    cmovneq %rax, %r9
 ; X64-SLOW-NEXT:    addq %rdi, %rdi
 ; X64-SLOW-NEXT:    movl %r11d, %ecx
 ; X64-SLOW-NEXT:    shlq %cl, %rdi
@@ -898,7 +898,7 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind {
 ; X64-SLOW-NEXT:    cmovneq %rdi, %rdx
 ; X64-SLOW-NEXT:    cmoveq %rdi, %rax
 ; X64-SLOW-NEXT:    orq %rsi, %rax
-; X64-SLOW-NEXT:    orq %r10, %rdx
+; X64-SLOW-NEXT:    orq %r9, %rdx
 ; X64-SLOW-NEXT:    retq
   %tmp = tail call i128 @llvm.fshr.i128(i128 %x, i128 %y, i128 %z)
   ret i128 %tmp

diff --git a/llvm/test/CodeGen/X86/funnel-shift-rot.ll b/llvm/test/CodeGen/X86/funnel-shift-rot.ll
index a06037fc927b3..aaefb082cc8ca 100644
--- a/llvm/test/CodeGen/X86/funnel-shift-rot.ll
+++ b/llvm/test/CodeGen/X86/funnel-shift-rot.ll
@@ -281,25 +281,25 @@ define i64 @rotr_i64(i64 %x, i64 %z) nounwind {
 ; X32-SSE2-NEXT:    pushl %edi
 ; X32-SSE2-NEXT:    pushl %esi
 ; X32-SSE2-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-SSE2-NEXT:    movl %edx, %edi
-; X32-SSE2-NEXT:    shrl %cl, %edi
-; X32-SSE2-NEXT:    movl %esi, %ebx
-; X32-SSE2-NEXT:    shrdl %cl, %edx, %ebx
+; X32-SSE2-NEXT:    movl %edx, %esi
+; X32-SSE2-NEXT:    shrl %cl, %esi
+; X32-SSE2-NEXT:    movl %ebx, %edi
+; X32-SSE2-NEXT:    shrdl %cl, %edx, %edi
 ; X32-SSE2-NEXT:    xorl %ebp, %ebp
 ; X32-SSE2-NEXT:    testb $32, %cl
-; X32-SSE2-NEXT:    cmovnel %edi, %ebx
-; X32-SSE2-NEXT:    cmovnel %ebp, %edi
+; X32-SSE2-NEXT:    cmovnel %esi, %edi
+; X32-SSE2-NEXT:    cmovnel %ebp, %esi
 ; X32-SSE2-NEXT:    negb %cl
-; X32-SSE2-NEXT:    movl %esi, %eax
+; X32-SSE2-NEXT:    movl %ebx, %eax
 ; X32-SSE2-NEXT:    shll %cl, %eax
-; X32-SSE2-NEXT:    shldl %cl, %esi, %edx
+; X32-SSE2-NEXT:    shldl %cl, %ebx, %edx
 ; X32-SSE2-NEXT:    testb $32, %cl
 ; X32-SSE2-NEXT:    cmovnel %eax, %edx
 ; X32-SSE2-NEXT:    cmovnel %ebp, %eax
-; X32-SSE2-NEXT:    orl %ebx, %eax
-; X32-SSE2-NEXT:    orl %edi, %edx
+; X32-SSE2-NEXT:    orl %edi, %eax
+; X32-SSE2-NEXT:    orl %esi, %edx
 ; X32-SSE2-NEXT:    popl %esi
 ; X32-SSE2-NEXT:    popl %edi
 ; X32-SSE2-NEXT:    popl %ebx

diff --git a/llvm/test/CodeGen/X86/funnel-shift.ll b/llvm/test/CodeGen/X86/funnel-shift.ll
index 8775957f7b7d9..5435836eeaeb6 100644
--- a/llvm/test/CodeGen/X86/funnel-shift.ll
+++ b/llvm/test/CodeGen/X86/funnel-shift.ll
@@ -47,12 +47,11 @@ define i64 @fshl_i64(i64 %x, i64 %y, i64 %z) nounwind {
 ; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X32-SSE2-NEXT:    movb {{[0-9]+}}(%esp), %ch
 ; X32-SSE2-NEXT:    movb %ch, %cl
 ; X32-SSE2-NEXT:    notb %cl
-; X32-SSE2-NEXT:    shrdl $1, %eax, %esi
-; X32-SSE2-NEXT:    movl %eax, %ebx
+; X32-SSE2-NEXT:    shrdl $1, %ebx, %esi
 ; X32-SSE2-NEXT:    shrl %ebx
 ; X32-SSE2-NEXT:    shrdl %cl, %ebx, %esi
 ; X32-SSE2-NEXT:    shrl %cl, %ebx
@@ -94,26 +93,25 @@ define i128 @fshl_i128(i128 %x, i128 %y, i128 %z) nounwind {
 ; X32-SSE2-NEXT:    pushl %edi
 ; X32-SSE2-NEXT:    pushl %esi
 ; X32-SSE2-NEXT:    subl $64, %esp
-; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X32-SSE2-NEXT:    movl %edi, %esi
-; X32-SSE2-NEXT:    shldl $31, %ecx, %esi
-; X32-SSE2-NEXT:    notl %edx
-; X32-SSE2-NEXT:    andl $127, %edx
-; X32-SSE2-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-SSE2-NEXT:    movl %esi, %edi
+; X32-SSE2-NEXT:    shldl $31, %ecx, %edi
+; X32-SSE2-NEXT:    notl %ebx
+; X32-SSE2-NEXT:    andl $127, %ebx
 ; X32-SSE2-NEXT:    movb $64, %cl
-; X32-SSE2-NEXT:    subb %dl, %cl
-; X32-SSE2-NEXT:    shrl %edi
-; X32-SSE2-NEXT:    movl %edi, %ebx
-; X32-SSE2-NEXT:    shldl %cl, %esi, %ebx
-; X32-SSE2-NEXT:    movl %esi, %ebp
-; X32-SSE2-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    subb %bl, %cl
+; X32-SSE2-NEXT:    shrl %esi
+; X32-SSE2-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X32-SSE2-NEXT:    shldl %cl, %edi, %esi
+; X32-SSE2-NEXT:    movl %edi, %ebp
+; X32-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-SSE2-NEXT:    shll %cl, %ebp
 ; X32-SSE2-NEXT:    xorl %eax, %eax
 ; X32-SSE2-NEXT:    testb $32, %cl
-; X32-SSE2-NEXT:    cmovnel %ebp, %ebx
-; X32-SSE2-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    cmovnel %ebp, %esi
+; X32-SSE2-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-SSE2-NEXT:    cmovnel %eax, %ebp
 ; X32-SSE2-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -123,136 +121,138 @@ define i128 @fshl_i128(i128 %x, i128 %y, i128 %z) nounwind {
 ; X32-SSE2-NEXT:    movl %eax, %ecx
 ; X32-SSE2-NEXT:    shldl %cl, %ebp, %edx
 ; X32-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-SSE2-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X32-SSE2-NEXT:    movl %ebx, %ecx
 ; X32-SSE2-NEXT:    addb $-64, %cl
-; X32-SSE2-NEXT:    movl %esi, %edx
-; X32-SSE2-NEXT:    shrdl %cl, %edi, %edx
-; X32-SSE2-NEXT:    movl %edi, %esi
+; X32-SSE2-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X32-SSE2-NEXT:    shrdl %cl, %esi, %edi
 ; X32-SSE2-NEXT:    shrl %cl, %esi
 ; X32-SSE2-NEXT:    testb $32, %cl
-; X32-SSE2-NEXT:    cmovnel %esi, %edx
-; X32-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    cmovnel %esi, %edi
+; X32-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-SSE2-NEXT:    movl $0, %ecx
 ; X32-SSE2-NEXT:    cmovnel %ecx, %esi
 ; X32-SSE2-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X32-SSE2-NEXT:    movl %eax, %ecx
-; X32-SSE2-NEXT:    shldl %cl, %esi, %ebx
-; X32-SSE2-NEXT:    movl %esi, %edx
+; X32-SSE2-NEXT:    shldl %cl, %edi, %esi
+; X32-SSE2-NEXT:    movl %edi, %edx
 ; X32-SSE2-NEXT:    shll %cl, %edx
 ; X32-SSE2-NEXT:    shll %cl, %ebp
 ; X32-SSE2-NEXT:    testb $32, %al
+; X32-SSE2-NEXT:    movl %eax, %ecx
 ; X32-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-SSE2-NEXT:    cmovnel %ebp, %ecx
-; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-SSE2-NEXT:    cmovnel %edx, %ebx
-; X32-SSE2-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-SSE2-NEXT:    movl $0, %ecx
-; X32-SSE2-NEXT:    cmovnel %ecx, %ebp
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-SSE2-NEXT:    cmovnel %ebp, %eax
+; X32-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    cmovnel %edx, %esi
+; X32-SSE2-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl $0, %eax
+; X32-SSE2-NEXT:    cmovnel %eax, %ebp
 ; X32-SSE2-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-SSE2-NEXT:    cmovnel %ecx, %edx
-; X32-SSE2-NEXT:    xorl %ecx, %ecx
-; X32-SSE2-NEXT:    cmpl $64, %eax
-; X32-SSE2-NEXT:    cmovael %ecx, %edx
+; X32-SSE2-NEXT:    cmovnel %eax, %edx
+; X32-SSE2-NEXT:    xorl %eax, %eax
+; X32-SSE2-NEXT:    cmpl $64, %ecx
+; X32-SSE2-NEXT:    cmovael %eax, %edx
 ; X32-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-SSE2-NEXT:    shldl $31, %eax, %ebx
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-SSE2-NEXT:    shldl $31, %eax, %ebp
+; X32-SSE2-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-SSE2-NEXT:    shrdl $1, %eax, %esi
+; X32-SSE2-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl %ebx, %ecx
 ; X32-SSE2-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-SSE2-NEXT:    shrdl $1, %eax, %edx
-; X32-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-SSE2-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-SSE2-NEXT:    shrdl %cl, %edi, %eax
-; X32-SSE2-NEXT:    shrl %cl, %edi
-; X32-SSE2-NEXT:    movl %edx, %ebp
-; X32-SSE2-NEXT:    shrdl %cl, %ebx, %ebp
-; X32-SSE2-NEXT:    movl %ebx, %edx
+; X32-SSE2-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X32-SSE2-NEXT:    shrdl %cl, %edx, %eax
 ; X32-SSE2-NEXT:    shrl %cl, %edx
+; X32-SSE2-NEXT:    movl %esi, %ebx
+; X32-SSE2-NEXT:    shrdl %cl, %ebp, %ebx
+; X32-SSE2-NEXT:    movl %ebp, %esi
+; X32-SSE2-NEXT:    shrl %cl, %esi
 ; X32-SSE2-NEXT:    testb $32, %cl
-; X32-SSE2-NEXT:    cmovnel %edx, %ebp
-; X32-SSE2-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-SSE2-NEXT:    cmovnel %edi, %eax
+; X32-SSE2-NEXT:    cmovnel %esi, %ebx
+; X32-SSE2-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl %edx, %ecx
+; X32-SSE2-NEXT:    cmovnel %edx, %eax
 ; X32-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-SSE2-NEXT:    movl $0, %ebp
-; X32-SSE2-NEXT:    cmovnel %ebp, %edx
-; X32-SSE2-NEXT:    cmovnel %ebp, %edi
-; X32-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-SSE2-NEXT:    cmpl $64, %eax
+; X32-SSE2-NEXT:    movl $0, %eax
+; X32-SSE2-NEXT:    cmovnel %eax, %esi
+; X32-SSE2-NEXT:    cmovnel %eax, %ecx
+; X32-SSE2-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-SSE2-NEXT:    cmpl $64, %ebx
 ; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-SSE2-NEXT:    cmovael %ebp, %ecx
+; X32-SSE2-NEXT:    cmovael %eax, %ecx
 ; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-SSE2-NEXT:    xorl %ebp, %ebp
 ; X32-SSE2-NEXT:    movb $64, %ch
-; X32-SSE2-NEXT:    subb %al, %ch
-; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-SSE2-NEXT:    subb %bl, %ch
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X32-SSE2-NEXT:    movb %ch, %cl
-; X32-SSE2-NEXT:    shrl %cl, %edi
-; X32-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    shrl %cl, %edx
+; X32-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-SSE2-NEXT:    testb $32, %ch
-; X32-SSE2-NEXT:    cmovnel %ebp, %edi
-; X32-SSE2-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-SSE2-NEXT:    movb %al, %cl
+; X32-SSE2-NEXT:    cmovnel %ebp, %edx
+; X32-SSE2-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-SSE2-NEXT:    movb %bl, %cl
 ; X32-SSE2-NEXT:    addb $-64, %cl
-; X32-SSE2-NEXT:    movl %esi, %ebp
+; X32-SSE2-NEXT:    movl %edi, %ebp
 ; X32-SSE2-NEXT:    shll %cl, %ebp
 ; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT:    shldl %cl, %esi, %eax
+; X32-SSE2-NEXT:    shldl %cl, %edi, %eax
 ; X32-SSE2-NEXT:    testb $32, %cl
 ; X32-SSE2-NEXT:    cmovnel %ebp, %eax
-; X32-SSE2-NEXT:    cmpl $64, (%esp) # 4-byte Folded Reload
-; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-SSE2-NEXT:    movl $0, %esi
-; X32-SSE2-NEXT:    cmovael %esi, %ebx
-; X32-SSE2-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X32-SSE2-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X32-SSE2-NEXT:    movl $0, %edi
+; X32-SSE2-NEXT:    cmovael %edi, %ebx
+; X32-SSE2-NEXT:    movl %ebx, (%esp) # 4-byte Spill
 ; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-SSE2-NEXT:    cmpl $64, %ebx
-; X32-SSE2-NEXT:    cmovbl %edi, %eax
+; X32-SSE2-NEXT:    cmovbl %edx, %eax
 ; X32-SSE2-NEXT:    testb $32, %cl
-; X32-SSE2-NEXT:    movl $0, %esi
-; X32-SSE2-NEXT:    cmovnel %esi, %ebp
-; X32-SSE2-NEXT:    cmpl $64, (%esp) # 4-byte Folded Reload
-; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-SSE2-NEXT:    cmovael %esi, %edi
-; X32-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl $0, %edi
+; X32-SSE2-NEXT:    cmovnel %edi, %ebp
+; X32-SSE2-NEXT:    cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-SSE2-NEXT:    cmovael %edi, %edx
+; X32-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-SSE2-NEXT:    movb %ch, %cl
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-SSE2-NEXT:    shrdl %cl, %edi, %esi
+; X32-SSE2-NEXT:    shrdl %cl, %edx, %edi
 ; X32-SSE2-NEXT:    testb $32, %ch
-; X32-SSE2-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-SSE2-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-SSE2-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-SSE2-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-SSE2-NEXT:    cmpl $64, %ebx
-; X32-SSE2-NEXT:    cmovael %ebp, %esi
-; X32-SSE2-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-SSE2-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X32-SSE2-NEXT:    cmpl $64, %edi
-; X32-SSE2-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-SSE2-NEXT:    cmovael %ebp, %edi
+; X32-SSE2-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-SSE2-NEXT:    cmpl $64, %edx
+; X32-SSE2-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-SSE2-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-SSE2-NEXT:    cmpl $64, %edi
+; X32-SSE2-NEXT:    cmpl $64, %edx
 ; X32-SSE2-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-SSE2-NEXT:    testl %edi, %edi
-; X32-SSE2-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-SSE2-NEXT:    testl %edx, %edx
+; X32-SSE2-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-SSE2-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-SSE2-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-SSE2-NEXT:    movl %ecx, %edi
-; X32-SSE2-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-SSE2-NEXT:    movl %ecx, %edx
+; X32-SSE2-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-SSE2-NEXT:    testl %ebx, %ebx
 ; X32-SSE2-NEXT:    cmovel {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT:    cmovel {{[0-9]+}}(%esp), %esi
-; X32-SSE2-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-SSE2-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-SSE2-NEXT:    cmovel {{[0-9]+}}(%esp), %edi
+; X32-SSE2-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-SSE2-NEXT:    orl (%esp), %eax # 4-byte Folded Reload
 ; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-SSE2-NEXT:    movl %eax, 12(%ecx)
-; X32-SSE2-NEXT:    movl %esi, 8(%ecx)
-; X32-SSE2-NEXT:    movl %edx, 4(%ecx)
-; X32-SSE2-NEXT:    movl %edi, (%ecx)
+; X32-SSE2-NEXT:    movl %edi, 8(%ecx)
+; X32-SSE2-NEXT:    movl %esi, 4(%ecx)
+; X32-SSE2-NEXT:    movl %edx, (%ecx)
 ; X32-SSE2-NEXT:    movl %ecx, %eax
 ; X32-SSE2-NEXT:    addl $64, %esp
 ; X32-SSE2-NEXT:    popl %esi
@@ -485,26 +485,27 @@ define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) nounwind {
 ; X32-SSE2-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X32-SSE2-NEXT:    calll __umoddi3
 ; X32-SSE2-NEXT:    addl $16, %esp
-; X32-SSE2-NEXT:    addb $27, %al
 ; X32-SSE2-NEXT:    movl %eax, %edx
-; X32-SSE2-NEXT:    notb %dl
-; X32-SSE2-NEXT:    movl %edx, %ecx
+; X32-SSE2-NEXT:    addb $27, %dl
+; X32-SSE2-NEXT:    movl %edx, %eax
+; X32-SSE2-NEXT:    notb %al
+; X32-SSE2-NEXT:    movl %eax, %ecx
 ; X32-SSE2-NEXT:    shldl %cl, %edi, %esi
 ; X32-SSE2-NEXT:    shldl $27, %ebp, %ebx
 ; X32-SSE2-NEXT:    shll $27, %ebp
-; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    movl %edx, %ecx
 ; X32-SSE2-NEXT:    shrdl %cl, %ebx, %ebp
 ; X32-SSE2-NEXT:    shrl %cl, %ebx
 ; X32-SSE2-NEXT:    xorl %ecx, %ecx
-; X32-SSE2-NEXT:    testb $32, %al
+; X32-SSE2-NEXT:    testb $32, %dl
 ; X32-SSE2-NEXT:    cmovnel %ebx, %ebp
 ; X32-SSE2-NEXT:    cmovnel %ecx, %ebx
-; X32-SSE2-NEXT:    xorl %eax, %eax
-; X32-SSE2-NEXT:    movl %edx, %ecx
+; X32-SSE2-NEXT:    xorl %edx, %edx
+; X32-SSE2-NEXT:    movl %eax, %ecx
 ; X32-SSE2-NEXT:    shll %cl, %edi
-; X32-SSE2-NEXT:    testb $32, %dl
+; X32-SSE2-NEXT:    testb $32, %al
 ; X32-SSE2-NEXT:    cmovnel %edi, %esi
-; X32-SSE2-NEXT:    cmovnel %eax, %edi
+; X32-SSE2-NEXT:    cmovnel %edx, %edi
 ; X32-SSE2-NEXT:    orl %ebp, %edi
 ; X32-SSE2-NEXT:    orl %ebx, %esi
 ; X32-SSE2-NEXT:    movl %edi, %eax
@@ -1155,21 +1156,21 @@ define void @PR45265(i32 %0, %struct.S* nocapture readonly %1) nounwind {
 ; X32-SSE2-NEXT:    pushl %edi
 ; X32-SSE2-NEXT:    pushl %esi
 ; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-SSE2-NEXT:    leal (%eax,%eax,2), %edx
-; X32-SSE2-NEXT:    movzwl 8(%ecx,%edx,4), %esi
-; X32-SSE2-NEXT:    movsbl 10(%ecx,%edx,4), %edi
-; X32-SSE2-NEXT:    movl %edi, %ebx
-; X32-SSE2-NEXT:    shll $16, %ebx
-; X32-SSE2-NEXT:    orl %esi, %ebx
-; X32-SSE2-NEXT:    movl 4(%ecx,%edx,4), %ecx
-; X32-SSE2-NEXT:    shrdl $8, %ebx, %ecx
-; X32-SSE2-NEXT:    xorl %eax, %ecx
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-SSE2-NEXT:    leal (%eax,%eax,2), %edi
+; X32-SSE2-NEXT:    movzwl 8(%esi,%edi,4), %ebx
+; X32-SSE2-NEXT:    movsbl 10(%esi,%edi,4), %ecx
+; X32-SSE2-NEXT:    movl %ecx, %edx
+; X32-SSE2-NEXT:    shll $16, %edx
+; X32-SSE2-NEXT:    orl %ebx, %edx
+; X32-SSE2-NEXT:    movl 4(%esi,%edi,4), %esi
+; X32-SSE2-NEXT:    shrdl $8, %edx, %esi
+; X32-SSE2-NEXT:    xorl %eax, %esi
 ; X32-SSE2-NEXT:    sarl $31, %eax
-; X32-SSE2-NEXT:    sarl $31, %edi
-; X32-SSE2-NEXT:    shldl $24, %ebx, %edi
-; X32-SSE2-NEXT:    xorl %eax, %edi
-; X32-SSE2-NEXT:    orl %edi, %ecx
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    shldl $24, %edx, %ecx
+; X32-SSE2-NEXT:    xorl %eax, %ecx
+; X32-SSE2-NEXT:    orl %ecx, %esi
 ; X32-SSE2-NEXT:    jne .LBB46_1
 ; X32-SSE2-NEXT:  # %bb.2:
 ; X32-SSE2-NEXT:    popl %esi

diff --git a/llvm/test/CodeGen/X86/gather-addresses.ll b/llvm/test/CodeGen/X86/gather-addresses.ll
index 4cc2cf599fad2..c6bcaba3fad9b 100644
--- a/llvm/test/CodeGen/X86/gather-addresses.ll
+++ b/llvm/test/CodeGen/X86/gather-addresses.ll
@@ -229,19 +229,19 @@ define <4 x i64> @old(double* %p, <4 x i32>* %i, <4 x i32>* %h, i64 %f) nounwind
 ; LIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; LIN32-NEXT:    movdqa (%edx), %xmm0
 ; LIN32-NEXT:    pand (%ecx), %xmm0
-; LIN32-NEXT:    movd %xmm0, %ecx
-; LIN32-NEXT:    pextrd $1, %xmm0, %edx
-; LIN32-NEXT:    pextrd $2, %xmm0, %esi
+; LIN32-NEXT:    movd %xmm0, %edx
+; LIN32-NEXT:    pextrd $1, %xmm0, %esi
+; LIN32-NEXT:    pextrd $2, %xmm0, %ecx
 ; LIN32-NEXT:    pextrd $3, %xmm0, %edi
-; LIN32-NEXT:    andl %eax, %ecx
 ; LIN32-NEXT:    andl %eax, %edx
 ; LIN32-NEXT:    andl %eax, %esi
+; LIN32-NEXT:    andl %eax, %ecx
 ; LIN32-NEXT:    andl %eax, %edi
-; LIN32-NEXT:    movd %edx, %xmm1
-; LIN32-NEXT:    movd %ecx, %xmm0
+; LIN32-NEXT:    movd %esi, %xmm1
+; LIN32-NEXT:    movd %edx, %xmm0
 ; LIN32-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; LIN32-NEXT:    movd %edi, %xmm2
-; LIN32-NEXT:    movd %esi, %xmm1
+; LIN32-NEXT:    movd %ecx, %xmm1
 ; LIN32-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; LIN32-NEXT:    popl %esi
 ; LIN32-NEXT:    popl %edi

diff --git a/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll b/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
index a8f41c2680f2f..86b20c249d8c4 100644
--- a/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
+++ b/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
@@ -1,11 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+sse,sse2                  < %s | FileCheck %s --check-prefixes=CHECK,X86,SSE2,X86-SSE2,X86-BMI1
-; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+sse,sse2,+bmi             < %s | FileCheck %s --check-prefixes=CHECK,X86,SSE2,X86-SSE2,X86-BMI1
-; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+sse,sse2,+bmi,+bmi2       < %s | FileCheck %s --check-prefixes=CHECK,X86,SSE2,X86-SSE2,X86-BMI2
+; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+sse,sse2                  < %s | FileCheck %s --check-prefixes=CHECK,X86,X86-SSE2,X86-BMI1
+; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+sse,sse2,+bmi             < %s | FileCheck %s --check-prefixes=CHECK,X86,X86-SSE2,X86-BMI1
+; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+sse,sse2,+bmi,+bmi2       < %s | FileCheck %s --check-prefixes=CHECK,X86,X86-SSE2,X86-BMI2
 ; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+sse,sse2,+bmi,+bmi2,+avx2 < %s | FileCheck %s --check-prefixes=CHECK,X86,X86-BMI2,AVX2
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse,sse2                  < %s | FileCheck %s --check-prefixes=CHECK,X64,SSE2,X64-SSE2,X64-BMI1
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse,sse2,+bmi             < %s | FileCheck %s --check-prefixes=CHECK,X64,SSE2,X64-SSE2,X64-BMI1
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse,sse2,+bmi,+bmi2       < %s | FileCheck %s --check-prefixes=CHECK,X64,SSE2,X64-SSE2,X64-BMI2
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse,sse2                  < %s | FileCheck %s --check-prefixes=CHECK,X64,X64-SSE2,X64-BMI1
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse,sse2,+bmi             < %s | FileCheck %s --check-prefixes=CHECK,X64,X64-SSE2,X64-BMI1
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse,sse2,+bmi,+bmi2       < %s | FileCheck %s --check-prefixes=CHECK,X64,X64-SSE2,X64-BMI2
 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse,sse2,+bmi,+bmi2,+avx2 < %s | FileCheck %s --check-prefixes=CHECK,X64,X64-BMI2,AVX2
 
 ; We are looking for the following pattern here:
@@ -544,28 +544,28 @@ define <4 x i1> @vec_4xi32_splat_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
 }
 
 define <4 x i1> @vec_4xi32_nonsplat_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
-; SSE2-LABEL: vec_4xi32_nonsplat_eq:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,16776960,2147483648]
-; SSE2-NEXT:    movdqa %xmm3, %xmm4
-; SSE2-NEXT:    psrld %xmm2, %xmm4
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[0,1,1,1,4,5,6,7]
-; SSE2-NEXT:    movdqa %xmm3, %xmm5
-; SSE2-NEXT:    psrld %xmm2, %xmm5
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
-; SSE2-NEXT:    movdqa %xmm3, %xmm4
-; SSE2-NEXT:    psrld %xmm2, %xmm4
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
-; SSE2-NEXT:    psrld %xmm1, %xmm3
-; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1]
-; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,3],xmm3[0,3]
-; SSE2-NEXT:    andps %xmm5, %xmm0
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; SSE2-NEXT:    ret{{[l|q]}}
+; X86-SSE2-LABEL: vec_4xi32_nonsplat_eq:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,16776960,2147483648]
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm4
+; X86-SSE2-NEXT:    psrld %xmm3, %xmm4
+; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[0,1,1,1,4,5,6,7]
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT:    psrld %xmm3, %xmm5
+; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm4
+; X86-SSE2-NEXT:    psrld %xmm3, %xmm4
+; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; X86-SSE2-NEXT:    psrld %xmm1, %xmm2
+; X86-SSE2-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm4[1]
+; X86-SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,3],xmm2[0,3]
+; X86-SSE2-NEXT:    andps %xmm5, %xmm0
+; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
+; X86-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
+; X86-SSE2-NEXT:    retl
 ;
 ; AVX2-LABEL: vec_4xi32_nonsplat_eq:
 ; AVX2:       # %bb.0:
@@ -575,6 +575,29 @@ define <4 x i1> @vec_4xi32_nonsplat_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    ret{{[l|q]}}
+;
+; X64-SSE2-LABEL: vec_4xi32_nonsplat_eq:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,16776960,2147483648]
+; X64-SSE2-NEXT:    movdqa %xmm3, %xmm4
+; X64-SSE2-NEXT:    psrld %xmm2, %xmm4
+; X64-SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[0,1,1,1,4,5,6,7]
+; X64-SSE2-NEXT:    movdqa %xmm3, %xmm5
+; X64-SSE2-NEXT:    psrld %xmm2, %xmm5
+; X64-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; X64-SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; X64-SSE2-NEXT:    movdqa %xmm3, %xmm4
+; X64-SSE2-NEXT:    psrld %xmm2, %xmm4
+; X64-SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; X64-SSE2-NEXT:    psrld %xmm1, %xmm3
+; X64-SSE2-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1]
+; X64-SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,3],xmm3[0,3]
+; X64-SSE2-NEXT:    andps %xmm5, %xmm0
+; X64-SSE2-NEXT:    pxor %xmm1, %xmm1
+; X64-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
+; X64-SSE2-NEXT:    retq
   %t0 = lshr <4 x i32> <i32 0, i32 1, i32 16776960, i32 2147483648>, %y
   %t1 = and <4 x i32> %t0, %x
   %res = icmp eq <4 x i32> %t1, <i32 0, i32 0, i32 0, i32 0>
@@ -630,28 +653,28 @@ define <4 x i1> @vec_4xi32_nonsplat_undef0_eq(<4 x i32> %x, <4 x i32> %y) nounwi
   ret <4 x i1> %res
 }
 define <4 x i1> @vec_4xi32_nonsplat_undef1_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
-; SSE2-LABEL: vec_4xi32_nonsplat_undef1_eq:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1,1,1,1]
-; SSE2-NEXT:    movdqa %xmm3, %xmm4
-; SSE2-NEXT:    psrld %xmm2, %xmm4
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[0,1,1,1,4,5,6,7]
-; SSE2-NEXT:    movdqa %xmm3, %xmm5
-; SSE2-NEXT:    psrld %xmm2, %xmm5
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
-; SSE2-NEXT:    movdqa %xmm3, %xmm4
-; SSE2-NEXT:    psrld %xmm2, %xmm4
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
-; SSE2-NEXT:    psrld %xmm1, %xmm3
-; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1]
-; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,3],xmm3[0,3]
-; SSE2-NEXT:    andps %xmm5, %xmm0
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; SSE2-NEXT:    ret{{[l|q]}}
+; X86-SSE2-LABEL: vec_4xi32_nonsplat_undef1_eq:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1,1,1,1]
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm4
+; X86-SSE2-NEXT:    psrld %xmm3, %xmm4
+; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[0,1,1,1,4,5,6,7]
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT:    psrld %xmm3, %xmm5
+; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm4
+; X86-SSE2-NEXT:    psrld %xmm3, %xmm4
+; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; X86-SSE2-NEXT:    psrld %xmm1, %xmm2
+; X86-SSE2-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm4[1]
+; X86-SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,3],xmm2[0,3]
+; X86-SSE2-NEXT:    andps %xmm5, %xmm0
+; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
+; X86-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
+; X86-SSE2-NEXT:    retl
 ;
 ; AVX2-LABEL: vec_4xi32_nonsplat_undef1_eq:
 ; AVX2:       # %bb.0:
@@ -661,34 +684,57 @@ define <4 x i1> @vec_4xi32_nonsplat_undef1_eq(<4 x i32> %x, <4 x i32> %y) nounwi
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    ret{{[l|q]}}
+;
+; X64-SSE2-LABEL: vec_4xi32_nonsplat_undef1_eq:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1,1,1,1]
+; X64-SSE2-NEXT:    movdqa %xmm3, %xmm4
+; X64-SSE2-NEXT:    psrld %xmm2, %xmm4
+; X64-SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[0,1,1,1,4,5,6,7]
+; X64-SSE2-NEXT:    movdqa %xmm3, %xmm5
+; X64-SSE2-NEXT:    psrld %xmm2, %xmm5
+; X64-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; X64-SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; X64-SSE2-NEXT:    movdqa %xmm3, %xmm4
+; X64-SSE2-NEXT:    psrld %xmm2, %xmm4
+; X64-SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; X64-SSE2-NEXT:    psrld %xmm1, %xmm3
+; X64-SSE2-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1]
+; X64-SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,3],xmm3[0,3]
+; X64-SSE2-NEXT:    andps %xmm5, %xmm0
+; X64-SSE2-NEXT:    pxor %xmm1, %xmm1
+; X64-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
+; X64-SSE2-NEXT:    retq
   %t0 = lshr <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %y
   %t1 = and <4 x i32> %t0, %x
   %res = icmp eq <4 x i32> %t1, <i32 0, i32 0, i32 undef, i32 0>
   ret <4 x i1> %res
 }
 define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
-; SSE2-LABEL: vec_4xi32_nonsplat_undef2_eq:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = <1,1,u,1>
-; SSE2-NEXT:    movdqa %xmm3, %xmm4
-; SSE2-NEXT:    psrld %xmm2, %xmm4
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[0,1,1,1,4,5,6,7]
-; SSE2-NEXT:    movdqa %xmm3, %xmm5
-; SSE2-NEXT:    psrld %xmm2, %xmm5
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
-; SSE2-NEXT:    movdqa %xmm3, %xmm4
-; SSE2-NEXT:    psrld %xmm2, %xmm4
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
-; SSE2-NEXT:    psrld %xmm1, %xmm3
-; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1]
-; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,3],xmm3[0,3]
-; SSE2-NEXT:    andps %xmm5, %xmm0
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; SSE2-NEXT:    ret{{[l|q]}}
+; X86-SSE2-LABEL: vec_4xi32_nonsplat_undef2_eq:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = <1,1,u,1>
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm4
+; X86-SSE2-NEXT:    psrld %xmm3, %xmm4
+; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[0,1,1,1,4,5,6,7]
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT:    psrld %xmm3, %xmm5
+; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm4
+; X86-SSE2-NEXT:    psrld %xmm3, %xmm4
+; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; X86-SSE2-NEXT:    psrld %xmm1, %xmm2
+; X86-SSE2-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm4[1]
+; X86-SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,3],xmm2[0,3]
+; X86-SSE2-NEXT:    andps %xmm5, %xmm0
+; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
+; X86-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
+; X86-SSE2-NEXT:    retl
 ;
 ; AVX2-LABEL: vec_4xi32_nonsplat_undef2_eq:
 ; AVX2:       # %bb.0:
@@ -698,6 +744,29 @@ define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwi
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    ret{{[l|q]}}
+;
+; X64-SSE2-LABEL: vec_4xi32_nonsplat_undef2_eq:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = <1,1,u,1>
+; X64-SSE2-NEXT:    movdqa %xmm3, %xmm4
+; X64-SSE2-NEXT:    psrld %xmm2, %xmm4
+; X64-SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[0,1,1,1,4,5,6,7]
+; X64-SSE2-NEXT:    movdqa %xmm3, %xmm5
+; X64-SSE2-NEXT:    psrld %xmm2, %xmm5
+; X64-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; X64-SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; X64-SSE2-NEXT:    movdqa %xmm3, %xmm4
+; X64-SSE2-NEXT:    psrld %xmm2, %xmm4
+; X64-SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; X64-SSE2-NEXT:    psrld %xmm1, %xmm3
+; X64-SSE2-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1]
+; X64-SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,3],xmm3[0,3]
+; X64-SSE2-NEXT:    andps %xmm5, %xmm0
+; X64-SSE2-NEXT:    pxor %xmm1, %xmm1
+; X64-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
+; X64-SSE2-NEXT:    retq
   %t0 = lshr <4 x i32> <i32 1, i32 1, i32 undef, i32 1>, %y
   %t1 = and <4 x i32> %t0, %x
   %res = icmp eq <4 x i32> %t1, <i32 0, i32 0, i32 undef, i32 0>

diff --git a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
index 8b00fa8c2b64f..04c9cdd5b7cab 100644
--- a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
+++ b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
@@ -346,16 +346,16 @@ define i1 @scalar_i64_lowestbit_eq(i64 %x, i64 %y) nounwind {
 ; X86-BMI1-NEXT:    pushl %esi
 ; X86-BMI1-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1-NEXT:    movl $1, %eax
-; X86-BMI1-NEXT:    xorl %edx, %edx
 ; X86-BMI1-NEXT:    xorl %esi, %esi
-; X86-BMI1-NEXT:    shldl %cl, %eax, %esi
+; X86-BMI1-NEXT:    xorl %edx, %edx
+; X86-BMI1-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI1-NEXT:    shll %cl, %eax
 ; X86-BMI1-NEXT:    testb $32, %cl
-; X86-BMI1-NEXT:    cmovnel %eax, %esi
-; X86-BMI1-NEXT:    cmovnel %edx, %eax
-; X86-BMI1-NEXT:    andl {{[0-9]+}}(%esp), %esi
+; X86-BMI1-NEXT:    cmovnel %eax, %edx
+; X86-BMI1-NEXT:    cmovnel %esi, %eax
+; X86-BMI1-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X86-BMI1-NEXT:    orl %esi, %eax
+; X86-BMI1-NEXT:    orl %edx, %eax
 ; X86-BMI1-NEXT:    sete %al
 ; X86-BMI1-NEXT:    popl %esi
 ; X86-BMI1-NEXT:    retl

diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll
index 987de633c72f2..c19a77f2f5c2b 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll
@@ -279,27 +279,27 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {
 ; X86-SSE2-NEXT:    pand %xmm2, %xmm0
 ; X86-SSE2-NEXT:    pandn %xmm1, %xmm2
 ; X86-SSE2-NEXT:    por %xmm0, %xmm2
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
-; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
-; X86-SSE2-NEXT:    pand %xmm1, %xmm2
-; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
-; X86-SSE2-NEXT:    por %xmm2, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT:    pcmpgtb %xmm1, %xmm0
+; X86-SSE2-NEXT:    pand %xmm0, %xmm2
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm0
+; X86-SSE2-NEXT:    por %xmm2, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT:    psrld $16, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm2, %xmm1
+; X86-SSE2-NEXT:    por %xmm0, %xmm1
 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE2-NEXT:    psrld $16, %xmm0
+; X86-SSE2-NEXT:    psrlw $8, %xmm0
 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
 ; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
 ; X86-SSE2-NEXT:    pand %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
 ; X86-SSE2-NEXT:    por %xmm1, %xmm2
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
-; X86-SSE2-NEXT:    psrlw $8, %xmm0
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
-; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
-; X86-SSE2-NEXT:    pand %xmm1, %xmm2
-; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
-; X86-SSE2-NEXT:    por %xmm2, %xmm1
-; X86-SSE2-NEXT:    movd %xmm1, %eax
+; X86-SSE2-NEXT:    movd %xmm2, %eax
 ; X86-SSE2-NEXT:    ## kill: def $al killed $al killed $eax
 ; X86-SSE2-NEXT:    retl
 ;
@@ -412,28 +412,28 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
 ; X86-SSE2-NEXT:    pcmpeqd %xmm3, %xmm4
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; X86-SSE2-NEXT:    pand %xmm6, %xmm3
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; X86-SSE2-NEXT:    por %xmm3, %xmm4
-; X86-SSE2-NEXT:    pand %xmm4, %xmm0
-; X86-SSE2-NEXT:    pandn %xmm1, %xmm4
-; X86-SSE2-NEXT:    por %xmm0, %xmm4
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
-; X86-SSE2-NEXT:    movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT:    pand %xmm6, %xmm4
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
+; X86-SSE2-NEXT:    por %xmm4, %xmm3
+; X86-SSE2-NEXT:    pand %xmm3, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm3
+; X86-SSE2-NEXT:    por %xmm0, %xmm3
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
-; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
 ; X86-SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
 ; X86-SSE2-NEXT:    pand %xmm5, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
 ; X86-SSE2-NEXT:    por %xmm1, %xmm2
-; X86-SSE2-NEXT:    pand %xmm2, %xmm4
+; X86-SSE2-NEXT:    pand %xmm2, %xmm3
 ; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
-; X86-SSE2-NEXT:    por %xmm4, %xmm2
+; X86-SSE2-NEXT:    por %xmm3, %xmm2
 ; X86-SSE2-NEXT:    movd %xmm2, %eax
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
 ; X86-SSE2-NEXT:    movd %xmm0, %edx
@@ -844,19 +844,19 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
 ; X86-SSE2-NEXT:    pand %xmm1, %xmm2
 ; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
 ; X86-SSE2-NEXT:    por %xmm2, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
-; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
-; X86-SSE2-NEXT:    pand %xmm2, %xmm1
-; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
-; X86-SSE2-NEXT:    por %xmm1, %xmm2
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
-; X86-SSE2-NEXT:    psrld $16, %xmm0
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
-; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
-; X86-SSE2-NEXT:    pand %xmm1, %xmm2
-; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
-; X86-SSE2-NEXT:    por %xmm2, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT:    pcmpgtb %xmm2, %xmm0
+; X86-SSE2-NEXT:    pand %xmm0, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm2, %xmm0
+; X86-SSE2-NEXT:    por %xmm1, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT:    psrld $16, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm2, %xmm1
+; X86-SSE2-NEXT:    por %xmm0, %xmm1
 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; X86-SSE2-NEXT:    psrlw $8, %xmm0
 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
@@ -1050,32 +1050,32 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm6
 ; X86-SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; X86-SSE2-NEXT:    pand %xmm0, %xmm2
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; X86-SSE2-NEXT:    por %xmm2, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT:    pand %xmm0, %xmm7
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
+; X86-SSE2-NEXT:    por %xmm7, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm3, %xmm2
+; X86-SSE2-NEXT:    por %xmm1, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm5, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; X86-SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; X86-SSE2-NEXT:    pand %xmm0, %xmm1
-; X86-SSE2-NEXT:    pandn %xmm3, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
 ; X86-SSE2-NEXT:    por %xmm1, %xmm0
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
-; X86-SSE2-NEXT:    movdqa %xmm5, %xmm2
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm3
-; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
-; X86-SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,0,2,2]
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; X86-SSE2-NEXT:    pand %xmm1, %xmm2
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; X86-SSE2-NEXT:    por %xmm2, %xmm1
-; X86-SSE2-NEXT:    pand %xmm1, %xmm5
-; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
-; X86-SSE2-NEXT:    por %xmm5, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT:    pand %xmm0, %xmm5
+; X86-SSE2-NEXT:    pandn %xmm2, %xmm0
+; X86-SSE2-NEXT:    por %xmm5, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
-; X86-SSE2-NEXT:    pxor %xmm0, %xmm4
+; X86-SSE2-NEXT:    pxor %xmm1, %xmm4
 ; X86-SSE2-NEXT:    movdqa %xmm2, %xmm3
 ; X86-SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
 ; X86-SSE2-NEXT:    pcmpeqd %xmm2, %xmm4
@@ -1084,9 +1084,9 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X86-SSE2-NEXT:    pand %xmm2, %xmm4
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
 ; X86-SSE2-NEXT:    por %xmm4, %xmm2
-; X86-SSE2-NEXT:    pand %xmm2, %xmm1
-; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
-; X86-SSE2-NEXT:    por %xmm1, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm2
+; X86-SSE2-NEXT:    por %xmm0, %xmm2
 ; X86-SSE2-NEXT:    movd %xmm2, %eax
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
 ; X86-SSE2-NEXT:    movd %xmm0, %edx
@@ -1628,27 +1628,27 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
 ; X86-SSE2-NEXT:    pand %xmm2, %xmm0
 ; X86-SSE2-NEXT:    pandn %xmm1, %xmm2
 ; X86-SSE2-NEXT:    por %xmm0, %xmm2
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
-; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
-; X86-SSE2-NEXT:    pand %xmm1, %xmm2
-; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
-; X86-SSE2-NEXT:    por %xmm2, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT:    pcmpgtb %xmm1, %xmm0
+; X86-SSE2-NEXT:    pand %xmm0, %xmm2
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm0
+; X86-SSE2-NEXT:    por %xmm2, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT:    psrld $16, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm2, %xmm1
+; X86-SSE2-NEXT:    por %xmm0, %xmm1
 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE2-NEXT:    psrld $16, %xmm0
+; X86-SSE2-NEXT:    psrlw $8, %xmm0
 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
 ; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
 ; X86-SSE2-NEXT:    pand %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
 ; X86-SSE2-NEXT:    por %xmm1, %xmm2
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
-; X86-SSE2-NEXT:    psrlw $8, %xmm0
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
-; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
-; X86-SSE2-NEXT:    pand %xmm1, %xmm2
-; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
-; X86-SSE2-NEXT:    por %xmm2, %xmm1
-; X86-SSE2-NEXT:    movd %xmm1, %eax
+; X86-SSE2-NEXT:    movd %xmm2, %eax
 ; X86-SSE2-NEXT:    ## kill: def $al killed $al killed $eax
 ; X86-SSE2-NEXT:    retl
 ;
@@ -1997,27 +1997,27 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) {
 ; X86-SSE2-NEXT:    pand %xmm2, %xmm0
 ; X86-SSE2-NEXT:    pandn %xmm1, %xmm2
 ; X86-SSE2-NEXT:    por %xmm0, %xmm2
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
-; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
-; X86-SSE2-NEXT:    pand %xmm1, %xmm2
-; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
-; X86-SSE2-NEXT:    por %xmm2, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT:    pcmpgtb %xmm1, %xmm0
+; X86-SSE2-NEXT:    pand %xmm0, %xmm2
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm0
+; X86-SSE2-NEXT:    por %xmm2, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT:    psrld $16, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm2, %xmm1
+; X86-SSE2-NEXT:    por %xmm0, %xmm1
 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE2-NEXT:    psrld $16, %xmm0
+; X86-SSE2-NEXT:    psrlw $8, %xmm0
 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
 ; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
 ; X86-SSE2-NEXT:    pand %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
 ; X86-SSE2-NEXT:    por %xmm1, %xmm2
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
-; X86-SSE2-NEXT:    psrlw $8, %xmm0
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
-; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
-; X86-SSE2-NEXT:    pand %xmm1, %xmm2
-; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
-; X86-SSE2-NEXT:    por %xmm2, %xmm1
-; X86-SSE2-NEXT:    movd %xmm1, %eax
+; X86-SSE2-NEXT:    movd %xmm2, %eax
 ; X86-SSE2-NEXT:    ## kill: def $al killed $al killed $eax
 ; X86-SSE2-NEXT:    retl
 ;
@@ -2125,27 +2125,27 @@ define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) {
 ; X86-SSE2-NEXT:    pand %xmm2, %xmm0
 ; X86-SSE2-NEXT:    pandn %xmm1, %xmm2
 ; X86-SSE2-NEXT:    por %xmm0, %xmm2
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
-; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
-; X86-SSE2-NEXT:    pand %xmm1, %xmm2
-; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
-; X86-SSE2-NEXT:    por %xmm2, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT:    pcmpgtb %xmm1, %xmm0
+; X86-SSE2-NEXT:    pand %xmm0, %xmm2
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm0
+; X86-SSE2-NEXT:    por %xmm2, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT:    psrld $16, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm2, %xmm1
+; X86-SSE2-NEXT:    por %xmm0, %xmm1
 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE2-NEXT:    psrld $16, %xmm0
+; X86-SSE2-NEXT:    psrlw $8, %xmm0
 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
 ; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
 ; X86-SSE2-NEXT:    pand %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
 ; X86-SSE2-NEXT:    por %xmm1, %xmm2
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
-; X86-SSE2-NEXT:    psrlw $8, %xmm0
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
-; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
-; X86-SSE2-NEXT:    pand %xmm1, %xmm2
-; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
-; X86-SSE2-NEXT:    por %xmm2, %xmm1
-; X86-SSE2-NEXT:    movd %xmm1, %eax
+; X86-SSE2-NEXT:    movd %xmm2, %eax
 ; X86-SSE2-NEXT:    ## kill: def $al killed $al killed $eax
 ; X86-SSE2-NEXT:    retl
 ;

diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll
index 622c6fab8b107..8e5bc10fb6256 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll
@@ -281,27 +281,27 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {
 ; X86-SSE2-NEXT:    pand %xmm2, %xmm0
 ; X86-SSE2-NEXT:    pandn %xmm1, %xmm2
 ; X86-SSE2-NEXT:    por %xmm0, %xmm2
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
-; X86-SSE2-NEXT:    pand %xmm1, %xmm2
-; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
-; X86-SSE2-NEXT:    por %xmm2, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE2-NEXT:    psrld $16, %xmm0
+; X86-SSE2-NEXT:    pcmpgtb %xmm2, %xmm0
+; X86-SSE2-NEXT:    pand %xmm0, %xmm2
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm0
+; X86-SSE2-NEXT:    por %xmm2, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT:    psrld $16, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm2, %xmm1
+; X86-SSE2-NEXT:    por %xmm0, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT:    psrlw $8, %xmm0
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; X86-SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
 ; X86-SSE2-NEXT:    pand %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
 ; X86-SSE2-NEXT:    por %xmm1, %xmm2
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
-; X86-SSE2-NEXT:    psrlw $8, %xmm0
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
-; X86-SSE2-NEXT:    pand %xmm1, %xmm2
-; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
-; X86-SSE2-NEXT:    por %xmm2, %xmm1
-; X86-SSE2-NEXT:    movd %xmm1, %eax
+; X86-SSE2-NEXT:    movd %xmm2, %eax
 ; X86-SSE2-NEXT:    ## kill: def $al killed $al killed $eax
 ; X86-SSE2-NEXT:    retl
 ;
@@ -414,28 +414,28 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
 ; X86-SSE2-NEXT:    pcmpeqd %xmm3, %xmm4
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; X86-SSE2-NEXT:    pand %xmm6, %xmm3
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; X86-SSE2-NEXT:    por %xmm3, %xmm4
-; X86-SSE2-NEXT:    pand %xmm4, %xmm0
-; X86-SSE2-NEXT:    pandn %xmm1, %xmm4
-; X86-SSE2-NEXT:    por %xmm0, %xmm4
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
-; X86-SSE2-NEXT:    movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT:    pand %xmm6, %xmm4
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
+; X86-SSE2-NEXT:    por %xmm4, %xmm3
+; X86-SSE2-NEXT:    pand %xmm3, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm3
+; X86-SSE2-NEXT:    por %xmm0, %xmm3
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm3
-; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm4
+; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
 ; X86-SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
 ; X86-SSE2-NEXT:    pand %xmm5, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
 ; X86-SSE2-NEXT:    por %xmm1, %xmm2
-; X86-SSE2-NEXT:    pand %xmm2, %xmm4
+; X86-SSE2-NEXT:    pand %xmm2, %xmm3
 ; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
-; X86-SSE2-NEXT:    por %xmm4, %xmm2
+; X86-SSE2-NEXT:    por %xmm3, %xmm2
 ; X86-SSE2-NEXT:    movd %xmm2, %eax
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
 ; X86-SSE2-NEXT:    movd %xmm0, %edx
@@ -848,19 +848,19 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
 ; X86-SSE2-NEXT:    pand %xmm1, %xmm2
 ; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
 ; X86-SSE2-NEXT:    por %xmm2, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; X86-SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
-; X86-SSE2-NEXT:    pand %xmm2, %xmm1
-; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
-; X86-SSE2-NEXT:    por %xmm1, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
 ; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
-; X86-SSE2-NEXT:    psrld $16, %xmm0
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
-; X86-SSE2-NEXT:    pand %xmm1, %xmm2
-; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
-; X86-SSE2-NEXT:    por %xmm2, %xmm1
+; X86-SSE2-NEXT:    pcmpgtb %xmm1, %xmm0
+; X86-SSE2-NEXT:    pand %xmm0, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm2, %xmm0
+; X86-SSE2-NEXT:    por %xmm1, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT:    psrld $16, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm2, %xmm1
+; X86-SSE2-NEXT:    por %xmm0, %xmm1
 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; X86-SSE2-NEXT:    psrlw $8, %xmm0
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
@@ -1632,27 +1632,27 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
 ; X86-SSE2-NEXT:    pand %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
 ; X86-SSE2-NEXT:    por %xmm1, %xmm2
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
-; X86-SSE2-NEXT:    pand %xmm1, %xmm2
-; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
-; X86-SSE2-NEXT:    por %xmm2, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE2-NEXT:    psrld $16, %xmm0
+; X86-SSE2-NEXT:    pcmpgtb %xmm2, %xmm0
+; X86-SSE2-NEXT:    pand %xmm0, %xmm2
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm0
+; X86-SSE2-NEXT:    por %xmm2, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT:    psrld $16, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm2, %xmm1
+; X86-SSE2-NEXT:    por %xmm0, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT:    psrlw $8, %xmm0
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; X86-SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
 ; X86-SSE2-NEXT:    pand %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
 ; X86-SSE2-NEXT:    por %xmm1, %xmm2
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
-; X86-SSE2-NEXT:    psrlw $8, %xmm0
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
-; X86-SSE2-NEXT:    pand %xmm1, %xmm2
-; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
-; X86-SSE2-NEXT:    por %xmm2, %xmm1
-; X86-SSE2-NEXT:    movd %xmm1, %eax
+; X86-SSE2-NEXT:    movd %xmm2, %eax
 ; X86-SSE2-NEXT:    ## kill: def $al killed $al killed $eax
 ; X86-SSE2-NEXT:    retl
 ;
@@ -2001,27 +2001,27 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) {
 ; X86-SSE2-NEXT:    pand %xmm2, %xmm0
 ; X86-SSE2-NEXT:    pandn %xmm1, %xmm2
 ; X86-SSE2-NEXT:    por %xmm0, %xmm2
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
-; X86-SSE2-NEXT:    pand %xmm1, %xmm2
-; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
-; X86-SSE2-NEXT:    por %xmm2, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE2-NEXT:    psrld $16, %xmm0
+; X86-SSE2-NEXT:    pcmpgtb %xmm2, %xmm0
+; X86-SSE2-NEXT:    pand %xmm0, %xmm2
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm0
+; X86-SSE2-NEXT:    por %xmm2, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT:    psrld $16, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm2, %xmm1
+; X86-SSE2-NEXT:    por %xmm0, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT:    psrlw $8, %xmm0
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; X86-SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
 ; X86-SSE2-NEXT:    pand %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
 ; X86-SSE2-NEXT:    por %xmm1, %xmm2
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
-; X86-SSE2-NEXT:    psrlw $8, %xmm0
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
-; X86-SSE2-NEXT:    pand %xmm1, %xmm2
-; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
-; X86-SSE2-NEXT:    por %xmm2, %xmm1
-; X86-SSE2-NEXT:    movd %xmm1, %eax
+; X86-SSE2-NEXT:    movd %xmm2, %eax
 ; X86-SSE2-NEXT:    ## kill: def $al killed $al killed $eax
 ; X86-SSE2-NEXT:    retl
 ;
@@ -2129,27 +2129,27 @@ define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) {
 ; X86-SSE2-NEXT:    pand %xmm2, %xmm0
 ; X86-SSE2-NEXT:    pandn %xmm1, %xmm2
 ; X86-SSE2-NEXT:    por %xmm0, %xmm2
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
-; X86-SSE2-NEXT:    pand %xmm1, %xmm2
-; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
-; X86-SSE2-NEXT:    por %xmm2, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE2-NEXT:    psrld $16, %xmm0
+; X86-SSE2-NEXT:    pcmpgtb %xmm2, %xmm0
+; X86-SSE2-NEXT:    pand %xmm0, %xmm2
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm0
+; X86-SSE2-NEXT:    por %xmm2, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT:    psrld $16, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm2, %xmm1
+; X86-SSE2-NEXT:    por %xmm0, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT:    psrlw $8, %xmm0
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; X86-SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
 ; X86-SSE2-NEXT:    pand %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
 ; X86-SSE2-NEXT:    por %xmm1, %xmm2
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
-; X86-SSE2-NEXT:    psrlw $8, %xmm0
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
-; X86-SSE2-NEXT:    pand %xmm1, %xmm2
-; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
-; X86-SSE2-NEXT:    por %xmm2, %xmm1
-; X86-SSE2-NEXT:    movd %xmm1, %eax
+; X86-SSE2-NEXT:    movd %xmm2, %eax
 ; X86-SSE2-NEXT:    ## kill: def $al killed $al killed $eax
 ; X86-SSE2-NEXT:    retl
 ;

diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
index daf5437bf98a4..6ae73457f768a 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
@@ -146,25 +146,25 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 define i32 @test_reduce_v4i32(<4 x i32> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v4i32:
 ; X86-SSE2:       ## %bb.0:
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm3
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm4
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm4
-; X86-SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
-; X86-SSE2-NEXT:    pand %xmm3, %xmm0
-; X86-SSE2-NEXT:    pandn %xmm1, %xmm3
-; X86-SSE2-NEXT:    por %xmm0, %xmm3
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1]
-; X86-SSE2-NEXT:    movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm4
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm4
+; X86-SSE2-NEXT:    pcmpgtd %xmm4, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm3, %xmm1
+; X86-SSE2-NEXT:    por %xmm0, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm3
 ; X86-SSE2-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
-; X86-SSE2-NEXT:    pand %xmm1, %xmm3
-; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
-; X86-SSE2-NEXT:    por %xmm3, %xmm1
-; X86-SSE2-NEXT:    movd %xmm1, %eax
+; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
+; X86-SSE2-NEXT:    pand %xmm3, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm3
+; X86-SSE2-NEXT:    por %xmm1, %xmm3
+; X86-SSE2-NEXT:    movd %xmm3, %eax
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-SSE42-LABEL: test_reduce_v4i32:
@@ -476,28 +476,28 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
 ; X86-SSE2-NEXT:    pcmpeqd %xmm3, %xmm4
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; X86-SSE2-NEXT:    pand %xmm6, %xmm3
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; X86-SSE2-NEXT:    por %xmm3, %xmm4
-; X86-SSE2-NEXT:    pand %xmm4, %xmm0
-; X86-SSE2-NEXT:    pandn %xmm1, %xmm4
-; X86-SSE2-NEXT:    por %xmm0, %xmm4
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
-; X86-SSE2-NEXT:    movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT:    pand %xmm6, %xmm4
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
+; X86-SSE2-NEXT:    por %xmm4, %xmm3
+; X86-SSE2-NEXT:    pand %xmm3, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm3
+; X86-SSE2-NEXT:    por %xmm0, %xmm3
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
-; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
 ; X86-SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
 ; X86-SSE2-NEXT:    pand %xmm5, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
 ; X86-SSE2-NEXT:    por %xmm1, %xmm2
-; X86-SSE2-NEXT:    pand %xmm2, %xmm4
+; X86-SSE2-NEXT:    pand %xmm2, %xmm3
 ; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
-; X86-SSE2-NEXT:    por %xmm4, %xmm2
+; X86-SSE2-NEXT:    por %xmm3, %xmm2
 ; X86-SSE2-NEXT:    movd %xmm2, %eax
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
 ; X86-SSE2-NEXT:    movd %xmm0, %edx
@@ -670,31 +670,31 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v8i32:
 ; X86-SSE2:       ## %bb.0:
 ; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm3
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm4
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm4
-; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
-; X86-SSE2-NEXT:    pand %xmm4, %xmm0
-; X86-SSE2-NEXT:    pandn %xmm1, %xmm4
-; X86-SSE2-NEXT:    por %xmm0, %xmm4
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
-; X86-SSE2-NEXT:    movdqa %xmm4, %xmm1
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm3
-; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm1
-; X86-SSE2-NEXT:    pand %xmm1, %xmm4
-; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
-; X86-SSE2-NEXT:    por %xmm4, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
+; X86-SSE2-NEXT:    pand %xmm3, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm3
+; X86-SSE2-NEXT:    por %xmm0, %xmm3
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm0
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm4
+; X86-SSE2-NEXT:    pcmpgtd %xmm4, %xmm0
+; X86-SSE2-NEXT:    pand %xmm0, %xmm3
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm0
+; X86-SSE2-NEXT:    por %xmm3, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm3
-; X86-SSE2-NEXT:    pxor %xmm0, %xmm2
+; X86-SSE2-NEXT:    pxor %xmm1, %xmm2
 ; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
-; X86-SSE2-NEXT:    pand %xmm3, %xmm1
-; X86-SSE2-NEXT:    pandn %xmm0, %xmm3
-; X86-SSE2-NEXT:    por %xmm1, %xmm3
+; X86-SSE2-NEXT:    pand %xmm3, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm3
+; X86-SSE2-NEXT:    por %xmm0, %xmm3
 ; X86-SSE2-NEXT:    movd %xmm3, %eax
 ; X86-SSE2-NEXT:    retl
 ;
@@ -1145,32 +1145,32 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm6
 ; X86-SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; X86-SSE2-NEXT:    pand %xmm0, %xmm2
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; X86-SSE2-NEXT:    por %xmm2, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT:    pand %xmm0, %xmm7
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
+; X86-SSE2-NEXT:    por %xmm7, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm3, %xmm2
+; X86-SSE2-NEXT:    por %xmm1, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm5, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; X86-SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; X86-SSE2-NEXT:    pand %xmm0, %xmm1
-; X86-SSE2-NEXT:    pandn %xmm3, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
 ; X86-SSE2-NEXT:    por %xmm1, %xmm0
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
-; X86-SSE2-NEXT:    movdqa %xmm5, %xmm2
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm3
-; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
-; X86-SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,0,2,2]
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; X86-SSE2-NEXT:    pand %xmm1, %xmm2
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; X86-SSE2-NEXT:    por %xmm2, %xmm1
-; X86-SSE2-NEXT:    pand %xmm1, %xmm5
-; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
-; X86-SSE2-NEXT:    por %xmm5, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT:    pand %xmm0, %xmm5
+; X86-SSE2-NEXT:    pandn %xmm2, %xmm0
+; X86-SSE2-NEXT:    por %xmm5, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
-; X86-SSE2-NEXT:    pxor %xmm0, %xmm4
+; X86-SSE2-NEXT:    pxor %xmm1, %xmm4
 ; X86-SSE2-NEXT:    movdqa %xmm2, %xmm3
 ; X86-SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
 ; X86-SSE2-NEXT:    pcmpeqd %xmm2, %xmm4
@@ -1179,9 +1179,9 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X86-SSE2-NEXT:    pand %xmm2, %xmm4
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
 ; X86-SSE2-NEXT:    por %xmm4, %xmm2
-; X86-SSE2-NEXT:    pand %xmm2, %xmm1
-; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
-; X86-SSE2-NEXT:    por %xmm1, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm2
+; X86-SSE2-NEXT:    por %xmm0, %xmm2
 ; X86-SSE2-NEXT:    movd %xmm2, %eax
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
 ; X86-SSE2-NEXT:    movd %xmm0, %edx
@@ -1447,40 +1447,40 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X86-SSE2-NEXT:    pand %xmm5, %xmm0
 ; X86-SSE2-NEXT:    pandn %xmm2, %xmm5
 ; X86-SSE2-NEXT:    por %xmm0, %xmm5
-; X86-SSE2-NEXT:    movdqa %xmm3, %xmm0
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm2
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
-; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
-; X86-SSE2-NEXT:    pand %xmm2, %xmm1
-; X86-SSE2-NEXT:    pandn %xmm3, %xmm2
-; X86-SSE2-NEXT:    por %xmm1, %xmm2
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
+; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm0
+; X86-SSE2-NEXT:    pand %xmm0, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm3, %xmm0
+; X86-SSE2-NEXT:    por %xmm1, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
 ; X86-SSE2-NEXT:    movdqa %xmm5, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
-; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pand %xmm1, %xmm5
-; X86-SSE2-NEXT:    pandn %xmm2, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
 ; X86-SSE2-NEXT:    por %xmm5, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm3
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm3
-; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
-; X86-SSE2-NEXT:    pand %xmm2, %xmm1
-; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
-; X86-SSE2-NEXT:    por %xmm1, %xmm2
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
-; X86-SSE2-NEXT:    pxor %xmm0, %xmm4
-; X86-SSE2-NEXT:    pcmpgtd %xmm4, %xmm1
-; X86-SSE2-NEXT:    pand %xmm1, %xmm2
-; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
-; X86-SSE2-NEXT:    por %xmm2, %xmm1
-; X86-SSE2-NEXT:    movd %xmm1, %eax
+; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm0
+; X86-SSE2-NEXT:    pand %xmm0, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm2, %xmm0
+; X86-SSE2-NEXT:    por %xmm1, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
+; X86-SSE2-NEXT:    pxor %xmm1, %xmm4
+; X86-SSE2-NEXT:    pcmpgtd %xmm4, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm2
+; X86-SSE2-NEXT:    por %xmm0, %xmm2
+; X86-SSE2-NEXT:    movd %xmm2, %eax
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-SSE42-LABEL: test_reduce_v16i32:

diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
index 1c92fa033214e..c68ecea50a354 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
@@ -148,25 +148,25 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 define i32 @test_reduce_v4i32(<4 x i32> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v4i32:
 ; X86-SSE2:       ## %bb.0:
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm3
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm4
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm4
-; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
-; X86-SSE2-NEXT:    pand %xmm4, %xmm0
-; X86-SSE2-NEXT:    pandn %xmm1, %xmm4
-; X86-SSE2-NEXT:    por %xmm0, %xmm4
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
-; X86-SSE2-NEXT:    movdqa %xmm4, %xmm1
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
-; X86-SSE2-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
-; X86-SSE2-NEXT:    pand %xmm2, %xmm4
-; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
-; X86-SSE2-NEXT:    por %xmm4, %xmm2
-; X86-SSE2-NEXT:    movd %xmm2, %eax
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT:    pxor %xmm1, %xmm4
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm2
+; X86-SSE2-NEXT:    pxor %xmm1, %xmm2
+; X86-SSE2-NEXT:    pcmpgtd %xmm4, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm3, %xmm2
+; X86-SSE2-NEXT:    por %xmm0, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm1, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm0, %xmm1
+; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm2
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X86-SSE2-NEXT:    por %xmm2, %xmm1
+; X86-SSE2-NEXT:    movd %xmm1, %eax
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-SSE42-LABEL: test_reduce_v4i32:
@@ -418,28 +418,28 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
 ; X86-SSE2-NEXT:    pcmpeqd %xmm3, %xmm4
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; X86-SSE2-NEXT:    pand %xmm6, %xmm3
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; X86-SSE2-NEXT:    por %xmm3, %xmm4
-; X86-SSE2-NEXT:    pand %xmm4, %xmm0
-; X86-SSE2-NEXT:    pandn %xmm1, %xmm4
-; X86-SSE2-NEXT:    por %xmm0, %xmm4
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
-; X86-SSE2-NEXT:    movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT:    pand %xmm6, %xmm4
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
+; X86-SSE2-NEXT:    por %xmm4, %xmm3
+; X86-SSE2-NEXT:    pand %xmm3, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm3
+; X86-SSE2-NEXT:    por %xmm0, %xmm3
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm3
-; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm4
+; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
 ; X86-SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
 ; X86-SSE2-NEXT:    pand %xmm5, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
 ; X86-SSE2-NEXT:    por %xmm1, %xmm2
-; X86-SSE2-NEXT:    pand %xmm2, %xmm4
+; X86-SSE2-NEXT:    pand %xmm2, %xmm3
 ; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
-; X86-SSE2-NEXT:    por %xmm4, %xmm2
+; X86-SSE2-NEXT:    por %xmm3, %xmm2
 ; X86-SSE2-NEXT:    movd %xmm2, %eax
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
 ; X86-SSE2-NEXT:    movd %xmm0, %edx
@@ -447,23 +447,23 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ;
 ; X86-SSE42-LABEL: test_reduce_v4i64:
 ; X86-SSE42:       ## %bb.0:
-; X86-SSE42-NEXT:    movdqa %xmm0, %xmm2
-; X86-SSE42-NEXT:    movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm3
+; X86-SSE42-NEXT:    movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
 ; X86-SSE42-NEXT:    movdqa %xmm0, %xmm4
-; X86-SSE42-NEXT:    pxor %xmm3, %xmm4
+; X86-SSE42-NEXT:    pxor %xmm2, %xmm4
 ; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT:    pxor %xmm3, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm2, %xmm0
 ; X86-SSE42-NEXT:    pcmpgtq %xmm4, %xmm0
-; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
 ; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT:    pxor %xmm3, %xmm0
-; X86-SSE42-NEXT:    pxor %xmm2, %xmm3
-; X86-SSE42-NEXT:    pcmpgtq %xmm0, %xmm3
-; X86-SSE42-NEXT:    movdqa %xmm3, %xmm0
-; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
-; X86-SSE42-NEXT:    movd %xmm2, %eax
-; X86-SSE42-NEXT:    pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT:    pxor %xmm2, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm3, %xmm2
+; X86-SSE42-NEXT:    pcmpgtq %xmm0, %xmm2
+; X86-SSE42-NEXT:    movdqa %xmm2, %xmm0
+; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
+; X86-SSE42-NEXT:    movd %xmm3, %eax
+; X86-SSE42-NEXT:    pextrd $1, %xmm3, %edx
 ; X86-SSE42-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: test_reduce_v4i64:
@@ -616,31 +616,31 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v8i32:
 ; X86-SSE2:       ## %bb.0:
 ; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm4
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm3
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
+; X86-SSE2-NEXT:    pand %xmm3, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm3
+; X86-SSE2-NEXT:    por %xmm0, %xmm3
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm4
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm4
-; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
-; X86-SSE2-NEXT:    pand %xmm4, %xmm0
-; X86-SSE2-NEXT:    pandn %xmm1, %xmm4
-; X86-SSE2-NEXT:    por %xmm0, %xmm4
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
-; X86-SSE2-NEXT:    movdqa %xmm4, %xmm1
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
+; X86-SSE2-NEXT:    pcmpgtd %xmm4, %xmm0
+; X86-SSE2-NEXT:    pand %xmm0, %xmm3
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm0
+; X86-SSE2-NEXT:    por %xmm3, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm3
-; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
-; X86-SSE2-NEXT:    pand %xmm3, %xmm4
-; X86-SSE2-NEXT:    pandn %xmm0, %xmm3
-; X86-SSE2-NEXT:    por %xmm4, %xmm3
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1]
-; X86-SSE2-NEXT:    movdqa %xmm3, %xmm1
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
-; X86-SSE2-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
-; X86-SSE2-NEXT:    pand %xmm2, %xmm3
-; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
-; X86-SSE2-NEXT:    por %xmm3, %xmm2
+; X86-SSE2-NEXT:    pxor %xmm1, %xmm2
+; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm2
+; X86-SSE2-NEXT:    por %xmm0, %xmm2
 ; X86-SSE2-NEXT:    movd %xmm2, %eax
 ; X86-SSE2-NEXT:    retl
 ;
@@ -1103,32 +1103,32 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ;
 ; X86-SSE42-LABEL: test_reduce_v8i64:
 ; X86-SSE42:       ## %bb.0:
-; X86-SSE42-NEXT:    movdqa %xmm0, %xmm5
-; X86-SSE42-NEXT:    movdqa {{.*#+}} xmm4 = [0,2147483648,0,2147483648]
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm4
+; X86-SSE42-NEXT:    movdqa {{.*#+}} xmm5 = [0,2147483648,0,2147483648]
 ; X86-SSE42-NEXT:    movdqa %xmm1, %xmm6
-; X86-SSE42-NEXT:    pxor %xmm4, %xmm6
+; X86-SSE42-NEXT:    pxor %xmm5, %xmm6
 ; X86-SSE42-NEXT:    movdqa %xmm3, %xmm0
-; X86-SSE42-NEXT:    pxor %xmm4, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm5, %xmm0
 ; X86-SSE42-NEXT:    pcmpgtq %xmm6, %xmm0
 ; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
-; X86-SSE42-NEXT:    movdqa %xmm5, %xmm1
-; X86-SSE42-NEXT:    pxor %xmm4, %xmm1
+; X86-SSE42-NEXT:    movdqa %xmm4, %xmm1
+; X86-SSE42-NEXT:    pxor %xmm5, %xmm1
 ; X86-SSE42-NEXT:    movdqa %xmm2, %xmm0
-; X86-SSE42-NEXT:    pxor %xmm4, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm5, %xmm0
 ; X86-SSE42-NEXT:    pcmpgtq %xmm1, %xmm0
-; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm5, %xmm2
+; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
 ; X86-SSE42-NEXT:    movapd %xmm2, %xmm1
-; X86-SSE42-NEXT:    xorpd %xmm4, %xmm1
+; X86-SSE42-NEXT:    xorpd %xmm5, %xmm1
 ; X86-SSE42-NEXT:    movapd %xmm3, %xmm0
-; X86-SSE42-NEXT:    xorpd %xmm4, %xmm0
+; X86-SSE42-NEXT:    xorpd %xmm5, %xmm0
 ; X86-SSE42-NEXT:    pcmpgtq %xmm1, %xmm0
 ; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
 ; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
 ; X86-SSE42-NEXT:    movdqa %xmm3, %xmm0
-; X86-SSE42-NEXT:    pxor %xmm4, %xmm0
-; X86-SSE42-NEXT:    pxor %xmm1, %xmm4
-; X86-SSE42-NEXT:    pcmpgtq %xmm0, %xmm4
-; X86-SSE42-NEXT:    movdqa %xmm4, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm5, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm5
+; X86-SSE42-NEXT:    pcmpgtq %xmm0, %xmm5
+; X86-SSE42-NEXT:    movdqa %xmm5, %xmm0
 ; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
 ; X86-SSE42-NEXT:    movd %xmm1, %eax
 ; X86-SSE42-NEXT:    pextrd $1, %xmm1, %edx
@@ -1136,26 +1136,26 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ;
 ; X86-AVX1-LABEL: test_reduce_v8i64:
 ; X86-AVX1:       ## %bb.0:
-; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; X86-AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; X86-AVX1-NEXT:    ## xmm3 = mem[0,0]
-; X86-AVX1-NEXT:    vxorps %xmm3, %xmm2, %xmm4
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X86-AVX1-NEXT:    ## xmm2 = mem[0,0]
+; X86-AVX1-NEXT:    vxorps %xmm2, %xmm3, %xmm4
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
-; X86-AVX1-NEXT:    vxorps %xmm3, %xmm5, %xmm6
+; X86-AVX1-NEXT:    vxorps %xmm2, %xmm5, %xmm6
 ; X86-AVX1-NEXT:    vpcmpgtq %xmm4, %xmm6, %xmm4
-; X86-AVX1-NEXT:    vxorps %xmm3, %xmm0, %xmm6
-; X86-AVX1-NEXT:    vxorps %xmm3, %xmm1, %xmm7
+; X86-AVX1-NEXT:    vxorps %xmm2, %xmm0, %xmm6
+; X86-AVX1-NEXT:    vxorps %xmm2, %xmm1, %xmm7
 ; X86-AVX1-NEXT:    vpcmpgtq %xmm6, %xmm7, %xmm6
 ; X86-AVX1-NEXT:    vblendvpd %xmm6, %xmm0, %xmm1, %xmm0
-; X86-AVX1-NEXT:    vxorpd %xmm3, %xmm0, %xmm1
-; X86-AVX1-NEXT:    vblendvpd %xmm4, %xmm2, %xmm5, %xmm2
-; X86-AVX1-NEXT:    vxorpd %xmm3, %xmm2, %xmm4
+; X86-AVX1-NEXT:    vxorpd %xmm2, %xmm0, %xmm1
+; X86-AVX1-NEXT:    vblendvpd %xmm4, %xmm3, %xmm5, %xmm3
+; X86-AVX1-NEXT:    vxorpd %xmm2, %xmm3, %xmm4
 ; X86-AVX1-NEXT:    vpcmpgtq %xmm1, %xmm4, %xmm1
-; X86-AVX1-NEXT:    vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
+; X86-AVX1-NEXT:    vblendvpd %xmm1, %xmm0, %xmm3, %xmm0
 ; X86-AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X86-AVX1-NEXT:    vxorpd %xmm3, %xmm0, %xmm2
-; X86-AVX1-NEXT:    vxorpd %xmm3, %xmm1, %xmm3
-; X86-AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT:    vxorpd %xmm2, %xmm0, %xmm3
+; X86-AVX1-NEXT:    vxorpd %xmm2, %xmm1, %xmm2
+; X86-AVX1-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
 ; X86-AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-AVX1-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX1-NEXT:    vpextrd $1, %xmm0, %edx
@@ -1357,47 +1357,47 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v16i32:
 ; X86-SSE2:       ## %bb.0:
 ; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm5
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm5
-; X86-SSE2-NEXT:    movdqa %xmm3, %xmm6
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm6
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm6
-; X86-SSE2-NEXT:    pcmpgtd %xmm5, %xmm6
-; X86-SSE2-NEXT:    pand %xmm6, %xmm1
-; X86-SSE2-NEXT:    pandn %xmm3, %xmm6
-; X86-SSE2-NEXT:    por %xmm1, %xmm6
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm5
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm5
+; X86-SSE2-NEXT:    pcmpgtd %xmm6, %xmm5
+; X86-SSE2-NEXT:    pand %xmm5, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm3, %xmm5
+; X86-SSE2-NEXT:    por %xmm1, %xmm5
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm3
-; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
-; X86-SSE2-NEXT:    pand %xmm3, %xmm0
-; X86-SSE2-NEXT:    pandn %xmm2, %xmm3
-; X86-SSE2-NEXT:    por %xmm0, %xmm3
-; X86-SSE2-NEXT:    movdqa %xmm3, %xmm0
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X86-SSE2-NEXT:    movdqa %xmm6, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
-; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
-; X86-SSE2-NEXT:    pand %xmm1, %xmm3
-; X86-SSE2-NEXT:    pandn %xmm6, %xmm1
-; X86-SSE2-NEXT:    por %xmm3, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm2, %xmm1
+; X86-SSE2-NEXT:    por %xmm0, %xmm1
 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm5, %xmm0
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
+; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm0
+; X86-SSE2-NEXT:    pand %xmm0, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm5, %xmm0
+; X86-SSE2-NEXT:    por %xmm1, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm3
-; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
-; X86-SSE2-NEXT:    pand %xmm3, %xmm1
-; X86-SSE2-NEXT:    pandn %xmm0, %xmm3
-; X86-SSE2-NEXT:    por %xmm1, %xmm3
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1]
-; X86-SSE2-NEXT:    movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm2, %xmm1
+; X86-SSE2-NEXT:    por %xmm0, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
 ; X86-SSE2-NEXT:    pxor %xmm0, %xmm4
-; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
-; X86-SSE2-NEXT:    pand %xmm4, %xmm3
+; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
+; X86-SSE2-NEXT:    pand %xmm4, %xmm1
 ; X86-SSE2-NEXT:    pandn %xmm0, %xmm4
-; X86-SSE2-NEXT:    por %xmm3, %xmm4
+; X86-SSE2-NEXT:    por %xmm1, %xmm4
 ; X86-SSE2-NEXT:    movd %xmm4, %eax
 ; X86-SSE2-NEXT:    retl
 ;

diff --git a/llvm/test/CodeGen/X86/i128-mul.ll b/llvm/test/CodeGen/X86/i128-mul.ll
index 1142c31b69ec9..3b6d571cddf04 100644
--- a/llvm/test/CodeGen/X86/i128-mul.ll
+++ b/llvm/test/CodeGen/X86/i128-mul.ll
@@ -13,11 +13,11 @@ define i64 @foo(i64 %x, i64 %y) nounwind {
 ; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI-NEXT:    movl %ecx, %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl %esi, %eax
 ; X86-NOBMI-NEXT:    mull %ebx
 ; X86-NOBMI-NEXT:    movl %edx, %edi
 ; X86-NOBMI-NEXT:    movl %ebp, %eax
@@ -26,16 +26,16 @@ define i64 @foo(i64 %x, i64 %y) nounwind {
 ; X86-NOBMI-NEXT:    movl %eax, %ebp
 ; X86-NOBMI-NEXT:    addl %edi, %ebp
 ; X86-NOBMI-NEXT:    adcl $0, %ebx
-; X86-NOBMI-NEXT:    movl %ecx, %eax
-; X86-NOBMI-NEXT:    mull %esi
-; X86-NOBMI-NEXT:    movl %edx, %ecx
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    mull %ecx
+; X86-NOBMI-NEXT:    movl %edx, %esi
 ; X86-NOBMI-NEXT:    addl %ebp, %eax
-; X86-NOBMI-NEXT:    adcl %ebx, %ecx
+; X86-NOBMI-NEXT:    adcl %ebx, %esi
 ; X86-NOBMI-NEXT:    setb %al
 ; X86-NOBMI-NEXT:    movzbl %al, %edi
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    mull %esi
-; X86-NOBMI-NEXT:    addl %ecx, %eax
+; X86-NOBMI-NEXT:    mull %ecx
+; X86-NOBMI-NEXT:    addl %esi, %eax
 ; X86-NOBMI-NEXT:    adcl %edi, %edx
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
@@ -49,26 +49,26 @@ define i64 @foo(i64 %x, i64 %y) nounwind {
 ; X86-BMI-NEXT:    pushl %ebx
 ; X86-BMI-NEXT:    pushl %edi
 ; X86-BMI-NEXT:    pushl %esi
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-BMI-NEXT:    movl %eax, %edx
-; X86-BMI-NEXT:    mulxl %esi, %ebx, %ebx
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI-NEXT:    movl %ecx, %edx
-; X86-BMI-NEXT:    mulxl %esi, %esi, %ebp
-; X86-BMI-NEXT:    addl %ebx, %esi
-; X86-BMI-NEXT:    adcl $0, %ebp
+; X86-BMI-NEXT:    mulxl %edi, %ebx, %ebx
 ; X86-BMI-NEXT:    movl %eax, %edx
-; X86-BMI-NEXT:    mulxl %edi, %eax, %ebx
-; X86-BMI-NEXT:    addl %esi, %eax
-; X86-BMI-NEXT:    adcl %ebp, %ebx
-; X86-BMI-NEXT:    setb %al
-; X86-BMI-NEXT:    movzbl %al, %esi
+; X86-BMI-NEXT:    mulxl %edi, %edi, %ebp
+; X86-BMI-NEXT:    addl %ebx, %edi
+; X86-BMI-NEXT:    adcl $0, %ebp
 ; X86-BMI-NEXT:    movl %ecx, %edx
-; X86-BMI-NEXT:    mulxl %edi, %eax, %edx
+; X86-BMI-NEXT:    mulxl %esi, %ecx, %ebx
+; X86-BMI-NEXT:    addl %edi, %ecx
+; X86-BMI-NEXT:    adcl %ebp, %ebx
+; X86-BMI-NEXT:    setb %cl
+; X86-BMI-NEXT:    movzbl %cl, %ecx
+; X86-BMI-NEXT:    movl %eax, %edx
+; X86-BMI-NEXT:    mulxl %esi, %eax, %edx
 ; X86-BMI-NEXT:    addl %ebx, %eax
-; X86-BMI-NEXT:    adcl %esi, %edx
+; X86-BMI-NEXT:    adcl %ecx, %edx
 ; X86-BMI-NEXT:    popl %esi
 ; X86-BMI-NEXT:    popl %edi
 ; X86-BMI-NEXT:    popl %ebx
@@ -114,7 +114,7 @@ define i64 @mul1(i64 %n, i64* nocapture %z, i64* nocapture %x, i64 %y) nounwind
 ; X86-NOBMI-NEXT:  # %bb.1: # %for.body.preheader
 ; X86-NOBMI-NEXT:    xorl %eax, %eax
 ; X86-NOBMI-NEXT:    xorl %edx, %edx
-; X86-NOBMI-NEXT:    xorl %ebp, %ebp
+; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    movl $0, (%esp) # 4-byte Folded Spill
 ; X86-NOBMI-NEXT:    .p2align 4, 0x90
 ; X86-NOBMI-NEXT:  .LBB1_2: # %for.body
@@ -122,49 +122,51 @@ define i64 @mul1(i64 %n, i64* nocapture %z, i64* nocapture %x, i64 %y) nounwind
 ; X86-NOBMI-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOBMI-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl (%eax,%ebp,8), %esi
-; X86-NOBMI-NEXT:    movl 4(%eax,%ebp,8), %ecx
-; X86-NOBMI-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOBMI-NEXT:    movl (%eax,%ecx,8), %esi
+; X86-NOBMI-NEXT:    movl 4(%eax,%ecx,8), %ebx
+; X86-NOBMI-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOBMI-NEXT:    movl %esi, %eax
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NOBMI-NEXT:    mull %edi
 ; X86-NOBMI-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOBMI-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOBMI-NEXT:    movl %ecx, %eax
+; X86-NOBMI-NEXT:    movl %ebx, %eax
 ; X86-NOBMI-NEXT:    mull %edi
-; X86-NOBMI-NEXT:    movl %edx, %ecx
+; X86-NOBMI-NEXT:    movl %edx, %ebp
 ; X86-NOBMI-NEXT:    movl %eax, %ebx
 ; X86-NOBMI-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NOBMI-NEXT:    adcl $0, %ecx
+; X86-NOBMI-NEXT:    adcl $0, %ebp
 ; X86-NOBMI-NEXT:    movl %esi, %eax
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    mull %edx
-; X86-NOBMI-NEXT:    movl %edx, %esi
-; X86-NOBMI-NEXT:    movl %eax, %edi
-; X86-NOBMI-NEXT:    addl %ebx, %edi
-; X86-NOBMI-NEXT:    adcl %ecx, %esi
+; X86-NOBMI-NEXT:    movl %edx, %edi
+; X86-NOBMI-NEXT:    movl %eax, %esi
+; X86-NOBMI-NEXT:    addl %ebx, %esi
+; X86-NOBMI-NEXT:    adcl %ebp, %edi
 ; X86-NOBMI-NEXT:    setb %bl
 ; X86-NOBMI-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NOBMI-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NOBMI-NEXT:    addl %esi, %eax
-; X86-NOBMI-NEXT:    movzbl %bl, %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NOBMI-NEXT:    adcl %esi, %edx
+; X86-NOBMI-NEXT:    addl %edi, %eax
+; X86-NOBMI-NEXT:    movzbl %bl, %edi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NOBMI-NEXT:    adcl %edi, %edx
+; X86-NOBMI-NEXT:    movl %ecx, %ebx
 ; X86-NOBMI-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NOBMI-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NOBMI-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NOBMI-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NOBMI-NEXT:    adcl $0, %eax
 ; X86-NOBMI-NEXT:    adcl $0, %edx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI-NEXT:    movl %ecx, (%esi,%ebp,8)
-; X86-NOBMI-NEXT:    movl %edi, 4(%esi,%ebp,8)
-; X86-NOBMI-NEXT:    addl $1, %ebp
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NOBMI-NEXT:    movl %ecx, (%edi,%ebx,8)
+; X86-NOBMI-NEXT:    movl %ebx, %ecx
+; X86-NOBMI-NEXT:    movl %esi, 4(%edi,%ebx,8)
+; X86-NOBMI-NEXT:    addl $1, %ecx
 ; X86-NOBMI-NEXT:    movl (%esp), %edi # 4-byte Reload
 ; X86-NOBMI-NEXT:    adcl $0, %edi
-; X86-NOBMI-NEXT:    movl %ebp, %esi
+; X86-NOBMI-NEXT:    movl %ecx, %esi
 ; X86-NOBMI-NEXT:    xorl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X86-NOBMI-NEXT:    xorl %ebx, %edi
+; X86-NOBMI-NEXT:    xorl %ebp, %edi
 ; X86-NOBMI-NEXT:    orl %esi, %edi
 ; X86-NOBMI-NEXT:    jne .LBB1_2
 ; X86-NOBMI-NEXT:  .LBB1_3: # %for.end
@@ -183,64 +185,71 @@ define i64 @mul1(i64 %n, i64* nocapture %z, i64* nocapture %x, i64 %y) nounwind
 ; X86-BMI-NEXT:    pushl %ebx
 ; X86-BMI-NEXT:    pushl %edi
 ; X86-BMI-NEXT:    pushl %esi
-; X86-BMI-NEXT:    subl $16, %esp
+; X86-BMI-NEXT:    subl $20, %esp
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI-NEXT:    orl %ecx, %eax
 ; X86-BMI-NEXT:    je .LBB1_3
 ; X86-BMI-NEXT:  # %bb.1: # %for.body.preheader
 ; X86-BMI-NEXT:    xorl %ecx, %ecx
-; X86-BMI-NEXT:    xorl %edx, %edx
+; X86-BMI-NEXT:    xorl %eax, %eax
 ; X86-BMI-NEXT:    xorl %ebx, %ebx
 ; X86-BMI-NEXT:    xorl %ebp, %ebp
 ; X86-BMI-NEXT:    .p2align 4, 0x90
 ; X86-BMI-NEXT:  .LBB1_2: # %for.body
 ; X86-BMI-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-BMI-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-BMI-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI-NEXT:    movl (%ecx,%ebx,8), %eax
-; X86-BMI-NEXT:    movl 4(%ecx,%ebx,8), %esi
-; X86-BMI-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-BMI-NEXT:    movl %eax, %edx
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI-NEXT:    mulxl %ecx, %edx, %edi
+; X86-BMI-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-BMI-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT:    movl (%eax,%ebx,8), %ecx
+; X86-BMI-NEXT:    movl 4(%eax,%ebx,8), %esi
+; X86-BMI-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-BMI-NEXT:    movl %ecx, %edx
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT:    mulxl %eax, %edx, %edi
 ; X86-BMI-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-BMI-NEXT:    movl %esi, %edx
-; X86-BMI-NEXT:    mulxl %ecx, %esi, %ecx
-; X86-BMI-NEXT:    addl %edi, %esi
-; X86-BMI-NEXT:    adcl $0, %ecx
+; X86-BMI-NEXT:    mulxl %eax, %eax, %esi
+; X86-BMI-NEXT:    addl %edi, %eax
+; X86-BMI-NEXT:    adcl $0, %esi
+; X86-BMI-NEXT:    movl %ecx, %edx
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    mulxl %ecx, %edi, %ebp
+; X86-BMI-NEXT:    addl %eax, %edi
+; X86-BMI-NEXT:    adcl %esi, %ebp
+; X86-BMI-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-BMI-NEXT:    mulxl %ecx, %ecx, %eax
+; X86-BMI-NEXT:    setb %dl
+; X86-BMI-NEXT:    addl %ebp, %ecx
+; X86-BMI-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT:    movzbl %dl, %edx
+; X86-BMI-NEXT:    adcl %edx, %eax
 ; X86-BMI-NEXT:    movl %eax, %edx
-; X86-BMI-NEXT:    mulxl {{[0-9]+}}(%esp), %edi, %eax
-; X86-BMI-NEXT:    addl %esi, %edi
-; X86-BMI-NEXT:    adcl %ecx, %eax
-; X86-BMI-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X86-BMI-NEXT:    mulxl {{[0-9]+}}(%esp), %ecx, %edx
-; X86-BMI-NEXT:    setb (%esp) # 1-byte Folded Spill
-; X86-BMI-NEXT:    addl %eax, %ecx
-; X86-BMI-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
-; X86-BMI-NEXT:    adcl %eax, %edx
-; X86-BMI-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-BMI-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-BMI-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-BMI-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-BMI-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-BMI-NEXT:    adcl (%esp), %edi # 4-byte Folded Reload
 ; X86-BMI-NEXT:    adcl $0, %ecx
 ; X86-BMI-NEXT:    adcl $0, %edx
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI-NEXT:    movl %esi, (%eax,%ebx,8)
-; X86-BMI-NEXT:    movl %edi, 4(%eax,%ebx,8)
+; X86-BMI-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT:    movl %eax, (%edx,%ebx,8)
+; X86-BMI-NEXT:    movl %edi, 4(%edx,%ebx,8)
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-BMI-NEXT:    addl $1, %ebx
 ; X86-BMI-NEXT:    adcl $0, %ebp
-; X86-BMI-NEXT:    movl %ebx, %eax
-; X86-BMI-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT:    movl %ebx, %edx
+; X86-BMI-NEXT:    xorl %esi, %edx
 ; X86-BMI-NEXT:    movl %ebp, %esi
 ; X86-BMI-NEXT:    xorl %edi, %esi
-; X86-BMI-NEXT:    orl %eax, %esi
+; X86-BMI-NEXT:    orl %edx, %esi
+; X86-BMI-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-BMI-NEXT:    jne .LBB1_2
 ; X86-BMI-NEXT:  .LBB1_3: # %for.end
 ; X86-BMI-NEXT:    xorl %eax, %eax
 ; X86-BMI-NEXT:    xorl %edx, %edx
-; X86-BMI-NEXT:    addl $16, %esp
+; X86-BMI-NEXT:    addl $20, %esp
 ; X86-BMI-NEXT:    popl %esi
 ; X86-BMI-NEXT:    popl %edi
 ; X86-BMI-NEXT:    popl %ebx

diff --git a/llvm/test/CodeGen/X86/i128-sdiv.ll b/llvm/test/CodeGen/X86/i128-sdiv.ll
index f583af498ba65..5e0c79a229794 100644
--- a/llvm/test/CodeGen/X86/i128-sdiv.ll
+++ b/llvm/test/CodeGen/X86/i128-sdiv.ll
@@ -12,23 +12,23 @@ define i128 @test1(i128 %x) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    shrl $30, %esi
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    sarl $31, %esi
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    shrl $30, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    addl %edx, %edi
-; X86-NEXT:    adcl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    addl %esi, %edi
 ; X86-NEXT:    adcl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    adcl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    shrdl $2, %ecx, %esi
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    sarl $2, %edx
+; X86-NEXT:    shrdl $2, %ecx, %edx
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    sarl $2, %esi
 ; X86-NEXT:    sarl $31, %ecx
 ; X86-NEXT:    movl %ecx, 12(%eax)
 ; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl %esi, (%eax)
+; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    movl %edx, (%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
@@ -56,30 +56,30 @@ define i128 @test2(i128 %x) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    sarl $31, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    shrl $30, %esi
+; X86-NEXT:    sarl $31, %esi
+; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    shrl $30, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    addl %edx, %edi
-; X86-NEXT:    adcl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    addl %esi, %edi
 ; X86-NEXT:    adcl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    shrdl $2, %ecx, %esi
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    sarl $2, %ecx
+; X86-NEXT:    adcl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    shrdl $2, %edx, %ecx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    sarl $31, %esi
+; X86-NEXT:    sarl $2, %edx
 ; X86-NEXT:    xorl %edi, %edi
-; X86-NEXT:    negl %esi
+; X86-NEXT:    negl %ecx
 ; X86-NEXT:    movl $0, %ebx
-; X86-NEXT:    sbbl %ecx, %ebx
-; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    sbbl %edx, %ecx
-; X86-NEXT:    sbbl %edx, %edi
-; X86-NEXT:    movl %esi, (%eax)
+; X86-NEXT:    sbbl %edx, %ebx
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    sbbl %esi, %edx
+; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    movl %ebx, 4(%eax)
-; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    movl %edx, 8(%eax)
 ; X86-NEXT:    movl %edi, 12(%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi

diff --git a/llvm/test/CodeGen/X86/i256-add.ll b/llvm/test/CodeGen/X86/i256-add.ll
index 23973bca7d562..3118aae568220 100644
--- a/llvm/test/CodeGen/X86/i256-add.ll
+++ b/llvm/test/CodeGen/X86/i256-add.ll
@@ -10,20 +10,20 @@ define void @add(i256* %p, i256* %q) nounwind {
 ; X32-NEXT:    pushl %edi
 ; X32-NEXT:    pushl %esi
 ; X32-NEXT:    subl $8, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl 28(%ecx), %eax
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl 24(%ecx), %eax
+; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    movl 20(%ecx), %esi
+; X32-NEXT:    movl 16(%ecx), %edi
+; X32-NEXT:    movl 12(%ecx), %ebx
+; X32-NEXT:    movl 8(%ecx), %ebp
+; X32-NEXT:    movl (%ecx), %edx
+; X32-NEXT:    movl 4(%ecx), %ecx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 28(%eax), %ecx
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl 24(%eax), %ecx
-; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X32-NEXT:    movl 20(%eax), %esi
-; X32-NEXT:    movl 16(%eax), %edi
-; X32-NEXT:    movl 12(%eax), %ebx
-; X32-NEXT:    movl 8(%eax), %ebp
-; X32-NEXT:    movl (%eax), %ecx
-; X32-NEXT:    movl 4(%eax), %edx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    addl %ecx, (%eax)
-; X32-NEXT:    adcl %edx, 4(%eax)
+; X32-NEXT:    addl %edx, (%eax)
+; X32-NEXT:    adcl %ecx, 4(%eax)
 ; X32-NEXT:    adcl %ebp, 8(%eax)
 ; X32-NEXT:    adcl %ebx, 12(%eax)
 ; X32-NEXT:    adcl %edi, 16(%eax)
@@ -64,20 +64,20 @@ define void @sub(i256* %p, i256* %q) nounwind {
 ; X32-NEXT:    pushl %edi
 ; X32-NEXT:    pushl %esi
 ; X32-NEXT:    subl $8, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl 28(%ecx), %eax
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl 24(%ecx), %eax
+; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    movl 20(%ecx), %esi
+; X32-NEXT:    movl 16(%ecx), %edi
+; X32-NEXT:    movl 12(%ecx), %ebx
+; X32-NEXT:    movl 8(%ecx), %ebp
+; X32-NEXT:    movl (%ecx), %edx
+; X32-NEXT:    movl 4(%ecx), %ecx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 28(%eax), %ecx
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl 24(%eax), %ecx
-; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X32-NEXT:    movl 20(%eax), %esi
-; X32-NEXT:    movl 16(%eax), %edi
-; X32-NEXT:    movl 12(%eax), %ebx
-; X32-NEXT:    movl 8(%eax), %ebp
-; X32-NEXT:    movl (%eax), %ecx
-; X32-NEXT:    movl 4(%eax), %edx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    subl %ecx, (%eax)
-; X32-NEXT:    sbbl %edx, 4(%eax)
+; X32-NEXT:    subl %edx, (%eax)
+; X32-NEXT:    sbbl %ecx, 4(%eax)
 ; X32-NEXT:    sbbl %ebp, 8(%eax)
 ; X32-NEXT:    sbbl %ebx, 12(%eax)
 ; X32-NEXT:    sbbl %edi, 16(%eax)

diff --git a/llvm/test/CodeGen/X86/i64-to-float.ll b/llvm/test/CodeGen/X86/i64-to-float.ll
index 21d3e3a5589b8..bedc27e6d6491 100644
--- a/llvm/test/CodeGen/X86/i64-to-float.ll
+++ b/llvm/test/CodeGen/X86/i64-to-float.ll
@@ -265,26 +265,26 @@ define <2 x double> @clamp_sitofp_2i64_2f64(<2 x i64> %a) nounwind {
 ; X86-SSE-NEXT:    pcmpgtd %xmm3, %xmm4
 ; X86-SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
 ; X86-SSE-NEXT:    pcmpeqd %xmm3, %xmm2
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; X86-SSE-NEXT:    pand %xmm5, %xmm2
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; X86-SSE-NEXT:    por %xmm2, %xmm3
-; X86-SSE-NEXT:    pand %xmm3, %xmm0
-; X86-SSE-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
-; X86-SSE-NEXT:    por %xmm0, %xmm3
-; X86-SSE-NEXT:    pxor %xmm3, %xmm1
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; X86-SSE-NEXT:    pand %xmm5, %xmm3
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; X86-SSE-NEXT:    por %xmm3, %xmm2
+; X86-SSE-NEXT:    pand %xmm2, %xmm0
+; X86-SSE-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
+; X86-SSE-NEXT:    por %xmm0, %xmm2
+; X86-SSE-NEXT:    pxor %xmm2, %xmm1
 ; X86-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [2147483903,0,2147483903,0]
-; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
-; X86-SSE-NEXT:    pcmpgtd %xmm1, %xmm2
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; X86-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X86-SSE-NEXT:    pcmpgtd %xmm1, %xmm3
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
 ; X86-SSE-NEXT:    pcmpeqd %xmm0, %xmm1
 ; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
 ; X86-SSE-NEXT:    pand %xmm4, %xmm0
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
 ; X86-SSE-NEXT:    por %xmm0, %xmm1
-; X86-SSE-NEXT:    pand %xmm1, %xmm3
+; X86-SSE-NEXT:    pand %xmm1, %xmm2
 ; X86-SSE-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE-NEXT:    por %xmm3, %xmm1
+; X86-SSE-NEXT:    por %xmm2, %xmm1
 ; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; X86-SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
 ; X86-SSE-NEXT:    retl

diff --git a/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll b/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll
index 24462d4e11c01..e86ac7045491a 100644
--- a/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll
+++ b/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll
@@ -34,15 +34,15 @@ define void @i24_or(i24* %a) {
 define void @i24_and_or(i24* %a) {
 ; X86-LABEL: i24_and_or:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %edx
-; X86-NEXT:    movzbl 2(%ecx), %eax
-; X86-NEXT:    movb %al, 2(%ecx)
-; X86-NEXT:    shll $16, %eax
-; X86-NEXT:    orl %edx, %eax
-; X86-NEXT:    orl $384, %eax # imm = 0x180
-; X86-NEXT:    andl $16777088, %eax # imm = 0xFFFF80
-; X86-NEXT:    movw %ax, (%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %edx
+; X86-NEXT:    movzbl 2(%eax), %ecx
+; X86-NEXT:    movb %cl, 2(%eax)
+; X86-NEXT:    shll $16, %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    orl $384, %ecx # imm = 0x180
+; X86-NEXT:    andl $16777088, %ecx # imm = 0xFFFF80
+; X86-NEXT:    movw %cx, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: i24_and_or:
@@ -66,21 +66,21 @@ define void @i24_and_or(i24* %a) {
 define void @i24_insert_bit(i24* %a, i1 zeroext %bit) {
 ; X86-LABEL: i24_insert_bit:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %esi, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movzwl (%ecx), %esi
-; X86-NEXT:    movzbl 2(%ecx), %eax
-; X86-NEXT:    movb %al, 2(%ecx)
-; X86-NEXT:    shll $16, %eax
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    shll $13, %edx
-; X86-NEXT:    andl $16769023, %eax # imm = 0xFFDFFF
-; X86-NEXT:    orl %edx, %eax
-; X86-NEXT:    movw %ax, (%ecx)
-; X86-NEXT:    popl %esi
+; X86-NEXT:    .cfi_offset %ebx, -8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzwl (%eax), %edx
+; X86-NEXT:    movzbl 2(%eax), %ebx
+; X86-NEXT:    movb %bl, 2(%eax)
+; X86-NEXT:    shll $16, %ebx
+; X86-NEXT:    orl %edx, %ebx
+; X86-NEXT:    shll $13, %ecx
+; X86-NEXT:    andl $16769023, %ebx # imm = 0xFFDFFF
+; X86-NEXT:    orl %ecx, %ebx
+; X86-NEXT:    movw %bx, (%eax)
+; X86-NEXT:    popl %ebx
 ; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
 ;

diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll
index e18819ae11f21..acd0c4eba7723 100644
--- a/llvm/test/CodeGen/X86/known-signbits-vector.ll
+++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll
@@ -429,8 +429,8 @@ define <4 x float> @signbits_ashr_sext_select_shuffle_sitofp(<4 x i64> %a0, <4 x
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    vpmovsxdq 8(%ebp), %xmm3
-; X86-NEXT:    vpmovsxdq 16(%ebp), %xmm4
+; X86-NEXT:    vpmovsxdq 8(%ebp), %xmm4
+; X86-NEXT:    vpmovsxdq 16(%ebp), %xmm3
 ; X86-NEXT:    vpsrad $31, %xmm2, %xmm5
 ; X86-NEXT:    vpsrad $1, %xmm2, %xmm6
 ; X86-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
@@ -441,12 +441,12 @@ define <4 x float> @signbits_ashr_sext_select_shuffle_sitofp(<4 x i64> %a0, <4 x
 ; X86-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; X86-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3],xmm2[4,5],xmm6[6,7]
 ; X86-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm6
-; X86-NEXT:    vblendvpd %xmm6, %xmm5, %xmm3, %xmm3
+; X86-NEXT:    vblendvpd %xmm6, %xmm5, %xmm4, %xmm4
 ; X86-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; X86-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
-; X86-NEXT:    vblendvpd %xmm0, %xmm2, %xmm4, %xmm0
-; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; X86-NEXT:    vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
+; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm4, %ymm0
 ; X86-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X86-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]

diff --git a/llvm/test/CodeGen/X86/legalize-shl-vec.ll b/llvm/test/CodeGen/X86/legalize-shl-vec.ll
index f85d2f759c1b8..845fc60687df6 100644
--- a/llvm/test/CodeGen/X86/legalize-shl-vec.ll
+++ b/llvm/test/CodeGen/X86/legalize-shl-vec.ll
@@ -6,30 +6,30 @@ define <2 x i256> @test_shl(<2 x i256> %In) {
 ; X32-LABEL: test_shl:
 ; X32:       # %bb.0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    shldl $2, %edx, %ecx
-; X32-NEXT:    movl %ecx, 60(%eax)
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    shldl $2, %ecx, %edx
-; X32-NEXT:    movl %edx, 56(%eax)
+; X32-NEXT:    movl %edx, 60(%eax)
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X32-NEXT:    shldl $2, %edx, %ecx
-; X32-NEXT:    movl %ecx, 52(%eax)
+; X32-NEXT:    movl %ecx, 56(%eax)
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    shldl $2, %ecx, %edx
-; X32-NEXT:    movl %edx, 48(%eax)
+; X32-NEXT:    movl %edx, 52(%eax)
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X32-NEXT:    shldl $2, %edx, %ecx
-; X32-NEXT:    movl %ecx, 44(%eax)
+; X32-NEXT:    movl %ecx, 48(%eax)
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    shldl $2, %ecx, %edx
-; X32-NEXT:    movl %edx, 40(%eax)
+; X32-NEXT:    movl %edx, 44(%eax)
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X32-NEXT:    shldl $2, %edx, %ecx
-; X32-NEXT:    movl %ecx, 36(%eax)
-; X32-NEXT:    shll $2, %edx
-; X32-NEXT:    movl %edx, 32(%eax)
+; X32-NEXT:    movl %ecx, 40(%eax)
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    shldl $2, %ecx, %edx
+; X32-NEXT:    movl %edx, 36(%eax)
+; X32-NEXT:    shll $2, %ecx
+; X32-NEXT:    movl %ecx, 32(%eax)
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    shll $31, %ecx
 ; X32-NEXT:    movl %ecx, 28(%eax)
@@ -84,34 +84,34 @@ define <2 x i256> @test_srl(<2 x i256> %In) {
 ; X32-NEXT:    .cfi_offset %edi, -16
 ; X32-NEXT:    .cfi_offset %ebx, -12
 ; X32-NEXT:    .cfi_offset %ebp, -8
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    shldl $28, %eax, %ecx
+; X32-NEXT:    movl %ebx, %ecx
+; X32-NEXT:    shldl $28, %edi, %ecx
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    shldl $28, %esi, %eax
+; X32-NEXT:    shldl $28, %esi, %edi
+; X32-NEXT:    shldl $28, %edx, %esi
+; X32-NEXT:    shldl $28, %eax, %edx
+; X32-NEXT:    shldl $28, %ebp, %eax
 ; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-NEXT:    shldl $28, %edi, %esi
-; X32-NEXT:    shldl $28, %ebx, %edi
-; X32-NEXT:    shldl $28, %ebp, %ebx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    shldl $28, %eax, %ebp
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    shrdl $4, %eax, %ecx
-; X32-NEXT:    shrl $4, %edx
+; X32-NEXT:    shrl $4, %ebx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl %edx, 60(%eax)
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NEXT:    movl %edx, 56(%eax)
-; X32-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X32-NEXT:    movl %edx, 52(%eax)
+; X32-NEXT:    movl %ebx, 60(%eax)
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    movl %ebx, 56(%eax)
+; X32-NEXT:    movl %edi, 52(%eax)
 ; X32-NEXT:    movl %esi, 48(%eax)
-; X32-NEXT:    movl %edi, 44(%eax)
-; X32-NEXT:    movl %ebx, 40(%eax)
+; X32-NEXT:    movl %edx, 44(%eax)
+; X32-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X32-NEXT:    movl %edx, 40(%eax)
 ; X32-NEXT:    movl %ebp, 36(%eax)
 ; X32-NEXT:    movl %ecx, 32(%eax)
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -178,34 +178,34 @@ define <2 x i256> @test_sra(<2 x i256> %In) {
 ; X32-NEXT:    .cfi_offset %edi, -16
 ; X32-NEXT:    .cfi_offset %ebx, -12
 ; X32-NEXT:    .cfi_offset %ebp, -8
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    shldl $26, %eax, %ecx
+; X32-NEXT:    movl %ebx, %ecx
+; X32-NEXT:    shldl $26, %edi, %ecx
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    shldl $26, %esi, %eax
+; X32-NEXT:    shldl $26, %esi, %edi
+; X32-NEXT:    shldl $26, %edx, %esi
+; X32-NEXT:    shldl $26, %eax, %edx
+; X32-NEXT:    shldl $26, %ebp, %eax
 ; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-NEXT:    shldl $26, %edi, %esi
-; X32-NEXT:    shldl $26, %ebx, %edi
-; X32-NEXT:    shldl $26, %ebp, %ebx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    shldl $26, %eax, %ebp
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    shrdl $6, %eax, %ecx
-; X32-NEXT:    sarl $6, %edx
+; X32-NEXT:    sarl $6, %ebx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl %edx, 60(%eax)
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NEXT:    movl %edx, 56(%eax)
-; X32-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X32-NEXT:    movl %edx, 52(%eax)
+; X32-NEXT:    movl %ebx, 60(%eax)
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    movl %ebx, 56(%eax)
+; X32-NEXT:    movl %edi, 52(%eax)
 ; X32-NEXT:    movl %esi, 48(%eax)
-; X32-NEXT:    movl %edi, 44(%eax)
-; X32-NEXT:    movl %ebx, 40(%eax)
+; X32-NEXT:    movl %edx, 44(%eax)
+; X32-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X32-NEXT:    movl %edx, 40(%eax)
 ; X32-NEXT:    movl %ebp, 36(%eax)
 ; X32-NEXT:    movl %ecx, 32(%eax)
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx

diff  --git a/llvm/test/CodeGen/X86/load-combine.ll b/llvm/test/CodeGen/X86/load-combine.ll
index 5184e99d01804..a046b55aab333 100644
--- a/llvm/test/CodeGen/X86/load-combine.ll
+++ b/llvm/test/CodeGen/X86/load-combine.ll
@@ -483,19 +483,19 @@ define i32 @load_i32_by_i8_bswap_store_in_between(i32* %arg, i32* %arg1) {
 ; CHECK-NEXT:    pushl %esi
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    .cfi_offset %esi, -8
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movzbl (%ecx), %edx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movzbl (%eax), %edx
 ; CHECK-NEXT:    shll $24, %edx
-; CHECK-NEXT:    movzbl 1(%ecx), %esi
-; CHECK-NEXT:    movl $0, (%eax)
+; CHECK-NEXT:    movzbl 1(%eax), %esi
+; CHECK-NEXT:    movl $0, (%ecx)
 ; CHECK-NEXT:    shll $16, %esi
 ; CHECK-NEXT:    orl %edx, %esi
-; CHECK-NEXT:    movzbl 2(%ecx), %edx
-; CHECK-NEXT:    shll $8, %edx
-; CHECK-NEXT:    orl %esi, %edx
-; CHECK-NEXT:    movzbl 3(%ecx), %eax
-; CHECK-NEXT:    orl %edx, %eax
+; CHECK-NEXT:    movzbl 2(%eax), %ecx
+; CHECK-NEXT:    shll $8, %ecx
+; CHECK-NEXT:    orl %esi, %ecx
+; CHECK-NEXT:    movzbl 3(%eax), %eax
+; CHECK-NEXT:    orl %ecx, %eax
 ; CHECK-NEXT:    popl %esi
 ; CHECK-NEXT:    .cfi_def_cfa_offset 4
 ; CHECK-NEXT:    retl

diff  --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
index 32fe0a3a45afe..c54fd4b4e736e 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -4104,16 +4104,16 @@ define <16 x double> @test_gather_setcc_split(double* %base, <16 x i32> %ind, <1
 ; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
 ; KNL_32-NEXT:    andl $-64, %esp
 ; KNL_32-NEXT:    subl $64, %esp
-; KNL_32-NEXT:    vmovapd 72(%ebp), %zmm3
+; KNL_32-NEXT:    vmovdqa64 %zmm1, %zmm3
+; KNL_32-NEXT:    vmovapd 72(%ebp), %zmm1
 ; KNL_32-NEXT:    movl 8(%ebp), %eax
-; KNL_32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; KNL_32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
 ; KNL_32-NEXT:    vptestnmd %zmm4, %zmm4, %k1
-; KNL_32-NEXT:    vptestnmd %zmm1, %zmm1, %k2
+; KNL_32-NEXT:    vptestnmd %zmm3, %zmm3, %k2
 ; KNL_32-NEXT:    vgatherdpd (%eax,%ymm0,8), %zmm2 {%k2}
 ; KNL_32-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
-; KNL_32-NEXT:    vgatherdpd (%eax,%ymm0,8), %zmm3 {%k1}
+; KNL_32-NEXT:    vgatherdpd (%eax,%ymm0,8), %zmm1 {%k1}
 ; KNL_32-NEXT:    vmovapd %zmm2, %zmm0
-; KNL_32-NEXT:    vmovapd %zmm3, %zmm1
 ; KNL_32-NEXT:    movl %ebp, %esp
 ; KNL_32-NEXT:    popl %ebp
 ; KNL_32-NEXT:    .cfi_def_cfa %esp, 4

diff  --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll
index edfaaaed7d849..72efffb74b37c 100644
--- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll
+++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll
@@ -632,20 +632,20 @@ define i1 @length13_eq(i8* %X, i8* %Y) nounwind {
 ; X86-LABEL: length13_eq:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %esi
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    xorl 4(%eax), %esi
-; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    movl 8(%ecx), %edx
-; X86-NEXT:    xorl 8(%eax), %edx
-; X86-NEXT:    movb 12(%ecx), %cl
-; X86-NEXT:    xorb 12(%eax), %cl
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl (%edx), %esi
+; X86-NEXT:    movl 4(%edx), %eax
+; X86-NEXT:    xorl (%ecx), %esi
+; X86-NEXT:    xorl 4(%ecx), %eax
 ; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    movl 8(%edx), %esi
+; X86-NEXT:    xorl 8(%ecx), %esi
+; X86-NEXT:    movb 12(%edx), %dl
+; X86-NEXT:    xorb 12(%ecx), %dl
+; X86-NEXT:    movzbl %dl, %ecx
+; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
@@ -658,20 +658,20 @@ define i1 @length14_eq(i8* %X, i8* %Y) nounwind {
 ; X86-LABEL: length14_eq:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %esi
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    xorl 4(%eax), %esi
-; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    movl 8(%ecx), %edx
-; X86-NEXT:    xorl 8(%eax), %edx
-; X86-NEXT:    movzwl 12(%ecx), %ecx
-; X86-NEXT:    xorw 12(%eax), %cx
-; X86-NEXT:    movzwl %cx, %eax
-; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl (%edx), %esi
+; X86-NEXT:    movl 4(%edx), %eax
+; X86-NEXT:    xorl (%ecx), %esi
+; X86-NEXT:    xorl 4(%ecx), %eax
 ; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    movl 8(%edx), %esi
+; X86-NEXT:    xorl 8(%ecx), %esi
+; X86-NEXT:    movzwl 12(%edx), %edx
+; X86-NEXT:    xorw 12(%ecx), %dx
+; X86-NEXT:    movzwl %dx, %ecx
+; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
@@ -684,19 +684,19 @@ define i1 @length15_eq(i8* %X, i8* %Y) nounwind {
 ; X86-LABEL: length15_eq:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %esi
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    xorl 4(%eax), %esi
-; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    movl 8(%ecx), %edx
-; X86-NEXT:    xorl 8(%eax), %edx
-; X86-NEXT:    movl 11(%ecx), %ecx
-; X86-NEXT:    xorl 11(%eax), %ecx
-; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl (%edx), %esi
+; X86-NEXT:    movl 4(%edx), %eax
+; X86-NEXT:    xorl (%ecx), %esi
+; X86-NEXT:    xorl 4(%ecx), %eax
+; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    movl 8(%edx), %esi
+; X86-NEXT:    xorl 8(%ecx), %esi
+; X86-NEXT:    movl 11(%edx), %edx
+; X86-NEXT:    xorl 11(%ecx), %edx
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    orl %eax, %edx
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
@@ -757,19 +757,19 @@ define i1 @length16_eq(i8* %x, i8* %y) nounwind {
 ; X86-NOSSE-LABEL: length16_eq:
 ; X86-NOSSE:       # %bb.0:
 ; X86-NOSSE-NEXT:    pushl %esi
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NOSSE-NEXT:    movl (%ecx), %edx
-; X86-NOSSE-NEXT:    movl 4(%ecx), %esi
-; X86-NOSSE-NEXT:    xorl (%eax), %edx
-; X86-NOSSE-NEXT:    xorl 4(%eax), %esi
-; X86-NOSSE-NEXT:    orl %edx, %esi
-; X86-NOSSE-NEXT:    movl 8(%ecx), %edx
-; X86-NOSSE-NEXT:    xorl 8(%eax), %edx
-; X86-NOSSE-NEXT:    movl 12(%ecx), %ecx
-; X86-NOSSE-NEXT:    xorl 12(%eax), %ecx
-; X86-NOSSE-NEXT:    orl %edx, %ecx
-; X86-NOSSE-NEXT:    orl %esi, %ecx
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NOSSE-NEXT:    movl (%edx), %esi
+; X86-NOSSE-NEXT:    movl 4(%edx), %eax
+; X86-NOSSE-NEXT:    xorl (%ecx), %esi
+; X86-NOSSE-NEXT:    xorl 4(%ecx), %eax
+; X86-NOSSE-NEXT:    orl %esi, %eax
+; X86-NOSSE-NEXT:    movl 8(%edx), %esi
+; X86-NOSSE-NEXT:    xorl 8(%ecx), %esi
+; X86-NOSSE-NEXT:    movl 12(%edx), %edx
+; X86-NOSSE-NEXT:    xorl 12(%ecx), %edx
+; X86-NOSSE-NEXT:    orl %esi, %edx
+; X86-NOSSE-NEXT:    orl %eax, %edx
 ; X86-NOSSE-NEXT:    setne %al
 ; X86-NOSSE-NEXT:    popl %esi
 ; X86-NOSSE-NEXT:    retl
@@ -777,19 +777,19 @@ define i1 @length16_eq(i8* %x, i8* %y) nounwind {
 ; X86-SSE1-LABEL: length16_eq:
 ; X86-SSE1:       # %bb.0:
 ; X86-SSE1-NEXT:    pushl %esi
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE1-NEXT:    movl (%ecx), %edx
-; X86-SSE1-NEXT:    movl 4(%ecx), %esi
-; X86-SSE1-NEXT:    xorl (%eax), %edx
-; X86-SSE1-NEXT:    xorl 4(%eax), %esi
-; X86-SSE1-NEXT:    orl %edx, %esi
-; X86-SSE1-NEXT:    movl 8(%ecx), %edx
-; X86-SSE1-NEXT:    xorl 8(%eax), %edx
-; X86-SSE1-NEXT:    movl 12(%ecx), %ecx
-; X86-SSE1-NEXT:    xorl 12(%eax), %ecx
-; X86-SSE1-NEXT:    orl %edx, %ecx
-; X86-SSE1-NEXT:    orl %esi, %ecx
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE1-NEXT:    movl (%edx), %esi
+; X86-SSE1-NEXT:    movl 4(%edx), %eax
+; X86-SSE1-NEXT:    xorl (%ecx), %esi
+; X86-SSE1-NEXT:    xorl 4(%ecx), %eax
+; X86-SSE1-NEXT:    orl %esi, %eax
+; X86-SSE1-NEXT:    movl 8(%edx), %esi
+; X86-SSE1-NEXT:    xorl 8(%ecx), %esi
+; X86-SSE1-NEXT:    movl 12(%edx), %edx
+; X86-SSE1-NEXT:    xorl 12(%ecx), %edx
+; X86-SSE1-NEXT:    orl %esi, %edx
+; X86-SSE1-NEXT:    orl %eax, %edx
 ; X86-SSE1-NEXT:    setne %al
 ; X86-SSE1-NEXT:    popl %esi
 ; X86-SSE1-NEXT:    retl
@@ -1841,21 +1841,21 @@ define i1 @length63_eq(i8* %x, i8* %y) nounwind {
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm0
+; X86-SSE2-NEXT:    movdqu (%ecx), %xmm1
+; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm2
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
 ; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT:    movdqu 32(%ecx), %xmm1
+; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm1
+; X86-SSE2-NEXT:    pcmpeqb %xmm2, %xmm1
+; X86-SSE2-NEXT:    movdqu 32(%ecx), %xmm2
 ; X86-SSE2-NEXT:    movdqu 32(%eax), %xmm3
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm3
-; X86-SSE2-NEXT:    movdqu 47(%ecx), %xmm1
+; X86-SSE2-NEXT:    pcmpeqb %xmm2, %xmm3
+; X86-SSE2-NEXT:    movdqu 47(%ecx), %xmm2
 ; X86-SSE2-NEXT:    movdqu 47(%eax), %xmm4
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm4
+; X86-SSE2-NEXT:    pcmpeqb %xmm2, %xmm4
 ; X86-SSE2-NEXT:    pand %xmm3, %xmm4
+; X86-SSE2-NEXT:    pand %xmm1, %xmm4
 ; X86-SSE2-NEXT:    pand %xmm0, %xmm4
-; X86-SSE2-NEXT:    pand %xmm2, %xmm4
 ; X86-SSE2-NEXT:    pmovmskb %xmm4, %eax
 ; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; X86-SSE2-NEXT:    setne %al
@@ -1865,21 +1865,21 @@ define i1 @length63_eq(i8* %x, i8* %y) nounwind {
 ; X86-SSE41:       # %bb.0:
 ; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm0
+; X86-SSE41-NEXT:    movdqu (%ecx), %xmm1
+; X86-SSE41-NEXT:    movdqu 16(%ecx), %xmm2
+; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
 ; X86-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X86-SSE41-NEXT:    movdqu 32(%ecx), %xmm1
+; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm1
+; X86-SSE41-NEXT:    pxor %xmm2, %xmm1
+; X86-SSE41-NEXT:    movdqu 32(%ecx), %xmm2
 ; X86-SSE41-NEXT:    movdqu 32(%eax), %xmm3
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm3
-; X86-SSE41-NEXT:    movdqu 47(%ecx), %xmm1
+; X86-SSE41-NEXT:    pxor %xmm2, %xmm3
+; X86-SSE41-NEXT:    movdqu 47(%ecx), %xmm2
 ; X86-SSE41-NEXT:    movdqu 47(%eax), %xmm4
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm4
+; X86-SSE41-NEXT:    pxor %xmm2, %xmm4
 ; X86-SSE41-NEXT:    por %xmm3, %xmm4
+; X86-SSE41-NEXT:    por %xmm1, %xmm4
 ; X86-SSE41-NEXT:    por %xmm0, %xmm4
-; X86-SSE41-NEXT:    por %xmm2, %xmm4
 ; X86-SSE41-NEXT:    ptest %xmm4, %xmm4
 ; X86-SSE41-NEXT:    setne %al
 ; X86-SSE41-NEXT:    retl
@@ -2024,21 +2024,21 @@ define i1 @length64_eq(i8* %x, i8* %y) nounwind {
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm0
+; X86-SSE2-NEXT:    movdqu (%ecx), %xmm1
+; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm2
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
 ; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT:    movdqu 32(%ecx), %xmm1
+; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm1
+; X86-SSE2-NEXT:    pcmpeqb %xmm2, %xmm1
+; X86-SSE2-NEXT:    movdqu 32(%ecx), %xmm2
 ; X86-SSE2-NEXT:    movdqu 32(%eax), %xmm3
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm3
-; X86-SSE2-NEXT:    movdqu 48(%ecx), %xmm1
+; X86-SSE2-NEXT:    pcmpeqb %xmm2, %xmm3
+; X86-SSE2-NEXT:    movdqu 48(%ecx), %xmm2
 ; X86-SSE2-NEXT:    movdqu 48(%eax), %xmm4
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm4
+; X86-SSE2-NEXT:    pcmpeqb %xmm2, %xmm4
 ; X86-SSE2-NEXT:    pand %xmm3, %xmm4
+; X86-SSE2-NEXT:    pand %xmm1, %xmm4
 ; X86-SSE2-NEXT:    pand %xmm0, %xmm4
-; X86-SSE2-NEXT:    pand %xmm2, %xmm4
 ; X86-SSE2-NEXT:    pmovmskb %xmm4, %eax
 ; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; X86-SSE2-NEXT:    setne %al
@@ -2048,21 +2048,21 @@ define i1 @length64_eq(i8* %x, i8* %y) nounwind {
 ; X86-SSE41:       # %bb.0:
 ; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm0
+; X86-SSE41-NEXT:    movdqu (%ecx), %xmm1
+; X86-SSE41-NEXT:    movdqu 16(%ecx), %xmm2
+; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
 ; X86-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X86-SSE41-NEXT:    movdqu 32(%ecx), %xmm1
+; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm1
+; X86-SSE41-NEXT:    pxor %xmm2, %xmm1
+; X86-SSE41-NEXT:    movdqu 32(%ecx), %xmm2
 ; X86-SSE41-NEXT:    movdqu 32(%eax), %xmm3
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm3
-; X86-SSE41-NEXT:    movdqu 48(%ecx), %xmm1
+; X86-SSE41-NEXT:    pxor %xmm2, %xmm3
+; X86-SSE41-NEXT:    movdqu 48(%ecx), %xmm2
 ; X86-SSE41-NEXT:    movdqu 48(%eax), %xmm4
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm4
+; X86-SSE41-NEXT:    pxor %xmm2, %xmm4
 ; X86-SSE41-NEXT:    por %xmm3, %xmm4
+; X86-SSE41-NEXT:    por %xmm1, %xmm4
 ; X86-SSE41-NEXT:    por %xmm0, %xmm4
-; X86-SSE41-NEXT:    por %xmm2, %xmm4
 ; X86-SSE41-NEXT:    ptest %xmm4, %xmm4
 ; X86-SSE41-NEXT:    setne %al
 ; X86-SSE41-NEXT:    retl

diff  --git a/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll b/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll
index cdc38d71b411b..a4fa485f35b2a 100644
--- a/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll
+++ b/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll
@@ -302,28 +302,28 @@ define void @merge_2_v4f32_align1_ntstore(<4 x float>* %a0, <4 x float>* %a1) no
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE2-NEXT:    movd %xmm0, %ecx
+; X86-SSE2-NEXT:    movdqu (%ecx), %xmm1
+; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm0
+; X86-SSE2-NEXT:    movd %xmm1, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, (%eax)
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
 ; X86-SSE2-NEXT:    movd %xmm2, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 12(%eax)
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; X86-SSE2-NEXT:    movd %xmm2, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 8(%eax)
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-SSE2-NEXT:    movd %xmm0, %ecx
-; X86-SSE2-NEXT:    movntil %ecx, 4(%eax)
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
 ; X86-SSE2-NEXT:    movd %xmm1, %ecx
-; X86-SSE2-NEXT:    movntil %ecx, 16(%eax)
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
+; X86-SSE2-NEXT:    movntil %ecx, 4(%eax)
 ; X86-SSE2-NEXT:    movd %xmm0, %ecx
+; X86-SSE2-NEXT:    movntil %ecx, 16(%eax)
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X86-SSE2-NEXT:    movd %xmm1, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 28(%eax)
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; X86-SSE2-NEXT:    movd %xmm0, %ecx
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X86-SSE2-NEXT:    movd %xmm1, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 24(%eax)
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; X86-SSE2-NEXT:    movd %xmm0, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 20(%eax)
 ; X86-SSE2-NEXT:    retl
@@ -415,28 +415,28 @@ define void @merge_2_v4f32_align1(<4 x float>* %a0, <4 x float>* %a1) nounwind {
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE2-NEXT:    movd %xmm0, %ecx
+; X86-SSE2-NEXT:    movdqu (%ecx), %xmm1
+; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm0
+; X86-SSE2-NEXT:    movd %xmm1, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, (%eax)
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
 ; X86-SSE2-NEXT:    movd %xmm2, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 12(%eax)
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; X86-SSE2-NEXT:    movd %xmm2, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 8(%eax)
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-SSE2-NEXT:    movd %xmm0, %ecx
-; X86-SSE2-NEXT:    movntil %ecx, 4(%eax)
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
 ; X86-SSE2-NEXT:    movd %xmm1, %ecx
-; X86-SSE2-NEXT:    movntil %ecx, 16(%eax)
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
+; X86-SSE2-NEXT:    movntil %ecx, 4(%eax)
 ; X86-SSE2-NEXT:    movd %xmm0, %ecx
+; X86-SSE2-NEXT:    movntil %ecx, 16(%eax)
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X86-SSE2-NEXT:    movd %xmm1, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 28(%eax)
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; X86-SSE2-NEXT:    movd %xmm0, %ecx
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X86-SSE2-NEXT:    movd %xmm1, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 24(%eax)
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; X86-SSE2-NEXT:    movd %xmm0, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 20(%eax)
 ; X86-SSE2-NEXT:    retl

diff  --git a/llvm/test/CodeGen/X86/mmx-arith.ll b/llvm/test/CodeGen/X86/mmx-arith.ll
index d4f0f751c03ec..278418a1096e6 100644
--- a/llvm/test/CodeGen/X86/mmx-arith.ll
+++ b/llvm/test/CodeGen/X86/mmx-arith.ll
@@ -142,12 +142,12 @@ entry:
 define void @test1(x86_mmx* %A, x86_mmx* %B) {
 ; X32-LABEL: test1:
 ; X32:       # %bb.0: # %entry
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X32-NEXT:    paddd %xmm0, %xmm1
-; X32-NEXT:    movq %xmm1, (%ecx)
+; X32-NEXT:    movq %xmm1, (%eax)
 ; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X32-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
 ; X32-NEXT:    pmuludq %xmm0, %xmm1
@@ -156,16 +156,16 @@ define void @test1(x86_mmx* %A, x86_mmx* %B) {
 ; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
 ; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; X32-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X32-NEXT:    movq %xmm1, (%ecx)
+; X32-NEXT:    movq %xmm1, (%eax)
 ; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X32-NEXT:    pand %xmm1, %xmm0
-; X32-NEXT:    movq %xmm0, (%ecx)
+; X32-NEXT:    movq %xmm0, (%eax)
 ; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X32-NEXT:    por %xmm0, %xmm1
-; X32-NEXT:    movq %xmm1, (%ecx)
+; X32-NEXT:    movq %xmm1, (%eax)
 ; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X32-NEXT:    pxor %xmm1, %xmm0
-; X32-NEXT:    movq %xmm0, (%ecx)
+; X32-NEXT:    movq %xmm0, (%eax)
 ; X32-NEXT:    emms
 ; X32-NEXT:    retl
 ;

diff  --git a/llvm/test/CodeGen/X86/mul-constant-i64.ll b/llvm/test/CodeGen/X86/mul-constant-i64.ll
index c875e084f4861..a1f801b415d12 100644
--- a/llvm/test/CodeGen/X86/mul-constant-i64.ll
+++ b/llvm/test/CodeGen/X86/mul-constant-i64.ll
@@ -1479,28 +1479,28 @@ define i64 @test_mul_spec(i64 %x) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl $9, %edx
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl $9, %ecx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    leal (%ebx,%ebx,8), %edi
+; X86-NEXT:    addl $42, %ecx
+; X86-NEXT:    adcl %edx, %edi
+; X86-NEXT:    movl $5, %edx
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %edx
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    leal (%edi,%edi,8), %ebx
-; X86-NEXT:    addl $42, %esi
+; X86-NEXT:    leal (%ebx,%ebx,4), %ebx
+; X86-NEXT:    addl $2, %esi
 ; X86-NEXT:    adcl %edx, %ebx
-; X86-NEXT:    movl $5, %edx
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %edx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    leal (%edi,%edi,4), %edi
-; X86-NEXT:    addl $2, %ecx
-; X86-NEXT:    adcl %edx, %edi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    imull %esi, %edi
-; X86-NEXT:    addl %edi, %edx
-; X86-NEXT:    imull %ebx, %ecx
-; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    mull %esi
+; X86-NEXT:    imull %ecx, %ebx
+; X86-NEXT:    addl %ebx, %edx
+; X86-NEXT:    imull %edi, %esi
+; X86-NEXT:    addl %esi, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -1511,28 +1511,28 @@ define i64 @test_mul_spec(i64 %x) nounwind {
 ; X86-NOOPT-NEXT:    pushl %ebx
 ; X86-NOOPT-NEXT:    pushl %edi
 ; X86-NOOPT-NEXT:    pushl %esi
-; X86-NOOPT-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NOOPT-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NOOPT-NEXT:    movl $9, %edx
-; X86-NOOPT-NEXT:    movl %ecx, %eax
+; X86-NOOPT-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOOPT-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NOOPT-NEXT:    movl $9, %ecx
+; X86-NOOPT-NEXT:    movl %esi, %eax
+; X86-NOOPT-NEXT:    mull %ecx
+; X86-NOOPT-NEXT:    movl %eax, %ecx
+; X86-NOOPT-NEXT:    leal (%ebx,%ebx,8), %edi
+; X86-NOOPT-NEXT:    addl $42, %ecx
+; X86-NOOPT-NEXT:    adcl %edx, %edi
+; X86-NOOPT-NEXT:    movl $5, %edx
+; X86-NOOPT-NEXT:    movl %esi, %eax
 ; X86-NOOPT-NEXT:    mull %edx
 ; X86-NOOPT-NEXT:    movl %eax, %esi
-; X86-NOOPT-NEXT:    leal (%edi,%edi,8), %ebx
-; X86-NOOPT-NEXT:    addl $42, %esi
+; X86-NOOPT-NEXT:    leal (%ebx,%ebx,4), %ebx
+; X86-NOOPT-NEXT:    addl $2, %esi
 ; X86-NOOPT-NEXT:    adcl %edx, %ebx
-; X86-NOOPT-NEXT:    movl $5, %edx
 ; X86-NOOPT-NEXT:    movl %ecx, %eax
-; X86-NOOPT-NEXT:    mull %edx
-; X86-NOOPT-NEXT:    movl %eax, %ecx
-; X86-NOOPT-NEXT:    leal (%edi,%edi,4), %edi
-; X86-NOOPT-NEXT:    addl $2, %ecx
-; X86-NOOPT-NEXT:    adcl %edx, %edi
-; X86-NOOPT-NEXT:    movl %esi, %eax
-; X86-NOOPT-NEXT:    mull %ecx
-; X86-NOOPT-NEXT:    imull %esi, %edi
-; X86-NOOPT-NEXT:    addl %edi, %edx
-; X86-NOOPT-NEXT:    imull %ebx, %ecx
-; X86-NOOPT-NEXT:    addl %ecx, %edx
+; X86-NOOPT-NEXT:    mull %esi
+; X86-NOOPT-NEXT:    imull %ecx, %ebx
+; X86-NOOPT-NEXT:    addl %ebx, %edx
+; X86-NOOPT-NEXT:    imull %edi, %esi
+; X86-NOOPT-NEXT:    addl %esi, %edx
 ; X86-NOOPT-NEXT:    popl %esi
 ; X86-NOOPT-NEXT:    popl %edi
 ; X86-NOOPT-NEXT:    popl %ebx

diff  --git a/llvm/test/CodeGen/X86/mul-constant-result.ll b/llvm/test/CodeGen/X86/mul-constant-result.ll
index 25b11d1bca3e3..a8e599f128da2 100644
--- a/llvm/test/CodeGen/X86/mul-constant-result.ll
+++ b/llvm/test/CodeGen/X86/mul-constant-result.ll
@@ -542,8 +542,8 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    xorl $2, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    xorl $2, %ebx
 ; X86-NEXT:    pushl $1
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $3
@@ -551,290 +551,290 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl $3, %ebx
-; X86-NEXT:    orl %edi, %ebx
-; X86-NEXT:    pushl $2
-; X86-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NEXT:    pushl $4
-; X86-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NEXT:    calll mult@PLT
-; X86-NEXT:    addl $8, %esp
-; X86-NEXT:    .cfi_adjust_cfa_offset -8
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    xorl $4, %edi
+; X86-NEXT:    xorl $3, %edi
 ; X86-NEXT:    orl %ebx, %edi
 ; X86-NEXT:    pushl $2
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NEXT:    pushl $5
+; X86-NEXT:    pushl $4
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl $5, %ebx
+; X86-NEXT:    xorl $4, %ebx
 ; X86-NEXT:    orl %edi, %ebx
-; X86-NEXT:    pushl $3
+; X86-NEXT:    pushl $2
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NEXT:    pushl $6
+; X86-NEXT:    pushl $5
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    xorl $6, %edi
+; X86-NEXT:    xorl $5, %edi
 ; X86-NEXT:    orl %ebx, %edi
 ; X86-NEXT:    pushl $3
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NEXT:    pushl $7
+; X86-NEXT:    pushl $6
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl $7, %ebx
+; X86-NEXT:    xorl $6, %ebx
 ; X86-NEXT:    orl %edi, %ebx
-; X86-NEXT:    pushl $4
+; X86-NEXT:    pushl $3
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NEXT:    pushl $8
+; X86-NEXT:    pushl $7
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    xorl $8, %edi
+; X86-NEXT:    xorl $7, %edi
 ; X86-NEXT:    orl %ebx, %edi
 ; X86-NEXT:    pushl $4
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NEXT:    pushl $9
+; X86-NEXT:    pushl $8
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl $9, %ebx
+; X86-NEXT:    xorl $8, %ebx
 ; X86-NEXT:    orl %edi, %ebx
-; X86-NEXT:    pushl $5
+; X86-NEXT:    pushl $4
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NEXT:    pushl $10
+; X86-NEXT:    pushl $9
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    xorl $10, %edi
+; X86-NEXT:    xorl $9, %edi
 ; X86-NEXT:    orl %ebx, %edi
 ; X86-NEXT:    pushl $5
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NEXT:    pushl $11
+; X86-NEXT:    pushl $10
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl $11, %ebx
+; X86-NEXT:    xorl $10, %ebx
 ; X86-NEXT:    orl %edi, %ebx
-; X86-NEXT:    pushl $6
+; X86-NEXT:    pushl $5
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NEXT:    pushl $12
+; X86-NEXT:    pushl $11
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    xorl $12, %edi
+; X86-NEXT:    xorl $11, %edi
 ; X86-NEXT:    orl %ebx, %edi
 ; X86-NEXT:    pushl $6
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NEXT:    pushl $13
+; X86-NEXT:    pushl $12
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl $13, %ebx
+; X86-NEXT:    xorl $12, %ebx
 ; X86-NEXT:    orl %edi, %ebx
-; X86-NEXT:    pushl $7
+; X86-NEXT:    pushl $6
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NEXT:    pushl $14
+; X86-NEXT:    pushl $13
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    xorl $14, %edi
+; X86-NEXT:    xorl $13, %edi
 ; X86-NEXT:    orl %ebx, %edi
 ; X86-NEXT:    pushl $7
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NEXT:    pushl $15
+; X86-NEXT:    pushl $14
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl $15, %ebx
+; X86-NEXT:    xorl $14, %ebx
 ; X86-NEXT:    orl %edi, %ebx
-; X86-NEXT:    pushl $8
+; X86-NEXT:    pushl $7
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NEXT:    pushl $16
+; X86-NEXT:    pushl $15
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    xorl $16, %edi
+; X86-NEXT:    xorl $15, %edi
 ; X86-NEXT:    orl %ebx, %edi
 ; X86-NEXT:    pushl $8
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NEXT:    pushl $17
+; X86-NEXT:    pushl $16
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl $17, %ebx
+; X86-NEXT:    xorl $16, %ebx
 ; X86-NEXT:    orl %edi, %ebx
-; X86-NEXT:    pushl $9
+; X86-NEXT:    pushl $8
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NEXT:    pushl $18
+; X86-NEXT:    pushl $17
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    xorl $18, %edi
+; X86-NEXT:    xorl $17, %edi
 ; X86-NEXT:    orl %ebx, %edi
 ; X86-NEXT:    pushl $9
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NEXT:    pushl $19
+; X86-NEXT:    pushl $18
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl $19, %ebx
+; X86-NEXT:    xorl $18, %ebx
 ; X86-NEXT:    orl %edi, %ebx
-; X86-NEXT:    pushl $10
+; X86-NEXT:    pushl $9
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NEXT:    pushl $20
+; X86-NEXT:    pushl $19
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    xorl $20, %edi
+; X86-NEXT:    xorl $19, %edi
 ; X86-NEXT:    orl %ebx, %edi
 ; X86-NEXT:    pushl $10
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NEXT:    pushl $21
+; X86-NEXT:    pushl $20
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl $21, %ebx
+; X86-NEXT:    xorl $20, %ebx
 ; X86-NEXT:    orl %edi, %ebx
-; X86-NEXT:    pushl $11
+; X86-NEXT:    pushl $10
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NEXT:    pushl $22
+; X86-NEXT:    pushl $21
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    xorl $22, %edi
+; X86-NEXT:    xorl $21, %edi
 ; X86-NEXT:    orl %ebx, %edi
 ; X86-NEXT:    pushl $11
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NEXT:    pushl $23
+; X86-NEXT:    pushl $22
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl $23, %ebx
+; X86-NEXT:    xorl $22, %ebx
 ; X86-NEXT:    orl %edi, %ebx
-; X86-NEXT:    pushl $12
+; X86-NEXT:    pushl $11
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NEXT:    pushl $24
+; X86-NEXT:    pushl $23
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    xorl $24, %edi
+; X86-NEXT:    xorl $23, %edi
 ; X86-NEXT:    orl %ebx, %edi
 ; X86-NEXT:    pushl $12
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NEXT:    pushl $25
+; X86-NEXT:    pushl $24
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl $25, %ebx
+; X86-NEXT:    xorl $24, %ebx
 ; X86-NEXT:    orl %edi, %ebx
-; X86-NEXT:    pushl $13
+; X86-NEXT:    pushl $12
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NEXT:    pushl $26
+; X86-NEXT:    pushl $25
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    xorl $26, %edi
+; X86-NEXT:    xorl $25, %edi
 ; X86-NEXT:    orl %ebx, %edi
 ; X86-NEXT:    pushl $13
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NEXT:    pushl $27
+; X86-NEXT:    pushl $26
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl $27, %ebx
+; X86-NEXT:    xorl $26, %ebx
 ; X86-NEXT:    orl %edi, %ebx
-; X86-NEXT:    pushl $14
+; X86-NEXT:    pushl $13
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NEXT:    pushl $28
+; X86-NEXT:    pushl $27
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    xorl $28, %edi
+; X86-NEXT:    xorl $27, %edi
 ; X86-NEXT:    orl %ebx, %edi
 ; X86-NEXT:    pushl $14
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NEXT:    pushl $29
+; X86-NEXT:    pushl $28
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl $29, %ebx
+; X86-NEXT:    xorl $28, %ebx
 ; X86-NEXT:    orl %edi, %ebx
-; X86-NEXT:    pushl $15
+; X86-NEXT:    pushl $14
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NEXT:    pushl $30
+; X86-NEXT:    pushl $29
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    xorl $30, %edi
+; X86-NEXT:    xorl $29, %edi
 ; X86-NEXT:    orl %ebx, %edi
 ; X86-NEXT:    pushl $15
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NEXT:    pushl $31
+; X86-NEXT:    pushl $30
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl $31, %ebx
+; X86-NEXT:    xorl $30, %ebx
 ; X86-NEXT:    orl %edi, %ebx
-; X86-NEXT:    orl %esi, %ebx
+; X86-NEXT:    pushl $15
+; X86-NEXT:    .cfi_adjust_cfa_offset 4
+; X86-NEXT:    pushl $31
+; X86-NEXT:    .cfi_adjust_cfa_offset 4
+; X86-NEXT:    calll mult@PLT
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    .cfi_adjust_cfa_offset -8
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    xorl $31, %edi
+; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    orl %esi, %edi
 ; X86-NEXT:    pushl $16
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $32
@@ -844,7 +844,7 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
 ; X86-NEXT:    xorl $32, %eax
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    orl %ebx, %eax
+; X86-NEXT:    orl %edi, %eax
 ; X86-NEXT:    setne %cl
 ; X86-NEXT:    negl %ecx
 ; X86-NEXT:    movl %ecx, %eax

diff  --git a/llvm/test/CodeGen/X86/mul-i1024.ll b/llvm/test/CodeGen/X86/mul-i1024.ll
index 96c0543433bda..57b66b09e99a6 100644
--- a/llvm/test/CodeGen/X86/mul-i1024.ll
+++ b/llvm/test/CodeGen/X86/mul-i1024.ll
@@ -3143,9 +3143,10 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    addl %ecx, %edx
-; X32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    imull %ebp, %esi
 ; X32-NEXT:    addl %edx, %esi
-; X32-NEXT:    movl %esi, %ebp
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl %eax, %esi
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
@@ -3155,51 +3156,49 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    addl %esi, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    imull %edi, %esi
 ; X32-NEXT:    addl %edx, %esi
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT:    movl %ecx, %edi
-; X32-NEXT:    adcl %ebp, %esi
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ebp, %ebx
+; X32-NEXT:    addl %ecx, %ebx
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %ebp, %edi
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ebp
-; X32-NEXT:    setb %bl
+; X32-NEXT:    setb %cl
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    addl %ebp, %eax
-; X32-NEXT:    movzbl %bl, %ecx
+; X32-NEXT:    movzbl %cl, %ecx
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    imull %ebp, %edi
-; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    imull %eax, %ecx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    addl %edi, %edx
-; X32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT:    addl %edx, %ecx
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    addl %ecx, %edx
+; X32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    addl %edx, %ebp
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
@@ -3213,14 +3212,15 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
 ; X32-NEXT:    addl %edx, %ecx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    adcl %ebp, %ecx
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %ebp
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    addl %esi, %ecx
@@ -3390,18 +3390,17 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl 112(%ecx), %edi
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    imull %edi, %ebx
-; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    imull %edi, %esi
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    addl %ebx, %edx
+; X32-NEXT:    addl %esi, %edx
 ; X32-NEXT:    movl 116(%ecx), %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    imull %eax, %esi
-; X32-NEXT:    addl %edx, %esi
-; X32-NEXT:    movl %esi, %ebx
+; X32-NEXT:    imull %eax, %ebx
+; X32-NEXT:    addl %edx, %ebx
 ; X32-NEXT:    movl 120(%ecx), %eax
 ; X32-NEXT:    movl %eax, %esi
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -3443,13 +3442,13 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    imull %eax, %edi
+; X32-NEXT:    imull %eax, %esi
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    addl %edi, %edx
+; X32-NEXT:    addl %esi, %edx
 ; X32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    addl %edx, %ebp
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -4499,16 +4498,16 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    movl %edi, %ecx
-; X32-NEXT:    imull %eax, %ecx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edi, %esi
+; X32-NEXT:    imull %eax, %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-NEXT:    addl %ecx, %edx
+; X32-NEXT:    addl %esi, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    imull %ebp, %esi
-; X32-NEXT:    addl %edx, %esi
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    imull %ebp, %ecx
+; X32-NEXT:    addl %edx, %ecx
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl %eax, %edi
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
@@ -4553,16 +4552,15 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    imull %esi, %ebx
-; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    imull %esi, %ecx
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-NEXT:    addl %ebx, %edx
-; X32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT:    addl %edx, %ecx
-; X32-NEXT:    movl %ecx, %ebx
+; X32-NEXT:    addl %ecx, %edx
+; X32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    addl %edx, %ebx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
@@ -4608,45 +4606,46 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT:    movl %ecx, %edx
+; X32-NEXT:    movl %ecx, %ebp
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, %edx
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl %esi, %edi
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -4667,6 +4666,7 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
 ; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -5044,7 +5044,7 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
 ; X64-NEXT:    setb %r14b
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    movq %r10, (%rsp) # 8-byte Spill
+; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    mulq %r10
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, %r11
@@ -5119,7 +5119,7 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
 ; X64-NEXT:    adcq %rax, %r10
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
 ; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r11, (%rsp) # 8-byte Spill
 ; X64-NEXT:    mulq %r11
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -5239,7 +5239,7 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
 ; X64-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT:    movq (%rsp), %rsi # 8-byte Reload
 ; X64-NEXT:    mulq %rsi
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -5384,7 +5384,7 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; X64-NEXT:    movq 64(%rsi), %rdi
-; X64-NEXT:    movq (%rsp), %rbx # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
 ; X64-NEXT:    movq %rbx, %rax
 ; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rdx, %rcx
@@ -5483,20 +5483,19 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
 ; X64-NEXT:    addq %r15, %rsi
 ; X64-NEXT:    adcq %r10, %rbx
 ; X64-NEXT:    setb %r9b
-; X64-NEXT:    movq (%rsp), %r14 # 8-byte Reload
-; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    movq %rcx, %rax
 ; X64-NEXT:    mulq %r11
-; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %rdx, %r14
 ; X64-NEXT:    movq %rax, %r10
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
 ; X64-NEXT:    movq %r8, %rax
 ; X64-NEXT:    mulq %r11
 ; X64-NEXT:    movq %rdx, %rdi
 ; X64-NEXT:    movq %rax, %rbp
-; X64-NEXT:    addq %rcx, %rbp
+; X64-NEXT:    addq %r14, %rbp
 ; X64-NEXT:    adcq $0, %rdi
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    movq %r14, %r15
+; X64-NEXT:    movq %rcx, %rax
 ; X64-NEXT:    mulq %r13
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    addq %rbp, %rax
@@ -5504,6 +5503,7 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
 ; X64-NEXT:    adcq %rdi, %rcx
 ; X64-NEXT:    setb %dil
 ; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    movq %r8, %r14
 ; X64-NEXT:    mulq %r13
 ; X64-NEXT:    movq %rax, %r12
 ; X64-NEXT:    addq %rcx, %r12
@@ -5517,11 +5517,11 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
 ; X64-NEXT:    adcq %rax, %r12
 ; X64-NEXT:    adcq $0, %rdx
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    movq (%rsp), %rax # 8-byte Reload
 ; X64-NEXT:    imulq %rax, %r13
-; X64-NEXT:    movq %rax, %r10
+; X64-NEXT:    movq %rax, %r15
 ; X64-NEXT:    mulq %r11
-; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    movq %rax, %r10
 ; X64-NEXT:    addq %r13, %rdx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
 ; X64-NEXT:    imulq %rdi, %r11
@@ -5534,19 +5534,18 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
 ; X64-NEXT:    mulq %rbp
 ; X64-NEXT:    movq %rax, %r9
 ; X64-NEXT:    addq %rsi, %rdx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    imulq %rbp, %rax
-; X64-NEXT:    addq %rdx, %rax
-; X64-NEXT:    addq %r8, %r9
-; X64-NEXT:    adcq %r11, %rax
-; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT:    imulq %rbp, %r8
+; X64-NEXT:    addq %rdx, %r8
+; X64-NEXT:    addq %r10, %r9
+; X64-NEXT:    adcq %r11, %r8
 ; X64-NEXT:    movq %rbp, %rax
-; X64-NEXT:    mulq %r10
+; X64-NEXT:    mulq %r15
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %rbx, %rax
 ; X64-NEXT:    movq %rbx, %r11
-; X64-NEXT:    mulq %r10
+; X64-NEXT:    mulq %r15
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %rbx
 ; X64-NEXT:    addq %rcx, %rbx
@@ -5554,8 +5553,8 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
 ; X64-NEXT:    movq %rbp, %rax
 ; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rdx, %rbp
-; X64-NEXT:    movq %rax, %r14
-; X64-NEXT:    addq %rbx, %r14
+; X64-NEXT:    movq %rax, %r10
+; X64-NEXT:    addq %rbx, %r10
 ; X64-NEXT:    adcq %rsi, %rbp
 ; X64-NEXT:    setb %cl
 ; X64-NEXT:    movq %r11, %rax
@@ -5575,29 +5574,28 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
 ; X64-NEXT:    movq %rdx, %rdi
 ; X64-NEXT:    movq %rax, %rbx
 ; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rax, %r10
+; X64-NEXT:    movq %rax, %r15
 ; X64-NEXT:    addq %rcx, %rdx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
 ; X64-NEXT:    imulq %r8, %rsi
 ; X64-NEXT:    addq %rdx, %rsi
 ; X64-NEXT:    movq 96(%rdi), %rcx
 ; X64-NEXT:    movq 104(%rdi), %rbp
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    movq %r15, %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    movq %rax, %rdi
 ; X64-NEXT:    imulq %rbp, %rdi
 ; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rax, %r9
 ; X64-NEXT:    addq %rdi, %rdx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    imulq %rcx, %rax
-; X64-NEXT:    addq %rdx, %rax
-; X64-NEXT:    addq %r10, %r9
-; X64-NEXT:    adcq %rsi, %rax
-; X64-NEXT:    movq %rax, %r10
+; X64-NEXT:    imulq %rcx, %r14
+; X64-NEXT:    addq %rdx, %r14
+; X64-NEXT:    addq %r15, %r9
+; X64-NEXT:    adcq %rsi, %r14
+; X64-NEXT:    movq %r14, %r15
 ; X64-NEXT:    movq %rcx, %rax
 ; X64-NEXT:    mulq %rbx
 ; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %r15
+; X64-NEXT:    movq %rax, %r14
 ; X64-NEXT:    movq %rbp, %rax
 ; X64-NEXT:    mulq %rbx
 ; X64-NEXT:    movq %rdx, %rbx
@@ -5617,13 +5615,13 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
 ; X64-NEXT:    movzbl %bl, %ecx
 ; X64-NEXT:    adcq %rcx, %rdx
 ; X64-NEXT:    addq %r9, %rax
-; X64-NEXT:    adcq %r10, %rdx
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
-; X64-NEXT:    adcq %r14, %rsi
+; X64-NEXT:    adcq %r15, %rdx
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; X64-NEXT:    adcq %r10, %rsi
 ; X64-NEXT:    adcq %r11, %rax
 ; X64-NEXT:    adcq %r13, %rdx
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
-; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
 ; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq %r12, %rax
@@ -5666,7 +5664,7 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
 ; X64-NEXT:    movq %r13, %rax
 ; X64-NEXT:    movq %r14, %rdi
 ; X64-NEXT:    mulq %r14
-; X64-NEXT:    movq %rax, (%rsp) # 8-byte Spill
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %rdx, %r14
 ; X64-NEXT:    movq 72(%r9), %rax
 ; X64-NEXT:    movq %rax, %r9
@@ -5680,7 +5678,7 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
 ; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    addq %rbp, %rax
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rax, (%rsp) # 8-byte Spill
 ; X64-NEXT:    adcq %rdi, %rsi
 ; X64-NEXT:    setb %cl
 ; X64-NEXT:    movq %r9, %rax
@@ -5873,9 +5871,9 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
-; X64-NEXT:    movq (%rsp), %rcx # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    movq (%rsp), %rdi # 8-byte Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload

diff  --git a/llvm/test/CodeGen/X86/mul-i256.ll b/llvm/test/CodeGen/X86/mul-i256.ll
index 7d1a3b70e1741..b3883163a902c 100644
--- a/llvm/test/CodeGen/X86/mul-i256.ll
+++ b/llvm/test/CodeGen/X86/mul-i256.ll
@@ -58,30 +58,31 @@ define void @test(i256* %a, i256* %b, i256* %out) #0 {
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT:    movl (%edi), %eax
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl (%edi), %esi
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl 4(%edi), %eax
-; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %esi, %ebp
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %ebx, %ecx
 ; X32-NEXT:    setb %bl
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    addl %ecx, %ebp
 ; X32-NEXT:    movzbl %bl, %eax
@@ -92,24 +93,25 @@ define void @test(i256* %a, i256* %b, i256* %out) #0 {
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 8(%eax), %ecx
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl 8(%eax), %edi
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, %edi
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 12(%eax), %ecx
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-NEXT:    movl 12(%eax), %esi
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %esi, %ecx
+; X32-NEXT:    movl %esi, (%esp) # 4-byte Spill
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, %edi
@@ -166,12 +168,13 @@ define void @test(i256* %a, i256* %b, i256* %out) #0 {
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl 16(%ecx), %edi
-; X32-NEXT:    imull %edi, %ebx
+; X32-NEXT:    movl %ebx, %esi
+; X32-NEXT:    imull %edi, %esi
 ; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-NEXT:    addl %ebx, %edx
+; X32-NEXT:    addl %esi, %edx
 ; X32-NEXT:    movl 20(%ecx), %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    imull %eax, %ebp
@@ -217,41 +220,41 @@ define void @test(i256* %a, i256* %b, i256* %out) #0 {
 ; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X32-NEXT:    adcl %ecx, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT:    movl 28(%ebx), %ecx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    imull %esi, %ecx
-; X32-NEXT:    movl 24(%ebx), %edi
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    movl 28(%edi), %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    imull %eax, %esi
+; X32-NEXT:    movl 24(%edi), %ecx
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    addl %ecx, %edx
-; X32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    addl %edx, %edi
-; X32-NEXT:    movl 16(%ebx), %ebp
-; X32-NEXT:    movl 20(%ebx), %ebx
+; X32-NEXT:    addl %esi, %edx
+; X32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    addl %edx, %ecx
+; X32-NEXT:    movl 16(%edi), %ebp
+; X32-NEXT:    movl 20(%edi), %ebx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    imull %ebx, %ecx
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    imull %ebx, %edi
 ; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    mull %ebp
-; X32-NEXT:    addl %ecx, %edx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    imull %ebp, %ecx
-; X32-NEXT:    addl %edx, %ecx
+; X32-NEXT:    addl %edi, %edx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    imull %ebp, %esi
+; X32-NEXT:    addl %edx, %esi
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %edi, %ecx
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl %ecx, %esi
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %esi
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %edi, %ebx
+; X32-NEXT:    addl %esi, %ebx
 ; X32-NEXT:    adcl $0, %ecx
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
@@ -309,52 +312,55 @@ define void @test(i256* %a, i256* %b, i256* %out) #0 {
 ; X64-NEXT:    .cfi_def_cfa_offset 16
 ; X64-NEXT:    pushq %r14
 ; X64-NEXT:    .cfi_def_cfa_offset 24
-; X64-NEXT:    pushq %rbx
+; X64-NEXT:    pushq %r12
 ; X64-NEXT:    .cfi_def_cfa_offset 32
-; X64-NEXT:    .cfi_offset %rbx, -32
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    .cfi_def_cfa_offset 40
+; X64-NEXT:    .cfi_offset %rbx, -40
+; X64-NEXT:    .cfi_offset %r12, -32
 ; X64-NEXT:    .cfi_offset %r14, -24
 ; X64-NEXT:    .cfi_offset %r15, -16
 ; X64-NEXT:    movq %rdx, %r9
-; X64-NEXT:    movq (%rdi), %r11
+; X64-NEXT:    movq (%rdi), %r14
 ; X64-NEXT:    movq 8(%rdi), %r8
-; X64-NEXT:    movq 16(%rdi), %rbx
-; X64-NEXT:    movq 16(%rsi), %r10
-; X64-NEXT:    movq (%rsi), %rcx
+; X64-NEXT:    movq 16(%rdi), %rcx
+; X64-NEXT:    movq 16(%rsi), %rbx
+; X64-NEXT:    movq (%rsi), %r12
 ; X64-NEXT:    movq 8(%rsi), %r15
 ; X64-NEXT:    movq 24(%rdi), %rdi
-; X64-NEXT:    imulq %rcx, %rdi
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    mulq %rbx
-; X64-NEXT:    movq %rax, %r14
+; X64-NEXT:    imulq %r12, %rdi
+; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rax, %r10
 ; X64-NEXT:    addq %rdi, %rdx
-; X64-NEXT:    imulq %r15, %rbx
-; X64-NEXT:    addq %rdx, %rbx
-; X64-NEXT:    movq %r10, %rdi
+; X64-NEXT:    imulq %r15, %rcx
+; X64-NEXT:    addq %rdx, %rcx
+; X64-NEXT:    movq %rbx, %rdi
 ; X64-NEXT:    imulq %r8, %rdi
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    mulq %r11
-; X64-NEXT:    movq %rax, %r10
+; X64-NEXT:    movq %rbx, %rax
+; X64-NEXT:    mulq %r14
+; X64-NEXT:    movq %rax, %r11
 ; X64-NEXT:    addq %rdi, %rdx
-; X64-NEXT:    movq 24(%rsi), %rdi
-; X64-NEXT:    imulq %r11, %rdi
-; X64-NEXT:    addq %rdx, %rdi
-; X64-NEXT:    addq %r14, %r10
-; X64-NEXT:    adcq %rbx, %rdi
-; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq 24(%rsi), %rbx
+; X64-NEXT:    imulq %r14, %rbx
+; X64-NEXT:    addq %rdx, %rbx
+; X64-NEXT:    addq %r10, %r11
+; X64-NEXT:    adcq %rcx, %rbx
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    mulq %r12
 ; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %r14
+; X64-NEXT:    movq %rax, %r10
 ; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    mulq %rcx
+; X64-NEXT:    mulq %r12
 ; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %rsi, %rbx
+; X64-NEXT:    movq %rax, %rdi
+; X64-NEXT:    addq %rsi, %rdi
 ; X64-NEXT:    adcq $0, %rcx
-; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    movq %r14, %rax
 ; X64-NEXT:    mulq %r15
 ; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    addq %rbx, %r11
+; X64-NEXT:    movq %rax, %r14
+; X64-NEXT:    addq %rdi, %r14
 ; X64-NEXT:    adcq %rcx, %rsi
 ; X64-NEXT:    setb %al
 ; X64-NEXT:    movzbl %al, %ecx
@@ -362,13 +368,15 @@ define void @test(i256* %a, i256* %b, i256* %out) #0 {
 ; X64-NEXT:    mulq %r15
 ; X64-NEXT:    addq %rsi, %rax
 ; X64-NEXT:    adcq %rcx, %rdx
-; X64-NEXT:    addq %r10, %rax
-; X64-NEXT:    adcq %rdi, %rdx
-; X64-NEXT:    movq %r14, (%r9)
-; X64-NEXT:    movq %r11, 8(%r9)
+; X64-NEXT:    addq %r11, %rax
+; X64-NEXT:    adcq %rbx, %rdx
+; X64-NEXT:    movq %r10, (%r9)
+; X64-NEXT:    movq %r14, 8(%r9)
 ; X64-NEXT:    movq %rax, 16(%r9)
 ; X64-NEXT:    movq %rdx, 24(%r9)
 ; X64-NEXT:    popq %rbx
+; X64-NEXT:    .cfi_def_cfa_offset 32
+; X64-NEXT:    popq %r12
 ; X64-NEXT:    .cfi_def_cfa_offset 24
 ; X64-NEXT:    popq %r14
 ; X64-NEXT:    .cfi_def_cfa_offset 16

diff  --git a/llvm/test/CodeGen/X86/mul-i512.ll b/llvm/test/CodeGen/X86/mul-i512.ll
index a5050467ac1af..19ac76e2ade53 100644
--- a/llvm/test/CodeGen/X86/mul-i512.ll
+++ b/llvm/test/CodeGen/X86/mul-i512.ll
@@ -9,7 +9,7 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
 ; X32-NEXT:    pushl %ebx
 ; X32-NEXT:    pushl %edi
 ; X32-NEXT:    pushl %esi
-; X32-NEXT:    subl $180, %esp
+; X32-NEXT:    subl $184, %esp
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X32-NEXT:    movl 28(%edx), %ecx
@@ -33,6 +33,7 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %ecx, %ebp
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -46,46 +47,45 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movl 16(%ecx), %ebx
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl 16(%ecx), %ebp
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl 20(%ecx), %eax
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl 20(%ecx), %ebx
+; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    addl %esi, %ecx
 ; X32-NEXT:    adcl $0, %edi
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl %ebp, %esi
-; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %edi, %ebp
 ; X32-NEXT:    setb %cl
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    addl %ebp, %esi
-; X32-NEXT:    movzbl %cl, %eax
-; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    movzbl %cl, %ecx
+; X32-NEXT:    adcl %ecx, %edx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 8(%eax), %ebp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    movl 8(%edi), %ebp
 ; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -95,25 +95,25 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
 ; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    addl %ecx, %ebx
 ; X32-NEXT:    adcl $0, %ebp
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 12(%eax), %ecx
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl 12(%edi), %ecx
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    movl %eax, %esi
 ; X32-NEXT:    adcl %ebp, %edi
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    addl %edi, %ecx
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %ecx
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
@@ -191,13 +191,13 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X32-NEXT:    movl 4(%ecx), %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    addl (%esp), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    mull %ebx
@@ -328,7 +328,7 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    addl %ecx, %eax
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -362,7 +362,7 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 24(%eax), %ebx
@@ -399,21 +399,21 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    addl (%esp), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    addl (%esp), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl %ebp, %ecx
@@ -421,7 +421,7 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ebp
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -431,8 +431,8 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
 ; X32-NEXT:    adcl %ecx, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    addl %ebx, %edi
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    adcl (%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %eax
 ; X32-NEXT:    adcl $0, %edx
@@ -454,7 +454,7 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
@@ -524,13 +524,13 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl (%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
@@ -539,13 +539,13 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    adcl %ebx, %ecx
-; X32-NEXT:    setb (%esp) # 1-byte Folded Spill
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edi, %ebx
 ; X32-NEXT:    movl %eax, %edi
 ; X32-NEXT:    addl %ecx, %edi
-; X32-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
@@ -554,7 +554,7 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ecx, %eax
@@ -583,14 +583,14 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
 ; X32-NEXT:    adcl %ecx, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    addl %edi, %ecx
-; X32-NEXT:    adcl (%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
 ; X32-NEXT:    adcl %esi, %eax
 ; X32-NEXT:    movl %eax, %esi
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X32-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -624,7 +624,7 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
 ; X32-NEXT:    movl 36(%eax), %ecx
 ; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl %ecx, %esi
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %ebx, %eax
@@ -652,18 +652,18 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
 ; X32-NEXT:    addl %ecx, %ebp
 ; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %edi, %esi
-; X32-NEXT:    setb (%esp) # 1-byte Folded Spill
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, %edi
 ; X32-NEXT:    addl %esi, %edi
-; X32-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
@@ -676,17 +676,17 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl (%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 44(%eax), %esi
-; X32-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ecx
@@ -722,14 +722,14 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    mull (%esp) # 4-byte Folded Reload
+; X32-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    adcl %edi, %ecx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    movl (%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
@@ -747,7 +747,7 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
 ; X32-NEXT:    imull %eax, %esi
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    addl %esi, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    imull %ebx, %ecx
@@ -755,7 +755,7 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl (%esp), %edi # 4-byte Reload
 ; X32-NEXT:    imull %edi, %esi
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    mull %ebp
@@ -765,8 +765,8 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    imull %ebp, %esi
 ; X32-NEXT:    addl %edx, %esi
-; X32-NEXT:    addl (%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -787,12 +787,12 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ecx
 ; X32-NEXT:    setb %bl
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movzbl %bl, %ecx
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -882,13 +882,13 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %ecx, %eax
-; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %ebx, %esi
 ; X32-NEXT:    setb %cl
 ; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    addl %esi, %eax
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X32-NEXT:    movzbl %cl, %eax
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -923,9 +923,9 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
 ; X32-NEXT:    movzbl %cl, %eax
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
-; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ecx, %eax
@@ -955,24 +955,24 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %ecx
 ; X32-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl (%esp), %ebp # 4-byte Folded Reload
-; X32-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    adcl $0, %ecx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    addl (%esp), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    addl (%esp), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
@@ -980,17 +980,17 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    adcl %ebx, %esi
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    setb (%esp) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    addl %esi, %eax
 ; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl %ecx, %ebp
-; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %ebp, (%esp) # 4-byte Spill
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %esi
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -998,21 +998,21 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl 48(%ecx), %edi
-; X32-NEXT:    imull %edi, %ebx
+; X32-NEXT:    movl %ebx, %esi
+; X32-NEXT:    imull %edi, %esi
 ; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    addl %ebx, %edx
+; X32-NEXT:    addl %esi, %edx
 ; X32-NEXT:    movl 52(%ecx), %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    imull %eax, %esi
-; X32-NEXT:    addl %edx, %esi
-; X32-NEXT:    movl %esi, %ebp
+; X32-NEXT:    imull %eax, %ebx
+; X32-NEXT:    addl %edx, %ebx
 ; X32-NEXT:    movl 56(%ecx), %eax
 ; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    imull %ebx, %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    imull %ebp, %esi
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    addl %esi, %edx
@@ -1022,16 +1022,16 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
 ; X32-NEXT:    addl %edx, %esi
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %ebp, %esi
+; X32-NEXT:    adcl %ebx, %esi
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ebp, %ebx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ecx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
@@ -1050,17 +1050,18 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    imull %esi, %edi
-; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %edi, %edx
-; X32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT:    addl %edx, %ecx
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    imull %eax, %ecx
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    addl %ecx, %edx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    imull %ebp, %edi
+; X32-NEXT:    addl %edx, %edi
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
@@ -1071,12 +1072,11 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    imull %ebx, %ecx
 ; X32-NEXT:    addl %edx, %ecx
-; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl %ebx, %ebp
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -1086,60 +1086,56 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    addl %ebx, %ecx
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %ecx, %ebp
-; X32-NEXT:    adcl %esi, %edi
-; X32-NEXT:    setb %cl
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ecx, %edi
+; X32-NEXT:    adcl %esi, %ebx
+; X32-NEXT:    setb %cl
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    mull %ebp
+; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movzbl %cl, %ecx
-; X32-NEXT:    adcl %ecx, %ebx
+; X32-NEXT:    adcl %ecx, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl (%esp), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    movl %esi, (%ecx)
@@ -1163,12 +1159,12 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
 ; X32-NEXT:    movl %esi, 36(%ecx)
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    movl %esi, 40(%ecx)
-; X32-NEXT:    movl %edi, 44(%ecx)
-; X32-NEXT:    movl %edx, 48(%ecx)
-; X32-NEXT:    movl %ebp, 52(%ecx)
+; X32-NEXT:    movl %ebx, 44(%ecx)
+; X32-NEXT:    movl %ebp, 48(%ecx)
+; X32-NEXT:    movl %edi, 52(%ecx)
 ; X32-NEXT:    movl %eax, 56(%ecx)
-; X32-NEXT:    movl %ebx, 60(%ecx)
-; X32-NEXT:    addl $180, %esp
+; X32-NEXT:    movl %edx, 60(%ecx)
+; X32-NEXT:    addl $184, %esp
 ; X32-NEXT:    popl %esi
 ; X32-NEXT:    popl %edi
 ; X32-NEXT:    popl %ebx
@@ -1183,23 +1179,22 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
 ; X64-NEXT:    pushq %r13
 ; X64-NEXT:    pushq %r12
 ; X64-NEXT:    pushq %rbx
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movq %rdx, (%rsp) # 8-byte Spill
+; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq (%rdi), %r9
-; X64-NEXT:    movq 8(%rdi), %r15
-; X64-NEXT:    movq 24(%rdi), %r12
+; X64-NEXT:    movq 8(%rdi), %r8
+; X64-NEXT:    movq 24(%rdi), %r15
 ; X64-NEXT:    movq 16(%rdi), %rax
-; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq (%rsi), %rdi
 ; X64-NEXT:    movq 8(%rsi), %r14
+; X64-NEXT:    movq %rsi, %r12
 ; X64-NEXT:    movq %rax, %rsi
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rdx, %rbp
 ; X64-NEXT:    movq %rax, %rbx
@@ -1213,36 +1208,35 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
 ; X64-NEXT:    adcq %rbp, %rcx
 ; X64-NEXT:    setb %al
 ; X64-NEXT:    movzbl %al, %esi
-; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    movq %r15, %rax
 ; X64-NEXT:    mulq %r14
+; X64-NEXT:    movq %r14, %r15
 ; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    addq %rcx, %r13
-; X64-NEXT:    adcq %rsi, %r8
+; X64-NEXT:    movq %rax, %r14
+; X64-NEXT:    addq %rcx, %r14
+; X64-NEXT:    adcq %rsi, %rdx
+; X64-NEXT:    movq %rdx, %r13
 ; X64-NEXT:    movq %r9, %rax
 ; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    movq %r8, %rax
 ; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rdx, %rbx
 ; X64-NEXT:    movq %rax, %rbp
 ; X64-NEXT:    addq %rcx, %rbp
 ; X64-NEXT:    adcq $0, %rbx
 ; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    movq %r9, %r12
-; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r14
+; X64-NEXT:    movq %r9, %rdi
+; X64-NEXT:    mulq %r15
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    addq %rbp, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq %rbx, %rcx
 ; X64-NEXT:    setb %sil
-; X64-NEXT:    movq %r15, %rdi
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    mulq %r14
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    mulq %r15
 ; X64-NEXT:    movq %rdx, %r15
 ; X64-NEXT:    movq %rax, %rbx
 ; X64-NEXT:    addq %rcx, %rbx
@@ -1250,190 +1244,190 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
 ; X64-NEXT:    adcq %rax, %r15
 ; X64-NEXT:    addq %r11, %rbx
 ; X64-NEXT:    adcq %r10, %r15
-; X64-NEXT:    adcq $0, %r13
-; X64-NEXT:    movq %r8, %r14
 ; X64-NEXT:    adcq $0, %r14
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    movq 16(%rsi), %r8
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    movq %rax, %r9
+; X64-NEXT:    adcq $0, %r13
+; X64-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 16(%r12), %r9
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    movq %rdi, %r11
 ; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r8
+; X64-NEXT:    mulq %r9
+; X64-NEXT:    movq %rdx, %r10
+; X64-NEXT:    movq %rax, %r13
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    movq %r8, %r11
+; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, %rbp
 ; X64-NEXT:    addq %r10, %rbp
 ; X64-NEXT:    adcq $0, %rcx
-; X64-NEXT:    movq 24(%rsi), %rdi
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %rdi
+; X64-NEXT:    movq 24(%r12), %r8
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    addq %rbp, %rax
 ; X64-NEXT:    movq %rax, %rbp
 ; X64-NEXT:    adcq %rcx, %rsi
 ; X64-NEXT:    setb %cl
 ; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    mulq %rdi
+; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %r11
 ; X64-NEXT:    movq %rax, %r12
 ; X64-NEXT:    addq %rsi, %r12
 ; X64-NEXT:    movzbl %cl, %eax
 ; X64-NEXT:    adcq %rax, %r11
-; X64-NEXT:    addq %rbx, %r9
-; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    addq %rbx, %r13
+; X64-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq %r15, %rbp
 ; X64-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq $0, %r12
 ; X64-NEXT:    adcq $0, %r11
-; X64-NEXT:    addq %r13, %r12
-; X64-NEXT:    adcq %r14, %r11
-; X64-NEXT:    setb %r9b
+; X64-NEXT:    addq %r14, %r12
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; X64-NEXT:    setb %r15b
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
 ; X64-NEXT:    movq %rbx, %rax
-; X64-NEXT:    mulq %r8
+; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %r14
+; X64-NEXT:    movq %rax, %rdi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
 ; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    mulq %r8
+; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %rbp
 ; X64-NEXT:    addq %rcx, %rbp
 ; X64-NEXT:    adcq $0, %rsi
 ; X64-NEXT:    movq %rbx, %rax
-; X64-NEXT:    mulq %rdi
+; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    addq %rbp, %rax
 ; X64-NEXT:    movq %rax, %rbp
 ; X64-NEXT:    adcq %rsi, %rcx
 ; X64-NEXT:    setb %bl
 ; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    mulq %rdi
-; X64-NEXT:    addq %rcx, %rax
-; X64-NEXT:    movzbl %bl, %ecx
-; X64-NEXT:    adcq %rcx, %rdx
-; X64-NEXT:    addq %r12, %r14
-; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    mulq %r8
+; X64-NEXT:    movq %rax, %r14
+; X64-NEXT:    addq %rcx, %r14
+; X64-NEXT:    movzbl %bl, %eax
+; X64-NEXT:    adcq %rax, %rdx
+; X64-NEXT:    addq %r12, %rdi
+; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq %r11, %rbp
 ; X64-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movzbl %r9b, %ecx
-; X64-NEXT:    adcq %rcx, %rax
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movzbl %r15b, %eax
+; X64-NEXT:    adcq %rax, %r14
 ; X64-NEXT:    adcq $0, %rdx
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    movq 32(%rcx), %r10
-; X64-NEXT:    imulq %r10, %rdi
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    mulq %r8
+; X64-NEXT:    movq 32(%rcx), %r12
+; X64-NEXT:    imulq %r12, %r8
+; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    addq %rdi, %rdx
-; X64-NEXT:    movq 40(%rcx), %r9
-; X64-NEXT:    imulq %r9, %r8
-; X64-NEXT:    addq %rdx, %r8
+; X64-NEXT:    addq %r8, %rdx
+; X64-NEXT:    movq 40(%rcx), %r8
+; X64-NEXT:    imulq %r8, %r9
+; X64-NEXT:    addq %rdx, %r9
 ; X64-NEXT:    movq 48(%rcx), %rax
+; X64-NEXT:    movq %rcx, %rbp
 ; X64-NEXT:    movq %rax, %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; X64-NEXT:    imulq %r15, %rdi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; X64-NEXT:    imulq %rbx, %rdi
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rax, %r12
+; X64-NEXT:    mulq %rbx
+; X64-NEXT:    movq %rax, %rcx
 ; X64-NEXT:    addq %rdi, %rdx
-; X64-NEXT:    movq 56(%rcx), %rbp
-; X64-NEXT:    imulq %rsi, %rbp
+; X64-NEXT:    movq 56(%rbp), %rbp
+; X64-NEXT:    imulq %rbx, %rbp
 ; X64-NEXT:    addq %rdx, %rbp
-; X64-NEXT:    addq %r11, %r12
-; X64-NEXT:    adcq %r8, %rbp
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    movq %rsi, %rcx
-; X64-NEXT:    mulq %r10
-; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    addq %r11, %rcx
+; X64-NEXT:    adcq %r9, %rbp
 ; X64-NEXT:    movq %rbx, %rax
 ; X64-NEXT:    movq %rbx, %r11
-; X64-NEXT:    mulq %r10
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %rdi, %rbx
-; X64-NEXT:    adcq $0, %rsi
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    mulq %r9
+; X64-NEXT:    mulq %r12
+; X64-NEXT:    movq %rdx, %r9
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    mulq %r12
 ; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    movq %rax, %r10
-; X64-NEXT:    addq %rbx, %r10
-; X64-NEXT:    adcq %rsi, %rdi
-; X64-NEXT:    setb %bl
+; X64-NEXT:    movq %rax, %rbx
+; X64-NEXT:    addq %r9, %rbx
+; X64-NEXT:    adcq $0, %rdi
 ; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    mulq %r9
+; X64-NEXT:    mulq %r8
+; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    movq %rax, %r12
+; X64-NEXT:    addq %rbx, %r12
+; X64-NEXT:    adcq %rdi, %rsi
+; X64-NEXT:    setb %bl
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %r15
 ; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    addq %rdi, %r13
+; X64-NEXT:    addq %rsi, %r13
 ; X64-NEXT:    movzbl %bl, %eax
 ; X64-NEXT:    adcq %rax, %r15
-; X64-NEXT:    addq %r12, %r13
+; X64-NEXT:    addq %rcx, %r13
 ; X64-NEXT:    adcq %rbp, %r15
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
 ; X64-NEXT:    movq 56(%rdx), %rcx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; X64-NEXT:    imulq %rax, %rcx
-; X64-NEXT:    movq 48(%rdx), %rbp
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    mulq %rbp
-; X64-NEXT:    movq %rax, %r12
+; X64-NEXT:    movq 48(%rdx), %rbx
+; X64-NEXT:    movq %rdx, %r8
+; X64-NEXT:    movq %rax, %rbp
+; X64-NEXT:    mulq %rbx
+; X64-NEXT:    movq %rax, %rsi
 ; X64-NEXT:    addq %rcx, %rdx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT:    imulq %r8, %rbp
-; X64-NEXT:    addq %rdx, %rbp
-; X64-NEXT:    movq 32(%rsi), %rdi
-; X64-NEXT:    movq 40(%rsi), %rbx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT:    imulq %r9, %rbx
+; X64-NEXT:    addq %rdx, %rbx
+; X64-NEXT:    movq 32(%r8), %rdi
+; X64-NEXT:    movq 40(%r8), %r8
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    imulq %rbx, %rsi
-; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    addq %rsi, %rdx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT:    imulq %rdi, %r14
-; X64-NEXT:    addq %rdx, %r14
-; X64-NEXT:    addq %r12, %rcx
-; X64-NEXT:    adcq %rbp, %r14
+; X64-NEXT:    imulq %r8, %rcx
+; X64-NEXT:    mulq %rdi
+; X64-NEXT:    movq %rax, %r11
+; X64-NEXT:    addq %rcx, %rdx
+; X64-NEXT:    imulq %rdi, %r10
+; X64-NEXT:    addq %rdx, %r10
+; X64-NEXT:    addq %rsi, %r11
+; X64-NEXT:    adcq %rbx, %r10
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq %r11
-; X64-NEXT:    movq %rdx, %rbp
+; X64-NEXT:    mulq %rbp
+; X64-NEXT:    movq %rdx, %rbx
 ; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    movq %rbx, %rax
-; X64-NEXT:    mulq %r11
-; X64-NEXT:    movq %rdx, %r11
-; X64-NEXT:    movq %rax, %r12
-; X64-NEXT:    addq %rbp, %r12
-; X64-NEXT:    adcq $0, %r11
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    mulq %rbp
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %rax, %rbp
+; X64-NEXT:    addq %rbx, %rbp
+; X64-NEXT:    adcq $0, %rcx
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %rbp
+; X64-NEXT:    mulq %r9
+; X64-NEXT:    movq %rdx, %rbx
 ; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    addq %r12, %rdi
-; X64-NEXT:    adcq %r11, %rbp
-; X64-NEXT:    setb %r9b
-; X64-NEXT:    movq %rbx, %rax
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    addq %rbp, %rax
-; X64-NEXT:    movzbl %r9b, %ebp
-; X64-NEXT:    adcq %rbp, %rdx
-; X64-NEXT:    addq %rcx, %rax
-; X64-NEXT:    adcq %r14, %rdx
+; X64-NEXT:    addq %rbp, %rdi
+; X64-NEXT:    adcq %rcx, %rbx
+; X64-NEXT:    setb %cl
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    mulq %r9
+; X64-NEXT:    addq %rbx, %rax
+; X64-NEXT:    movzbl %cl, %ecx
+; X64-NEXT:    adcq %rcx, %rdx
+; X64-NEXT:    addq %r11, %rax
+; X64-NEXT:    adcq %r10, %rdx
 ; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; X64-NEXT:    adcq %r10, %rdi
+; X64-NEXT:    adcq %r12, %rdi
 ; X64-NEXT:    adcq %r13, %rax
 ; X64-NEXT:    adcq %r15, %rdx
 ; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; X64-NEXT:    adcq %r14, %rax
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
-; X64-NEXT:    movq (%rsp), %rcx # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
 ; X64-NEXT:    movq %rbp, (%rcx)
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
@@ -1446,7 +1440,6 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
 ; X64-NEXT:    movq %rdi, 40(%rcx)
 ; X64-NEXT:    movq %rax, 48(%rcx)
 ; X64-NEXT:    movq %rdx, 56(%rcx)
-; X64-NEXT:    addq $8, %rsp
 ; X64-NEXT:    popq %rbx
 ; X64-NEXT:    popq %r12
 ; X64-NEXT:    popq %r13

diff  --git a/llvm/test/CodeGen/X86/mul128.ll b/llvm/test/CodeGen/X86/mul128.ll
index fd517357d5fa1..44cc1feb8188c 100644
--- a/llvm/test/CodeGen/X86/mul128.ll
+++ b/llvm/test/CodeGen/X86/mul128.ll
@@ -30,54 +30,55 @@ define i128 @foo(i128 %t, i128 %u) {
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    imull %edx, %esi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %edx
-; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    imull %edi, %ecx
-; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    addl %esi, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    imull %ecx, %ebp
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    imull %ebp, %edi
+; X86-NEXT:    imull %esi, %edi
 ; X86-NEXT:    addl %edx, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    imull %ebp, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    imull %esi, %ecx
+; X86-NEXT:    addl %edx, %ecx
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    adcl %edi, %ecx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %ebx, %ecx
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %ecx, %ebp
-; X86-NEXT:    adcl %esi, %ebx
-; X86-NEXT:    setb %cl
+; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    adcl %ebx, %edi
+; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movzbl %bl, %esi
+; X86-NEXT:    adcl %esi, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    adcl %edi, %edx
+; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
 ; X86-NEXT:    movl %esi, (%ecx)

diff  --git a/llvm/test/CodeGen/X86/neg-abs.ll b/llvm/test/CodeGen/X86/neg-abs.ll
index 10896bb564c80..249ac1f72837e 100644
--- a/llvm/test/CodeGen/X86/neg-abs.ll
+++ b/llvm/test/CodeGen/X86/neg-abs.ll
@@ -113,27 +113,27 @@ define i128 @neg_abs_i128(i128 %x) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    xorl %edx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    sarl $31, %ecx
+; X86-NEXT:    xorl %ecx, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    xorl %edx, %esi
+; X86-NEXT:    xorl %ecx, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    xorl %edx, %edi
+; X86-NEXT:    xorl %ecx, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    xorl %edx, %ebx
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    xorl %ecx, %ebx
+; X86-NEXT:    movl %ecx, %ebp
 ; X86-NEXT:    subl %ebx, %ebp
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %ecx, %ebx
 ; X86-NEXT:    sbbl %edi, %ebx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    sbbl %esi, %edi
-; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl %ebp, (%eax)
 ; X86-NEXT:    movl %ebx, 4(%eax)
 ; X86-NEXT:    movl %edi, 8(%eax)
-; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    movl %ecx, 12(%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx

diff  --git a/llvm/test/CodeGen/X86/nontemporal.ll b/llvm/test/CodeGen/X86/nontemporal.ll
index 40f976bf4960a..863db837f7c09 100644
--- a/llvm/test/CodeGen/X86/nontemporal.ll
+++ b/llvm/test/CodeGen/X86/nontemporal.ll
@@ -59,31 +59,31 @@ define i32 @f(<4 x float> %A, i8* %B, <2 x double> %C, i32 %D, <2 x i64> %E, <4
 ; X86-AVX-NEXT:    vmovdqa 56(%ebp), %xmm4
 ; X86-AVX-NEXT:    vmovdqa 40(%ebp), %xmm5
 ; X86-AVX-NEXT:    vmovdqa 24(%ebp), %xmm6
-; X86-AVX-NEXT:    movl 8(%ebp), %edx
-; X86-AVX-NEXT:    movl 80(%ebp), %esi
-; X86-AVX-NEXT:    movl (%esi), %eax
+; X86-AVX-NEXT:    movl 8(%ebp), %esi
+; X86-AVX-NEXT:    movl 80(%ebp), %edx
+; X86-AVX-NEXT:    movl (%edx), %eax
 ; X86-AVX-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT:    vmovntps %xmm0, (%edx)
+; X86-AVX-NEXT:    vmovntps %xmm0, (%esi)
 ; X86-AVX-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2, %xmm0
-; X86-AVX-NEXT:    addl (%esi), %eax
-; X86-AVX-NEXT:    vmovntdq %xmm0, (%edx)
+; X86-AVX-NEXT:    addl (%edx), %eax
+; X86-AVX-NEXT:    vmovntdq %xmm0, (%esi)
 ; X86-AVX-NEXT:    vaddpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm0
-; X86-AVX-NEXT:    addl (%esi), %eax
-; X86-AVX-NEXT:    vmovntpd %xmm0, (%edx)
+; X86-AVX-NEXT:    addl (%edx), %eax
+; X86-AVX-NEXT:    vmovntpd %xmm0, (%esi)
 ; X86-AVX-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm6, %xmm0
-; X86-AVX-NEXT:    addl (%esi), %eax
-; X86-AVX-NEXT:    vmovntdq %xmm0, (%edx)
+; X86-AVX-NEXT:    addl (%edx), %eax
+; X86-AVX-NEXT:    vmovntdq %xmm0, (%esi)
 ; X86-AVX-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm5, %xmm0
-; X86-AVX-NEXT:    addl (%esi), %eax
-; X86-AVX-NEXT:    vmovntdq %xmm0, (%edx)
+; X86-AVX-NEXT:    addl (%edx), %eax
+; X86-AVX-NEXT:    vmovntdq %xmm0, (%esi)
 ; X86-AVX-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4, %xmm0
-; X86-AVX-NEXT:    addl (%esi), %eax
-; X86-AVX-NEXT:    vmovntdq %xmm0, (%edx)
-; X86-AVX-NEXT:    addl (%esi), %eax
-; X86-AVX-NEXT:    movntil %ecx, (%edx)
-; X86-AVX-NEXT:    addl (%esi), %eax
-; X86-AVX-NEXT:    vmovsd %xmm3, (%edx)
-; X86-AVX-NEXT:    addl (%esi), %eax
+; X86-AVX-NEXT:    addl (%edx), %eax
+; X86-AVX-NEXT:    vmovntdq %xmm0, (%esi)
+; X86-AVX-NEXT:    addl (%edx), %eax
+; X86-AVX-NEXT:    movntil %ecx, (%esi)
+; X86-AVX-NEXT:    addl (%edx), %eax
+; X86-AVX-NEXT:    vmovsd %xmm3, (%esi)
+; X86-AVX-NEXT:    addl (%edx), %eax
 ; X86-AVX-NEXT:    leal -4(%ebp), %esp
 ; X86-AVX-NEXT:    popl %esi
 ; X86-AVX-NEXT:    popl %ebp

diff  --git a/llvm/test/CodeGen/X86/nosse-vector.ll b/llvm/test/CodeGen/X86/nosse-vector.ll
index 1203e2921f62f..419aee291aed2 100644
--- a/llvm/test/CodeGen/X86/nosse-vector.ll
+++ b/llvm/test/CodeGen/X86/nosse-vector.ll
@@ -144,20 +144,20 @@ define void @sitofp_4i64_4f32_mem(<4 x i64>* %p0, <4 x float>* %p1) nounwind {
 ; X32-NEXT:    pushl %esi
 ; X32-NEXT:    andl $-8, %esp
 ; X32-NEXT:    subl $48, %esp
-; X32-NEXT:    movl 8(%ebp), %eax
-; X32-NEXT:    movl 24(%eax), %ecx
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl 28(%eax), %ecx
-; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X32-NEXT:    movl 16(%eax), %esi
-; X32-NEXT:    movl 20(%eax), %edi
-; X32-NEXT:    movl 8(%eax), %ebx
-; X32-NEXT:    movl 12(%eax), %edx
-; X32-NEXT:    movl (%eax), %ecx
-; X32-NEXT:    movl 4(%eax), %eax
+; X32-NEXT:    movl 8(%ebp), %edx
+; X32-NEXT:    movl 24(%edx), %eax
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl 28(%edx), %eax
+; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    movl 16(%edx), %esi
+; X32-NEXT:    movl 20(%edx), %edi
+; X32-NEXT:    movl 8(%edx), %ebx
+; X32-NEXT:    movl 12(%edx), %ecx
+; X32-NEXT:    movl (%edx), %eax
+; X32-NEXT:    movl 4(%edx), %edx
+; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X32-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X32-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp)

diff  --git a/llvm/test/CodeGen/X86/overflow.ll b/llvm/test/CodeGen/X86/overflow.ll
index 6e7850068e889..069c55cd4e09c 100644
--- a/llvm/test/CodeGen/X86/overflow.ll
+++ b/llvm/test/CodeGen/X86/overflow.ll
@@ -9,11 +9,11 @@ define i128 @mulhioverflow(i64 %a, i64 %b, i64 %c) nounwind {
 ; X32-NEXT:    pushl %ebx
 ; X32-NEXT:    pushl %edi
 ; X32-NEXT:    pushl %esi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %ebp, %eax
@@ -22,15 +22,15 @@ define i128 @mulhioverflow(i64 %a, i64 %b, i64 %c) nounwind {
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    addl %edi, %ebp
 ; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %ebp, %eax
-; X32-NEXT:    adcl %ebx, %ecx
+; X32-NEXT:    adcl %ebx, %esi
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    mull %esi
-; X32-NEXT:    addl %ecx, %eax
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    addl %esi, %eax
 ; X32-NEXT:    movzbl %bl, %ecx
 ; X32-NEXT:    adcl %ecx, %edx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx

diff  --git a/llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll b/llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
index 6333c418aea5d..1fd2cbe82df5c 100644
--- a/llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
+++ b/llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
@@ -276,8 +276,8 @@ define i64 @test_two_live_flags(i64* %foo0, i64 %bar0, i64 %baz0, i64* %foo1, i6
 ; CHECK32-NEXT:    pushl %edi
 ; CHECK32-NEXT:    pushl %esi
 ; CHECK32-NEXT:    pushl %eax
-; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
@@ -286,8 +286,8 @@ define i64 @test_two_live_flags(i64* %foo0, i64 %bar0, i64 %baz0, i64* %foo1, i6
 ; CHECK32-NEXT:    lock cmpxchg8b (%esi)
 ; CHECK32-NEXT:    setne {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT:    movl %edi, %edx
-; CHECK32-NEXT:    movl %ebp, %ecx
+; CHECK32-NEXT:    movl %ebp, %edx
+; CHECK32-NEXT:    movl %edi, %ecx
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; CHECK32-NEXT:    lock cmpxchg8b (%esi)

diff  --git a/llvm/test/CodeGen/X86/popcnt.ll b/llvm/test/CodeGen/X86/popcnt.ll
index 09bab90813306..c1755479e4c69 100644
--- a/llvm/test/CodeGen/X86/popcnt.ll
+++ b/llvm/test/CodeGen/X86/popcnt.ll
@@ -445,42 +445,42 @@ define i128 @cnt128(i128 %x) nounwind readnone {
 ; X86-SSE2-LABEL: cnt128:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT:    psrlw $1, %xmm1
-; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
-; X86-SSE2-NEXT:    pand %xmm2, %xmm1
-; X86-SSE2-NEXT:    psubb %xmm1, %xmm0
-; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
-; X86-SSE2-NEXT:    pand %xmm1, %xmm3
-; X86-SSE2-NEXT:    psrlw $2, %xmm0
+; X86-SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT:    psrlw $1, %xmm0
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
 ; X86-SSE2-NEXT:    pand %xmm1, %xmm0
-; X86-SSE2-NEXT:    paddb %xmm3, %xmm0
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
-; X86-SSE2-NEXT:    psrlw $4, %xmm3
-; X86-SSE2-NEXT:    paddb %xmm0, %xmm3
-; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X86-SSE2-NEXT:    psubb %xmm0, %xmm2
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm3
 ; X86-SSE2-NEXT:    pand %xmm0, %xmm3
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm4
-; X86-SSE2-NEXT:    psadbw %xmm4, %xmm3
-; X86-SSE2-NEXT:    movd %xmm3, %ecx
-; X86-SSE2-NEXT:    movq {{.*#+}} xmm3 = mem[0],zero
-; X86-SSE2-NEXT:    movdqa %xmm3, %xmm5
+; X86-SSE2-NEXT:    psrlw $2, %xmm2
+; X86-SSE2-NEXT:    pand %xmm0, %xmm2
+; X86-SSE2-NEXT:    paddb %xmm3, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm4
+; X86-SSE2-NEXT:    psrlw $4, %xmm4
+; X86-SSE2-NEXT:    paddb %xmm2, %xmm4
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X86-SSE2-NEXT:    pand %xmm2, %xmm4
+; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
+; X86-SSE2-NEXT:    psadbw %xmm3, %xmm4
+; X86-SSE2-NEXT:    movd %xmm4, %ecx
+; X86-SSE2-NEXT:    movq {{.*#+}} xmm4 = mem[0],zero
+; X86-SSE2-NEXT:    movdqa %xmm4, %xmm5
 ; X86-SSE2-NEXT:    psrlw $1, %xmm5
-; X86-SSE2-NEXT:    pand %xmm2, %xmm5
-; X86-SSE2-NEXT:    psubb %xmm5, %xmm3
-; X86-SSE2-NEXT:    movdqa %xmm3, %xmm2
-; X86-SSE2-NEXT:    pand %xmm1, %xmm2
-; X86-SSE2-NEXT:    psrlw $2, %xmm3
-; X86-SSE2-NEXT:    pand %xmm1, %xmm3
-; X86-SSE2-NEXT:    paddb %xmm2, %xmm3
-; X86-SSE2-NEXT:    movdqa %xmm3, %xmm1
-; X86-SSE2-NEXT:    psrlw $4, %xmm1
-; X86-SSE2-NEXT:    paddb %xmm3, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm5
+; X86-SSE2-NEXT:    psubb %xmm5, %xmm4
+; X86-SSE2-NEXT:    movdqa %xmm4, %xmm1
 ; X86-SSE2-NEXT:    pand %xmm0, %xmm1
-; X86-SSE2-NEXT:    psadbw %xmm4, %xmm1
-; X86-SSE2-NEXT:    movd %xmm1, %edx
+; X86-SSE2-NEXT:    psrlw $2, %xmm4
+; X86-SSE2-NEXT:    pand %xmm0, %xmm4
+; X86-SSE2-NEXT:    paddb %xmm1, %xmm4
+; X86-SSE2-NEXT:    movdqa %xmm4, %xmm0
+; X86-SSE2-NEXT:    psrlw $4, %xmm0
+; X86-SSE2-NEXT:    paddb %xmm4, %xmm0
+; X86-SSE2-NEXT:    pand %xmm2, %xmm0
+; X86-SSE2-NEXT:    psadbw %xmm3, %xmm0
+; X86-SSE2-NEXT:    movd %xmm0, %edx
 ; X86-SSE2-NEXT:    addl %ecx, %edx
 ; X86-SSE2-NEXT:    movl %edx, (%eax)
 ; X86-SSE2-NEXT:    movl $0, 12(%eax)
@@ -491,32 +491,32 @@ define i128 @cnt128(i128 %x) nounwind readnone {
 ; X86-SSSE3-LABEL: cnt128:
 ; X86-SSSE3:       # %bb.0:
 ; X86-SSSE3-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X86-SSSE3-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X86-SSSE3-NEXT:    movdqa %xmm1, %xmm2
-; X86-SSSE3-NEXT:    pand %xmm0, %xmm2
-; X86-SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; X86-SSSE3-NEXT:    movdqa %xmm3, %xmm4
-; X86-SSSE3-NEXT:    pshufb %xmm2, %xmm4
-; X86-SSSE3-NEXT:    psrlw $4, %xmm1
-; X86-SSSE3-NEXT:    pand %xmm0, %xmm1
-; X86-SSSE3-NEXT:    movdqa %xmm3, %xmm2
-; X86-SSSE3-NEXT:    pshufb %xmm1, %xmm2
-; X86-SSSE3-NEXT:    paddb %xmm4, %xmm2
-; X86-SSSE3-NEXT:    pxor %xmm1, %xmm1
-; X86-SSSE3-NEXT:    psadbw %xmm1, %xmm2
-; X86-SSSE3-NEXT:    movd %xmm2, %ecx
+; X86-SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; X86-SSSE3-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
-; X86-SSSE3-NEXT:    movdqa %xmm2, %xmm4
-; X86-SSSE3-NEXT:    pand %xmm0, %xmm4
-; X86-SSSE3-NEXT:    movdqa %xmm3, %xmm5
-; X86-SSSE3-NEXT:    pshufb %xmm4, %xmm5
+; X86-SSSE3-NEXT:    movdqa %xmm2, %xmm3
+; X86-SSSE3-NEXT:    pand %xmm1, %xmm3
+; X86-SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X86-SSSE3-NEXT:    movdqa %xmm0, %xmm4
+; X86-SSSE3-NEXT:    pshufb %xmm3, %xmm4
 ; X86-SSSE3-NEXT:    psrlw $4, %xmm2
-; X86-SSSE3-NEXT:    pand %xmm0, %xmm2
+; X86-SSSE3-NEXT:    pand %xmm1, %xmm2
+; X86-SSSE3-NEXT:    movdqa %xmm0, %xmm3
 ; X86-SSSE3-NEXT:    pshufb %xmm2, %xmm3
-; X86-SSSE3-NEXT:    paddb %xmm5, %xmm3
-; X86-SSSE3-NEXT:    psadbw %xmm1, %xmm3
-; X86-SSSE3-NEXT:    movd %xmm3, %edx
+; X86-SSSE3-NEXT:    paddb %xmm4, %xmm3
+; X86-SSSE3-NEXT:    pxor %xmm2, %xmm2
+; X86-SSSE3-NEXT:    psadbw %xmm2, %xmm3
+; X86-SSSE3-NEXT:    movd %xmm3, %ecx
+; X86-SSSE3-NEXT:    movq {{.*#+}} xmm3 = mem[0],zero
+; X86-SSSE3-NEXT:    movdqa %xmm3, %xmm4
+; X86-SSSE3-NEXT:    pand %xmm1, %xmm4
+; X86-SSSE3-NEXT:    movdqa %xmm0, %xmm5
+; X86-SSSE3-NEXT:    pshufb %xmm4, %xmm5
+; X86-SSSE3-NEXT:    psrlw $4, %xmm3
+; X86-SSSE3-NEXT:    pand %xmm1, %xmm3
+; X86-SSSE3-NEXT:    pshufb %xmm3, %xmm0
+; X86-SSSE3-NEXT:    paddb %xmm5, %xmm0
+; X86-SSSE3-NEXT:    psadbw %xmm2, %xmm0
+; X86-SSSE3-NEXT:    movd %xmm0, %edx
 ; X86-SSSE3-NEXT:    addl %ecx, %edx
 ; X86-SSSE3-NEXT:    movl %edx, (%eax)
 ; X86-SSSE3-NEXT:    movl $0, 12(%eax)
@@ -667,41 +667,41 @@ define i64 @cnt64_optsize(i64 %x) nounwind readnone optsize {
 ; X86-NOSSE-NEXT:    pushl %edi
 ; X86-NOSSE-NEXT:    pushl %esi
 ; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NOSSE-NEXT:    movl %ecx, %edx
-; X86-NOSSE-NEXT:    shrl %edx
-; X86-NOSSE-NEXT:    movl $1431655765, %esi # imm = 0x55555555
-; X86-NOSSE-NEXT:    andl %esi, %edx
-; X86-NOSSE-NEXT:    subl %edx, %ecx
-; X86-NOSSE-NEXT:    movl $858993459, %edx # imm = 0x33333333
-; X86-NOSSE-NEXT:    movl %ecx, %edi
-; X86-NOSSE-NEXT:    andl %edx, %edi
-; X86-NOSSE-NEXT:    shrl $2, %ecx
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOSSE-NEXT:    movl %esi, %ecx
+; X86-NOSSE-NEXT:    shrl %ecx
+; X86-NOSSE-NEXT:    movl $1431655765, %edx # imm = 0x55555555
 ; X86-NOSSE-NEXT:    andl %edx, %ecx
-; X86-NOSSE-NEXT:    addl %edi, %ecx
-; X86-NOSSE-NEXT:    movl %ecx, %edi
-; X86-NOSSE-NEXT:    shrl $4, %edi
-; X86-NOSSE-NEXT:    addl %ecx, %edi
-; X86-NOSSE-NEXT:    movl $252645135, %ecx # imm = 0xF0F0F0F
+; X86-NOSSE-NEXT:    subl %ecx, %esi
+; X86-NOSSE-NEXT:    movl $858993459, %ecx # imm = 0x33333333
+; X86-NOSSE-NEXT:    movl %esi, %edi
 ; X86-NOSSE-NEXT:    andl %ecx, %edi
-; X86-NOSSE-NEXT:    imull $16843009, %edi, %edi # imm = 0x1010101
-; X86-NOSSE-NEXT:    shrl $24, %edi
+; X86-NOSSE-NEXT:    shrl $2, %esi
+; X86-NOSSE-NEXT:    andl %ecx, %esi
+; X86-NOSSE-NEXT:    addl %edi, %esi
+; X86-NOSSE-NEXT:    movl %esi, %ebx
+; X86-NOSSE-NEXT:    shrl $4, %ebx
+; X86-NOSSE-NEXT:    addl %esi, %ebx
+; X86-NOSSE-NEXT:    movl $252645135, %edi # imm = 0xF0F0F0F
+; X86-NOSSE-NEXT:    andl %edi, %ebx
+; X86-NOSSE-NEXT:    imull $16843009, %ebx, %esi # imm = 0x1010101
+; X86-NOSSE-NEXT:    shrl $24, %esi
 ; X86-NOSSE-NEXT:    movl %eax, %ebx
 ; X86-NOSSE-NEXT:    shrl %ebx
-; X86-NOSSE-NEXT:    andl %esi, %ebx
+; X86-NOSSE-NEXT:    andl %edx, %ebx
 ; X86-NOSSE-NEXT:    subl %ebx, %eax
-; X86-NOSSE-NEXT:    movl %eax, %esi
-; X86-NOSSE-NEXT:    andl %edx, %esi
-; X86-NOSSE-NEXT:    shrl $2, %eax
-; X86-NOSSE-NEXT:    andl %edx, %eax
-; X86-NOSSE-NEXT:    addl %esi, %eax
 ; X86-NOSSE-NEXT:    movl %eax, %edx
-; X86-NOSSE-NEXT:    shrl $4, %edx
-; X86-NOSSE-NEXT:    addl %eax, %edx
 ; X86-NOSSE-NEXT:    andl %ecx, %edx
-; X86-NOSSE-NEXT:    imull $16843009, %edx, %eax # imm = 0x1010101
+; X86-NOSSE-NEXT:    shrl $2, %eax
+; X86-NOSSE-NEXT:    andl %ecx, %eax
+; X86-NOSSE-NEXT:    addl %edx, %eax
+; X86-NOSSE-NEXT:    movl %eax, %ecx
+; X86-NOSSE-NEXT:    shrl $4, %ecx
+; X86-NOSSE-NEXT:    addl %eax, %ecx
+; X86-NOSSE-NEXT:    andl %edi, %ecx
+; X86-NOSSE-NEXT:    imull $16843009, %ecx, %eax # imm = 0x1010101
 ; X86-NOSSE-NEXT:    shrl $24, %eax
-; X86-NOSSE-NEXT:    addl %edi, %eax
+; X86-NOSSE-NEXT:    addl %esi, %eax
 ; X86-NOSSE-NEXT:    xorl %edx, %edx
 ; X86-NOSSE-NEXT:    popl %esi
 ; X86-NOSSE-NEXT:    popl %edi
@@ -845,35 +845,36 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize {
 ; X86-NOSSE-NEXT:    shrl $2, %esi
 ; X86-NOSSE-NEXT:    andl %ecx, %esi
 ; X86-NOSSE-NEXT:    addl %eax, %esi
-; X86-NOSSE-NEXT:    movl %esi, %eax
-; X86-NOSSE-NEXT:    shrl $4, %eax
-; X86-NOSSE-NEXT:    addl %esi, %eax
-; X86-NOSSE-NEXT:    movl %edx, %esi
-; X86-NOSSE-NEXT:    shrl %esi
-; X86-NOSSE-NEXT:    andl %ebp, %esi
-; X86-NOSSE-NEXT:    subl %esi, %edx
-; X86-NOSSE-NEXT:    movl %edx, %esi
-; X86-NOSSE-NEXT:    andl %ecx, %esi
+; X86-NOSSE-NEXT:    movl %esi, %ebp
+; X86-NOSSE-NEXT:    shrl $4, %ebp
+; X86-NOSSE-NEXT:    addl %esi, %ebp
+; X86-NOSSE-NEXT:    movl %edx, %eax
+; X86-NOSSE-NEXT:    shrl %eax
+; X86-NOSSE-NEXT:    movl $1431655765, %esi # imm = 0x55555555
+; X86-NOSSE-NEXT:    andl %esi, %eax
+; X86-NOSSE-NEXT:    subl %eax, %edx
+; X86-NOSSE-NEXT:    movl %edx, %eax
+; X86-NOSSE-NEXT:    andl %ecx, %eax
 ; X86-NOSSE-NEXT:    shrl $2, %edx
 ; X86-NOSSE-NEXT:    andl %ecx, %edx
-; X86-NOSSE-NEXT:    addl %esi, %edx
-; X86-NOSSE-NEXT:    movl %edx, %ecx
-; X86-NOSSE-NEXT:    shrl $4, %ecx
-; X86-NOSSE-NEXT:    addl %edx, %ecx
+; X86-NOSSE-NEXT:    addl %eax, %edx
+; X86-NOSSE-NEXT:    movl %edx, %eax
+; X86-NOSSE-NEXT:    shrl $4, %eax
+; X86-NOSSE-NEXT:    addl %edx, %eax
+; X86-NOSSE-NEXT:    andl %ebx, %ebp
 ; X86-NOSSE-NEXT:    andl %ebx, %eax
-; X86-NOSSE-NEXT:    andl %ebx, %ecx
-; X86-NOSSE-NEXT:    imull $16843009, %eax, %eax # imm = 0x1010101
-; X86-NOSSE-NEXT:    shrl $24, %eax
-; X86-NOSSE-NEXT:    imull $16843009, %ecx, %ecx # imm = 0x1010101
+; X86-NOSSE-NEXT:    imull $16843009, %ebp, %ecx # imm = 0x1010101
 ; X86-NOSSE-NEXT:    shrl $24, %ecx
-; X86-NOSSE-NEXT:    addl %eax, %ecx
+; X86-NOSSE-NEXT:    imull $16843009, %eax, %edx # imm = 0x1010101
+; X86-NOSSE-NEXT:    shrl $24, %edx
+; X86-NOSSE-NEXT:    addl %ecx, %edx
 ; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOSSE-NEXT:    addl %edi, %ecx
-; X86-NOSSE-NEXT:    xorl %edx, %edx
-; X86-NOSSE-NEXT:    movl %edx, 12(%eax)
-; X86-NOSSE-NEXT:    movl %edx, 8(%eax)
-; X86-NOSSE-NEXT:    movl %edx, 4(%eax)
-; X86-NOSSE-NEXT:    movl %ecx, (%eax)
+; X86-NOSSE-NEXT:    addl %edi, %edx
+; X86-NOSSE-NEXT:    xorl %ecx, %ecx
+; X86-NOSSE-NEXT:    movl %ecx, 12(%eax)
+; X86-NOSSE-NEXT:    movl %ecx, 8(%eax)
+; X86-NOSSE-NEXT:    movl %ecx, 4(%eax)
+; X86-NOSSE-NEXT:    movl %edx, (%eax)
 ; X86-NOSSE-NEXT:    popl %esi
 ; X86-NOSSE-NEXT:    popl %edi
 ; X86-NOSSE-NEXT:    popl %ebx
@@ -950,42 +951,42 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize {
 ; X86-SSE2-LABEL: cnt128_optsize:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT:    psrlw $1, %xmm1
-; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
-; X86-SSE2-NEXT:    pand %xmm2, %xmm1
-; X86-SSE2-NEXT:    psubb %xmm1, %xmm0
-; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
-; X86-SSE2-NEXT:    pand %xmm1, %xmm3
-; X86-SSE2-NEXT:    psrlw $2, %xmm0
+; X86-SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT:    psrlw $1, %xmm0
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
 ; X86-SSE2-NEXT:    pand %xmm1, %xmm0
-; X86-SSE2-NEXT:    paddb %xmm3, %xmm0
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
-; X86-SSE2-NEXT:    psrlw $4, %xmm3
-; X86-SSE2-NEXT:    paddb %xmm0, %xmm3
-; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X86-SSE2-NEXT:    psubb %xmm0, %xmm2
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm3
 ; X86-SSE2-NEXT:    pand %xmm0, %xmm3
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm4
-; X86-SSE2-NEXT:    psadbw %xmm4, %xmm3
-; X86-SSE2-NEXT:    movd %xmm3, %ecx
-; X86-SSE2-NEXT:    movq {{.*#+}} xmm3 = mem[0],zero
-; X86-SSE2-NEXT:    movdqa %xmm3, %xmm5
+; X86-SSE2-NEXT:    psrlw $2, %xmm2
+; X86-SSE2-NEXT:    pand %xmm0, %xmm2
+; X86-SSE2-NEXT:    paddb %xmm3, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm4
+; X86-SSE2-NEXT:    psrlw $4, %xmm4
+; X86-SSE2-NEXT:    paddb %xmm2, %xmm4
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X86-SSE2-NEXT:    pand %xmm2, %xmm4
+; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
+; X86-SSE2-NEXT:    psadbw %xmm3, %xmm4
+; X86-SSE2-NEXT:    movd %xmm4, %ecx
+; X86-SSE2-NEXT:    movq {{.*#+}} xmm4 = mem[0],zero
+; X86-SSE2-NEXT:    movdqa %xmm4, %xmm5
 ; X86-SSE2-NEXT:    psrlw $1, %xmm5
-; X86-SSE2-NEXT:    pand %xmm2, %xmm5
-; X86-SSE2-NEXT:    psubb %xmm5, %xmm3
-; X86-SSE2-NEXT:    movdqa %xmm3, %xmm2
-; X86-SSE2-NEXT:    pand %xmm1, %xmm2
-; X86-SSE2-NEXT:    psrlw $2, %xmm3
-; X86-SSE2-NEXT:    pand %xmm1, %xmm3
-; X86-SSE2-NEXT:    paddb %xmm2, %xmm3
-; X86-SSE2-NEXT:    movdqa %xmm3, %xmm1
-; X86-SSE2-NEXT:    psrlw $4, %xmm1
-; X86-SSE2-NEXT:    paddb %xmm3, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm5
+; X86-SSE2-NEXT:    psubb %xmm5, %xmm4
+; X86-SSE2-NEXT:    movdqa %xmm4, %xmm1
 ; X86-SSE2-NEXT:    pand %xmm0, %xmm1
-; X86-SSE2-NEXT:    psadbw %xmm4, %xmm1
-; X86-SSE2-NEXT:    movd %xmm1, %edx
+; X86-SSE2-NEXT:    psrlw $2, %xmm4
+; X86-SSE2-NEXT:    pand %xmm0, %xmm4
+; X86-SSE2-NEXT:    paddb %xmm1, %xmm4
+; X86-SSE2-NEXT:    movdqa %xmm4, %xmm0
+; X86-SSE2-NEXT:    psrlw $4, %xmm0
+; X86-SSE2-NEXT:    paddb %xmm4, %xmm0
+; X86-SSE2-NEXT:    pand %xmm2, %xmm0
+; X86-SSE2-NEXT:    psadbw %xmm3, %xmm0
+; X86-SSE2-NEXT:    movd %xmm0, %edx
 ; X86-SSE2-NEXT:    addl %ecx, %edx
 ; X86-SSE2-NEXT:    xorl %ecx, %ecx
 ; X86-SSE2-NEXT:    movl %ecx, 12(%eax)
@@ -997,32 +998,32 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize {
 ; X86-SSSE3-LABEL: cnt128_optsize:
 ; X86-SSSE3:       # %bb.0:
 ; X86-SSSE3-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X86-SSSE3-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X86-SSSE3-NEXT:    movdqa %xmm1, %xmm2
-; X86-SSSE3-NEXT:    pand %xmm0, %xmm2
-; X86-SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; X86-SSSE3-NEXT:    movdqa %xmm3, %xmm4
-; X86-SSSE3-NEXT:    pshufb %xmm2, %xmm4
-; X86-SSSE3-NEXT:    psrlw $4, %xmm1
-; X86-SSSE3-NEXT:    pand %xmm0, %xmm1
-; X86-SSSE3-NEXT:    movdqa %xmm3, %xmm2
-; X86-SSSE3-NEXT:    pshufb %xmm1, %xmm2
-; X86-SSSE3-NEXT:    paddb %xmm4, %xmm2
-; X86-SSSE3-NEXT:    pxor %xmm1, %xmm1
-; X86-SSSE3-NEXT:    psadbw %xmm1, %xmm2
-; X86-SSSE3-NEXT:    movd %xmm2, %ecx
+; X86-SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; X86-SSSE3-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
-; X86-SSSE3-NEXT:    movdqa %xmm2, %xmm4
-; X86-SSSE3-NEXT:    pand %xmm0, %xmm4
-; X86-SSSE3-NEXT:    movdqa %xmm3, %xmm5
-; X86-SSSE3-NEXT:    pshufb %xmm4, %xmm5
+; X86-SSSE3-NEXT:    movdqa %xmm2, %xmm3
+; X86-SSSE3-NEXT:    pand %xmm1, %xmm3
+; X86-SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X86-SSSE3-NEXT:    movdqa %xmm0, %xmm4
+; X86-SSSE3-NEXT:    pshufb %xmm3, %xmm4
 ; X86-SSSE3-NEXT:    psrlw $4, %xmm2
-; X86-SSSE3-NEXT:    pand %xmm0, %xmm2
+; X86-SSSE3-NEXT:    pand %xmm1, %xmm2
+; X86-SSSE3-NEXT:    movdqa %xmm0, %xmm3
 ; X86-SSSE3-NEXT:    pshufb %xmm2, %xmm3
-; X86-SSSE3-NEXT:    paddb %xmm5, %xmm3
-; X86-SSSE3-NEXT:    psadbw %xmm1, %xmm3
-; X86-SSSE3-NEXT:    movd %xmm3, %edx
+; X86-SSSE3-NEXT:    paddb %xmm4, %xmm3
+; X86-SSSE3-NEXT:    pxor %xmm2, %xmm2
+; X86-SSSE3-NEXT:    psadbw %xmm2, %xmm3
+; X86-SSSE3-NEXT:    movd %xmm3, %ecx
+; X86-SSSE3-NEXT:    movq {{.*#+}} xmm3 = mem[0],zero
+; X86-SSSE3-NEXT:    movdqa %xmm3, %xmm4
+; X86-SSSE3-NEXT:    pand %xmm1, %xmm4
+; X86-SSSE3-NEXT:    movdqa %xmm0, %xmm5
+; X86-SSSE3-NEXT:    pshufb %xmm4, %xmm5
+; X86-SSSE3-NEXT:    psrlw $4, %xmm3
+; X86-SSSE3-NEXT:    pand %xmm1, %xmm3
+; X86-SSSE3-NEXT:    pshufb %xmm3, %xmm0
+; X86-SSSE3-NEXT:    paddb %xmm5, %xmm0
+; X86-SSSE3-NEXT:    psadbw %xmm2, %xmm0
+; X86-SSSE3-NEXT:    movd %xmm0, %edx
 ; X86-SSSE3-NEXT:    addl %ecx, %edx
 ; X86-SSSE3-NEXT:    xorl %ecx, %ecx
 ; X86-SSSE3-NEXT:    movl %ecx, 12(%eax)
@@ -1096,41 +1097,41 @@ define i64 @cnt64_pgso(i64 %x) nounwind readnone !prof !14 {
 ; X86-NOSSE-NEXT:    pushl %edi
 ; X86-NOSSE-NEXT:    pushl %esi
 ; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NOSSE-NEXT:    movl %ecx, %edx
-; X86-NOSSE-NEXT:    shrl %edx
-; X86-NOSSE-NEXT:    movl $1431655765, %esi # imm = 0x55555555
-; X86-NOSSE-NEXT:    andl %esi, %edx
-; X86-NOSSE-NEXT:    subl %edx, %ecx
-; X86-NOSSE-NEXT:    movl $858993459, %edx # imm = 0x33333333
-; X86-NOSSE-NEXT:    movl %ecx, %edi
-; X86-NOSSE-NEXT:    andl %edx, %edi
-; X86-NOSSE-NEXT:    shrl $2, %ecx
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOSSE-NEXT:    movl %esi, %ecx
+; X86-NOSSE-NEXT:    shrl %ecx
+; X86-NOSSE-NEXT:    movl $1431655765, %edx # imm = 0x55555555
 ; X86-NOSSE-NEXT:    andl %edx, %ecx
-; X86-NOSSE-NEXT:    addl %edi, %ecx
-; X86-NOSSE-NEXT:    movl %ecx, %edi
-; X86-NOSSE-NEXT:    shrl $4, %edi
-; X86-NOSSE-NEXT:    addl %ecx, %edi
-; X86-NOSSE-NEXT:    movl $252645135, %ecx # imm = 0xF0F0F0F
+; X86-NOSSE-NEXT:    subl %ecx, %esi
+; X86-NOSSE-NEXT:    movl $858993459, %ecx # imm = 0x33333333
+; X86-NOSSE-NEXT:    movl %esi, %edi
 ; X86-NOSSE-NEXT:    andl %ecx, %edi
-; X86-NOSSE-NEXT:    imull $16843009, %edi, %edi # imm = 0x1010101
-; X86-NOSSE-NEXT:    shrl $24, %edi
+; X86-NOSSE-NEXT:    shrl $2, %esi
+; X86-NOSSE-NEXT:    andl %ecx, %esi
+; X86-NOSSE-NEXT:    addl %edi, %esi
+; X86-NOSSE-NEXT:    movl %esi, %ebx
+; X86-NOSSE-NEXT:    shrl $4, %ebx
+; X86-NOSSE-NEXT:    addl %esi, %ebx
+; X86-NOSSE-NEXT:    movl $252645135, %edi # imm = 0xF0F0F0F
+; X86-NOSSE-NEXT:    andl %edi, %ebx
+; X86-NOSSE-NEXT:    imull $16843009, %ebx, %esi # imm = 0x1010101
+; X86-NOSSE-NEXT:    shrl $24, %esi
 ; X86-NOSSE-NEXT:    movl %eax, %ebx
 ; X86-NOSSE-NEXT:    shrl %ebx
-; X86-NOSSE-NEXT:    andl %esi, %ebx
+; X86-NOSSE-NEXT:    andl %edx, %ebx
 ; X86-NOSSE-NEXT:    subl %ebx, %eax
-; X86-NOSSE-NEXT:    movl %eax, %esi
-; X86-NOSSE-NEXT:    andl %edx, %esi
-; X86-NOSSE-NEXT:    shrl $2, %eax
-; X86-NOSSE-NEXT:    andl %edx, %eax
-; X86-NOSSE-NEXT:    addl %esi, %eax
 ; X86-NOSSE-NEXT:    movl %eax, %edx
-; X86-NOSSE-NEXT:    shrl $4, %edx
-; X86-NOSSE-NEXT:    addl %eax, %edx
 ; X86-NOSSE-NEXT:    andl %ecx, %edx
-; X86-NOSSE-NEXT:    imull $16843009, %edx, %eax # imm = 0x1010101
+; X86-NOSSE-NEXT:    shrl $2, %eax
+; X86-NOSSE-NEXT:    andl %ecx, %eax
+; X86-NOSSE-NEXT:    addl %edx, %eax
+; X86-NOSSE-NEXT:    movl %eax, %ecx
+; X86-NOSSE-NEXT:    shrl $4, %ecx
+; X86-NOSSE-NEXT:    addl %eax, %ecx
+; X86-NOSSE-NEXT:    andl %edi, %ecx
+; X86-NOSSE-NEXT:    imull $16843009, %ecx, %eax # imm = 0x1010101
 ; X86-NOSSE-NEXT:    shrl $24, %eax
-; X86-NOSSE-NEXT:    addl %edi, %eax
+; X86-NOSSE-NEXT:    addl %esi, %eax
 ; X86-NOSSE-NEXT:    xorl %edx, %edx
 ; X86-NOSSE-NEXT:    popl %esi
 ; X86-NOSSE-NEXT:    popl %edi
@@ -1274,35 +1275,36 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 {
 ; X86-NOSSE-NEXT:    shrl $2, %esi
 ; X86-NOSSE-NEXT:    andl %ecx, %esi
 ; X86-NOSSE-NEXT:    addl %eax, %esi
-; X86-NOSSE-NEXT:    movl %esi, %eax
-; X86-NOSSE-NEXT:    shrl $4, %eax
-; X86-NOSSE-NEXT:    addl %esi, %eax
-; X86-NOSSE-NEXT:    movl %edx, %esi
-; X86-NOSSE-NEXT:    shrl %esi
-; X86-NOSSE-NEXT:    andl %ebp, %esi
-; X86-NOSSE-NEXT:    subl %esi, %edx
-; X86-NOSSE-NEXT:    movl %edx, %esi
-; X86-NOSSE-NEXT:    andl %ecx, %esi
+; X86-NOSSE-NEXT:    movl %esi, %ebp
+; X86-NOSSE-NEXT:    shrl $4, %ebp
+; X86-NOSSE-NEXT:    addl %esi, %ebp
+; X86-NOSSE-NEXT:    movl %edx, %eax
+; X86-NOSSE-NEXT:    shrl %eax
+; X86-NOSSE-NEXT:    movl $1431655765, %esi # imm = 0x55555555
+; X86-NOSSE-NEXT:    andl %esi, %eax
+; X86-NOSSE-NEXT:    subl %eax, %edx
+; X86-NOSSE-NEXT:    movl %edx, %eax
+; X86-NOSSE-NEXT:    andl %ecx, %eax
 ; X86-NOSSE-NEXT:    shrl $2, %edx
 ; X86-NOSSE-NEXT:    andl %ecx, %edx
-; X86-NOSSE-NEXT:    addl %esi, %edx
-; X86-NOSSE-NEXT:    movl %edx, %ecx
-; X86-NOSSE-NEXT:    shrl $4, %ecx
-; X86-NOSSE-NEXT:    addl %edx, %ecx
+; X86-NOSSE-NEXT:    addl %eax, %edx
+; X86-NOSSE-NEXT:    movl %edx, %eax
+; X86-NOSSE-NEXT:    shrl $4, %eax
+; X86-NOSSE-NEXT:    addl %edx, %eax
+; X86-NOSSE-NEXT:    andl %ebx, %ebp
 ; X86-NOSSE-NEXT:    andl %ebx, %eax
-; X86-NOSSE-NEXT:    andl %ebx, %ecx
-; X86-NOSSE-NEXT:    imull $16843009, %eax, %eax # imm = 0x1010101
-; X86-NOSSE-NEXT:    shrl $24, %eax
-; X86-NOSSE-NEXT:    imull $16843009, %ecx, %ecx # imm = 0x1010101
+; X86-NOSSE-NEXT:    imull $16843009, %ebp, %ecx # imm = 0x1010101
 ; X86-NOSSE-NEXT:    shrl $24, %ecx
-; X86-NOSSE-NEXT:    addl %eax, %ecx
+; X86-NOSSE-NEXT:    imull $16843009, %eax, %edx # imm = 0x1010101
+; X86-NOSSE-NEXT:    shrl $24, %edx
+; X86-NOSSE-NEXT:    addl %ecx, %edx
 ; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOSSE-NEXT:    addl %edi, %ecx
-; X86-NOSSE-NEXT:    xorl %edx, %edx
-; X86-NOSSE-NEXT:    movl %edx, 12(%eax)
-; X86-NOSSE-NEXT:    movl %edx, 8(%eax)
-; X86-NOSSE-NEXT:    movl %edx, 4(%eax)
-; X86-NOSSE-NEXT:    movl %ecx, (%eax)
+; X86-NOSSE-NEXT:    addl %edi, %edx
+; X86-NOSSE-NEXT:    xorl %ecx, %ecx
+; X86-NOSSE-NEXT:    movl %ecx, 12(%eax)
+; X86-NOSSE-NEXT:    movl %ecx, 8(%eax)
+; X86-NOSSE-NEXT:    movl %ecx, 4(%eax)
+; X86-NOSSE-NEXT:    movl %edx, (%eax)
 ; X86-NOSSE-NEXT:    popl %esi
 ; X86-NOSSE-NEXT:    popl %edi
 ; X86-NOSSE-NEXT:    popl %ebx
@@ -1379,42 +1381,42 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 {
 ; X86-SSE2-LABEL: cnt128_pgso:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT:    psrlw $1, %xmm1
-; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
-; X86-SSE2-NEXT:    pand %xmm2, %xmm1
-; X86-SSE2-NEXT:    psubb %xmm1, %xmm0
-; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
-; X86-SSE2-NEXT:    pand %xmm1, %xmm3
-; X86-SSE2-NEXT:    psrlw $2, %xmm0
+; X86-SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT:    psrlw $1, %xmm0
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
 ; X86-SSE2-NEXT:    pand %xmm1, %xmm0
-; X86-SSE2-NEXT:    paddb %xmm3, %xmm0
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
-; X86-SSE2-NEXT:    psrlw $4, %xmm3
-; X86-SSE2-NEXT:    paddb %xmm0, %xmm3
-; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X86-SSE2-NEXT:    psubb %xmm0, %xmm2
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm3
 ; X86-SSE2-NEXT:    pand %xmm0, %xmm3
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm4
-; X86-SSE2-NEXT:    psadbw %xmm4, %xmm3
-; X86-SSE2-NEXT:    movd %xmm3, %ecx
-; X86-SSE2-NEXT:    movq {{.*#+}} xmm3 = mem[0],zero
-; X86-SSE2-NEXT:    movdqa %xmm3, %xmm5
+; X86-SSE2-NEXT:    psrlw $2, %xmm2
+; X86-SSE2-NEXT:    pand %xmm0, %xmm2
+; X86-SSE2-NEXT:    paddb %xmm3, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm4
+; X86-SSE2-NEXT:    psrlw $4, %xmm4
+; X86-SSE2-NEXT:    paddb %xmm2, %xmm4
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X86-SSE2-NEXT:    pand %xmm2, %xmm4
+; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
+; X86-SSE2-NEXT:    psadbw %xmm3, %xmm4
+; X86-SSE2-NEXT:    movd %xmm4, %ecx
+; X86-SSE2-NEXT:    movq {{.*#+}} xmm4 = mem[0],zero
+; X86-SSE2-NEXT:    movdqa %xmm4, %xmm5
 ; X86-SSE2-NEXT:    psrlw $1, %xmm5
-; X86-SSE2-NEXT:    pand %xmm2, %xmm5
-; X86-SSE2-NEXT:    psubb %xmm5, %xmm3
-; X86-SSE2-NEXT:    movdqa %xmm3, %xmm2
-; X86-SSE2-NEXT:    pand %xmm1, %xmm2
-; X86-SSE2-NEXT:    psrlw $2, %xmm3
-; X86-SSE2-NEXT:    pand %xmm1, %xmm3
-; X86-SSE2-NEXT:    paddb %xmm2, %xmm3
-; X86-SSE2-NEXT:    movdqa %xmm3, %xmm1
-; X86-SSE2-NEXT:    psrlw $4, %xmm1
-; X86-SSE2-NEXT:    paddb %xmm3, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm5
+; X86-SSE2-NEXT:    psubb %xmm5, %xmm4
+; X86-SSE2-NEXT:    movdqa %xmm4, %xmm1
 ; X86-SSE2-NEXT:    pand %xmm0, %xmm1
-; X86-SSE2-NEXT:    psadbw %xmm4, %xmm1
-; X86-SSE2-NEXT:    movd %xmm1, %edx
+; X86-SSE2-NEXT:    psrlw $2, %xmm4
+; X86-SSE2-NEXT:    pand %xmm0, %xmm4
+; X86-SSE2-NEXT:    paddb %xmm1, %xmm4
+; X86-SSE2-NEXT:    movdqa %xmm4, %xmm0
+; X86-SSE2-NEXT:    psrlw $4, %xmm0
+; X86-SSE2-NEXT:    paddb %xmm4, %xmm0
+; X86-SSE2-NEXT:    pand %xmm2, %xmm0
+; X86-SSE2-NEXT:    psadbw %xmm3, %xmm0
+; X86-SSE2-NEXT:    movd %xmm0, %edx
 ; X86-SSE2-NEXT:    addl %ecx, %edx
 ; X86-SSE2-NEXT:    xorl %ecx, %ecx
 ; X86-SSE2-NEXT:    movl %ecx, 12(%eax)
@@ -1426,32 +1428,32 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 {
 ; X86-SSSE3-LABEL: cnt128_pgso:
 ; X86-SSSE3:       # %bb.0:
 ; X86-SSSE3-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X86-SSSE3-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X86-SSSE3-NEXT:    movdqa %xmm1, %xmm2
-; X86-SSSE3-NEXT:    pand %xmm0, %xmm2
-; X86-SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; X86-SSSE3-NEXT:    movdqa %xmm3, %xmm4
-; X86-SSSE3-NEXT:    pshufb %xmm2, %xmm4
-; X86-SSSE3-NEXT:    psrlw $4, %xmm1
-; X86-SSSE3-NEXT:    pand %xmm0, %xmm1
-; X86-SSSE3-NEXT:    movdqa %xmm3, %xmm2
-; X86-SSSE3-NEXT:    pshufb %xmm1, %xmm2
-; X86-SSSE3-NEXT:    paddb %xmm4, %xmm2
-; X86-SSSE3-NEXT:    pxor %xmm1, %xmm1
-; X86-SSSE3-NEXT:    psadbw %xmm1, %xmm2
-; X86-SSSE3-NEXT:    movd %xmm2, %ecx
+; X86-SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; X86-SSSE3-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
-; X86-SSSE3-NEXT:    movdqa %xmm2, %xmm4
-; X86-SSSE3-NEXT:    pand %xmm0, %xmm4
-; X86-SSSE3-NEXT:    movdqa %xmm3, %xmm5
-; X86-SSSE3-NEXT:    pshufb %xmm4, %xmm5
+; X86-SSSE3-NEXT:    movdqa %xmm2, %xmm3
+; X86-SSSE3-NEXT:    pand %xmm1, %xmm3
+; X86-SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X86-SSSE3-NEXT:    movdqa %xmm0, %xmm4
+; X86-SSSE3-NEXT:    pshufb %xmm3, %xmm4
 ; X86-SSSE3-NEXT:    psrlw $4, %xmm2
-; X86-SSSE3-NEXT:    pand %xmm0, %xmm2
+; X86-SSSE3-NEXT:    pand %xmm1, %xmm2
+; X86-SSSE3-NEXT:    movdqa %xmm0, %xmm3
 ; X86-SSSE3-NEXT:    pshufb %xmm2, %xmm3
-; X86-SSSE3-NEXT:    paddb %xmm5, %xmm3
-; X86-SSSE3-NEXT:    psadbw %xmm1, %xmm3
-; X86-SSSE3-NEXT:    movd %xmm3, %edx
+; X86-SSSE3-NEXT:    paddb %xmm4, %xmm3
+; X86-SSSE3-NEXT:    pxor %xmm2, %xmm2
+; X86-SSSE3-NEXT:    psadbw %xmm2, %xmm3
+; X86-SSSE3-NEXT:    movd %xmm3, %ecx
+; X86-SSSE3-NEXT:    movq {{.*#+}} xmm3 = mem[0],zero
+; X86-SSSE3-NEXT:    movdqa %xmm3, %xmm4
+; X86-SSSE3-NEXT:    pand %xmm1, %xmm4
+; X86-SSSE3-NEXT:    movdqa %xmm0, %xmm5
+; X86-SSSE3-NEXT:    pshufb %xmm4, %xmm5
+; X86-SSSE3-NEXT:    psrlw $4, %xmm3
+; X86-SSSE3-NEXT:    pand %xmm1, %xmm3
+; X86-SSSE3-NEXT:    pshufb %xmm3, %xmm0
+; X86-SSSE3-NEXT:    paddb %xmm5, %xmm0
+; X86-SSSE3-NEXT:    psadbw %xmm2, %xmm0
+; X86-SSSE3-NEXT:    movd %xmm0, %edx
 ; X86-SSSE3-NEXT:    addl %ecx, %edx
 ; X86-SSSE3-NEXT:    xorl %ecx, %ecx
 ; X86-SSSE3-NEXT:    movl %ecx, 12(%eax)

diff  --git a/llvm/test/CodeGen/X86/pr31088.ll b/llvm/test/CodeGen/X86/pr31088.ll
index 7358db8a88a03..555f769b316fe 100644
--- a/llvm/test/CodeGen/X86/pr31088.ll
+++ b/llvm/test/CodeGen/X86/pr31088.ll
@@ -132,18 +132,18 @@ define <2 x half> @ir_fadd_v2f16(<2 x half> %arg0, <2 x half> %arg1) nounwind {
 ; X64-NEXT:    pushq %r14
 ; X64-NEXT:    pushq %rbx
 ; X64-NEXT:    subq $32, %rsp
-; X64-NEXT:    movl %edx, %ebx
-; X64-NEXT:    movl %esi, %ebp
+; X64-NEXT:    movl %edx, %ebp
+; X64-NEXT:    movl %esi, %ebx
 ; X64-NEXT:    movl %edi, %r14d
 ; X64-NEXT:    movzwl %cx, %edi
 ; X64-NEXT:    callq __gnu_h2f_ieee at PLT
 ; X64-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; X64-NEXT:    movzwl %bp, %edi
+; X64-NEXT:    movzwl %bx, %edi
 ; X64-NEXT:    callq __gnu_h2f_ieee at PLT
 ; X64-NEXT:    addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
 ; X64-NEXT:    callq __gnu_f2h_ieee at PLT
 ; X64-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
-; X64-NEXT:    movzwl %bx, %edi
+; X64-NEXT:    movzwl %bp, %edi
 ; X64-NEXT:    callq __gnu_h2f_ieee at PLT
 ; X64-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    movzwl %r14w, %edi

diff  --git a/llvm/test/CodeGen/X86/pr32284.ll b/llvm/test/CodeGen/X86/pr32284.ll
index 867711810914b..b27415f94fa76 100644
--- a/llvm/test/CodeGen/X86/pr32284.ll
+++ b/llvm/test/CodeGen/X86/pr32284.ll
@@ -213,41 +213,46 @@ define void @f1() {
 ;
 ; X86-LABEL: f1:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 12
 ; X86-NEXT:    subl $1, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 9
-; X86-NEXT:    .cfi_offset %esi, -8
-; X86-NEXT:    movl var_5, %edx
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    xorl $208307499, %eax # imm = 0xC6A852B
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    sarl $31, %esi
-; X86-NEXT:    movl %esi, %ecx
-; X86-NEXT:    xorl $-2, %ecx
-; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    .cfi_def_cfa_offset 13
+; X86-NEXT:    .cfi_offset %esi, -12
+; X86-NEXT:    .cfi_offset %ebx, -8
+; X86-NEXT:    movl var_5, %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    xorl $208307499, %edx # imm = 0xC6A852B
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    sarl $31, %ecx
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    xorl $-2, %esi
+; X86-NEXT:    orl %edx, %esi
 ; X86-NEXT:    setne (%esp)
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    andl %esi, %ecx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl $-1, %ecx
-; X86-NEXT:    sete %al
-; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    cmpl $-1, %edx
-; X86-NEXT:    sete %cl
-; X86-NEXT:    addl $7093, %edx # imm = 0x1BB5
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    cmpl %ecx, %edx
-; X86-NEXT:    sbbl $0, %esi
-; X86-NEXT:    setl %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movl %ecx, var_57
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    andl %ecx, %esi
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    cmpl $-1, %esi
+; X86-NEXT:    sete %dl
+; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    cmpl $-1, %eax
+; X86-NEXT:    sete %bl
+; X86-NEXT:    addl $7093, %eax # imm = 0x1BB5
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    cmpl %ebx, %eax
+; X86-NEXT:    sbbl $0, %ecx
+; X86-NEXT:    setl %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    movl %eax, var_57
 ; X86-NEXT:    movl $0, var_57+4
-; X86-NEXT:    movl %eax, _ZN8struct_210member_2_0E
+; X86-NEXT:    movl %edx, _ZN8struct_210member_2_0E
 ; X86-NEXT:    movl $0, _ZN8struct_210member_2_0E+4
 ; X86-NEXT:    addl $1, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_def_cfa_offset 12
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    popl %ebx
 ; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
 entry:

diff  --git a/llvm/test/CodeGen/X86/pr32329.ll b/llvm/test/CodeGen/X86/pr32329.ll
index ac5859fb2c7b1..8fa78f1e0b2dd 100644
--- a/llvm/test/CodeGen/X86/pr32329.ll
+++ b/llvm/test/CodeGen/X86/pr32329.ll
@@ -30,29 +30,29 @@ define void @foo() local_unnamed_addr {
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
 ; X86-NEXT:    movsbl var_27, %eax
-; X86-NEXT:    movzwl var_2, %esi
+; X86-NEXT:    movzwl var_2, %ebx
 ; X86-NEXT:    movl var_310, %ecx
 ; X86-NEXT:    imull %eax, %ecx
 ; X86-NEXT:    addl var_24, %ecx
-; X86-NEXT:    movl $4194303, %edi # imm = 0x3FFFFF
-; X86-NEXT:    andl obj, %edi
-; X86-NEXT:    leal (%edi,%edi), %edx
+; X86-NEXT:    movl $4194303, %esi # imm = 0x3FFFFF
+; X86-NEXT:    andl obj, %esi
+; X86-NEXT:    leal (%esi,%esi), %edx
 ; X86-NEXT:    subl %eax, %edx
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    subl %esi, %ebx
-; X86-NEXT:    imull %ebx, %ecx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    subl %ebx, %edi
+; X86-NEXT:    imull %edi, %ecx
 ; X86-NEXT:    addb $113, %cl
-; X86-NEXT:    movl $9, %esi
+; X86-NEXT:    movl $9, %ebx
 ; X86-NEXT:    xorl %ebp, %ebp
-; X86-NEXT:    shldl %cl, %esi, %ebp
-; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    shldl %cl, %ebx, %ebp
+; X86-NEXT:    shll %cl, %ebx
 ; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    cmovnel %esi, %ebp
+; X86-NEXT:    cmovnel %ebx, %ebp
 ; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    cmovnel %ecx, %esi
-; X86-NEXT:    cmpl %edi, %ebx
+; X86-NEXT:    cmovnel %ecx, %ebx
+; X86-NEXT:    cmpl %esi, %edi
 ; X86-NEXT:    movl %ebp, var_50+4
-; X86-NEXT:    movl %esi, var_50
+; X86-NEXT:    movl %ebx, var_50
 ; X86-NEXT:    setge var_205
 ; X86-NEXT:    imull %eax, %edx
 ; X86-NEXT:    movb %dl, var_218

diff --git a/llvm/test/CodeGen/X86/pr32610.ll b/llvm/test/CodeGen/X86/pr32610.ll
index 0386dbf4bc759..3d1195fff6988 100644
--- a/llvm/test/CodeGen/X86/pr32610.ll
+++ b/llvm/test/CodeGen/X86/pr32610.ll
@@ -13,20 +13,20 @@ define void @pr32610(i32 %a0, i32 %a1) #0 {
 ; CHECK-NEXT:    pushl %ebp
 ; CHECK-NEXT:    movl %esp, %ebp
 ; CHECK-NEXT:    pushl %esi
-; CHECK-NEXT:    movl 8(%ebp), %ecx
-; CHECK-NEXT:    movl L_b$non_lazy_ptr, %edx
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    cmpl (%edx), %ecx
-; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    movl 8(%ebp), %edx
+; CHECK-NEXT:    movl L_b$non_lazy_ptr, %eax
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    cmpl (%eax), %edx
+; CHECK-NEXT:    sete %cl
 ; CHECK-NEXT:    xorl %esi, %esi
 ; CHECK-NEXT:    incl %esi
 ; CHECK-NEXT:    cmpl $0, 12(%ebp)
-; CHECK-NEXT:    cmovel %esi, %eax
-; CHECK-NEXT:    cmpl (%edx), %ecx
-; CHECK-NEXT:    cmovnel %esi, %eax
-; CHECK-NEXT:    movl L_c$non_lazy_ptr, %ecx
-; CHECK-NEXT:    movl %eax, (%ecx)
-; CHECK-NEXT:    movl (%edx), %eax
+; CHECK-NEXT:    cmovel %esi, %ecx
+; CHECK-NEXT:    cmpl (%eax), %edx
+; CHECK-NEXT:    cmovnel %esi, %ecx
+; CHECK-NEXT:    movl L_c$non_lazy_ptr, %edx
+; CHECK-NEXT:    movl %ecx, (%edx)
+; CHECK-NEXT:    movl (%eax), %eax
 ; CHECK-NEXT:    testl %eax, %eax
 ; CHECK-NEXT:    movl $2, %ecx
 ; CHECK-NEXT:    cmovnel %eax, %ecx

diff --git a/llvm/test/CodeGen/X86/pr34080-2.ll b/llvm/test/CodeGen/X86/pr34080-2.ll
index ad7100284b044..d5763a68c27c5 100644
--- a/llvm/test/CodeGen/X86/pr34080-2.ll
+++ b/llvm/test/CodeGen/X86/pr34080-2.ll
@@ -58,26 +58,26 @@ define void @computeJD(%struct.DateTime*) nounwind {
 ; CHECK-NEXT:    fistpll {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    fldcw {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movb $1, 36(%ebx)
-; CHECK-NEXT:    imull $3600000, 20(%ebx), %eax # imm = 0x36EE80
-; CHECK-NEXT:    imull $60000, 24(%ebx), %ecx # imm = 0xEA60
-; CHECK-NEXT:    addl %eax, %ecx
+; CHECK-NEXT:    imull $3600000, 20(%ebx), %ecx # imm = 0x36EE80
+; CHECK-NEXT:    imull $60000, 24(%ebx), %eax # imm = 0xEA60
+; CHECK-NEXT:    addl %ecx, %eax
 ; CHECK-NEXT:    fldl 28(%ebx)
 ; CHECK-NEXT:    fmuls {{\.?LCPI[0-9]+_[0-9]+}}
 ; CHECK-NEXT:    fnstcw {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    orl $3072, %eax # imm = 0xC00
-; CHECK-NEXT:    movw %ax, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl %ecx, %eax
-; CHECK-NEXT:    sarl $31, %eax
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    orl $3072, %ecx # imm = 0xC00
+; CHECK-NEXT:    movw %cx, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl %eax, %ecx
+; CHECK-NEXT:    sarl $31, %ecx
 ; CHECK-NEXT:    fldcw {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    fistpll {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    fldcw {{[0-9]+}}(%esp)
-; CHECK-NEXT:    addl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    adcl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    addl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    adcl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl %ecx, (%ebx)
-; CHECK-NEXT:    movl %eax, 4(%ebx)
+; CHECK-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    adcl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    adcl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl %eax, (%ebx)
+; CHECK-NEXT:    movl %ecx, 4(%ebx)
 ; CHECK-NEXT:    leal -12(%ebp), %esp
 ; CHECK-NEXT:    popl %esi
 ; CHECK-NEXT:    popl %edi

diff --git a/llvm/test/CodeGen/X86/pr46527.ll b/llvm/test/CodeGen/X86/pr46527.ll
index 65f6d4d81b062..0d5ae2ae4d8c3 100644
--- a/llvm/test/CodeGen/X86/pr46527.ll
+++ b/llvm/test/CodeGen/X86/pr46527.ll
@@ -7,11 +7,11 @@ define void @f(<16 x i8>* %out, <16 x i8> %in, i1 %flag) {
 ; CHECK-NEXT:    calll .L0$pb
 ; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
 ; CHECK-NEXT:  .L0$pb:
-; CHECK-NEXT:    popl %eax
+; CHECK-NEXT:    popl %ecx
 ; CHECK-NEXT:    .cfi_adjust_cfa_offset -4
 ; CHECK-NEXT:  .Ltmp0:
-; CHECK-NEXT:    addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp0-.L0$pb), %eax
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp0-.L0$pb), %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %dl
 ; CHECK-NEXT:    notb %dl
 ; CHECK-NEXT:    andb $1, %dl
@@ -22,8 +22,8 @@ define void @f(<16 x i8>* %out, <16 x i8> %in, i1 %flag) {
 ; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; CHECK-NEXT:    paddb %xmm1, %xmm1
 ; CHECK-NEXT:    pxor %xmm0, %xmm1
-; CHECK-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}@GOTOFF(%eax), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, (%ecx)
+; CHECK-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}@GOTOFF(%ecx), %xmm1
+; CHECK-NEXT:    movdqa %xmm1, (%eax)
 ; CHECK-NEXT:    retl
 entry:
   %0 = select i1 %flag, i8 0, i8 2

diff --git a/llvm/test/CodeGen/X86/sadd_sat.ll b/llvm/test/CodeGen/X86/sadd_sat.ll
index e839c5e18bc3f..a27988d787159 100644
--- a/llvm/test/CodeGen/X86/sadd_sat.ll
+++ b/llvm/test/CodeGen/X86/sadd_sat.ll
@@ -177,30 +177,30 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    cmovol %esi, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    leal (%edx,%eax), %esi
-; X86-NEXT:    sarl $31, %esi
-; X86-NEXT:    xorl $-2147483648, %esi # imm = 0x80000000
+; X86-NEXT:    leal (%edx,%eax), %edi
+; X86-NEXT:    sarl $31, %edi
+; X86-NEXT:    xorl $-2147483648, %edi # imm = 0x80000000
 ; X86-NEXT:    addl %eax, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    cmovol %edi, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%esi,%eax), %ebx
+; X86-NEXT:    sarl $31, %ebx
+; X86-NEXT:    xorl $-2147483648, %ebx # imm = 0x80000000
+; X86-NEXT:    addl %eax, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmovol %esi, %edx
+; X86-NEXT:    cmovol %ebx, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    leal (%edi,%eax), %esi
-; X86-NEXT:    sarl $31, %esi
-; X86-NEXT:    xorl $-2147483648, %esi # imm = 0x80000000
+; X86-NEXT:    leal (%edi,%eax), %ebx
+; X86-NEXT:    sarl $31, %ebx
+; X86-NEXT:    xorl $-2147483648, %ebx # imm = 0x80000000
 ; X86-NEXT:    addl %eax, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    cmovol %esi, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    leal (%ebx,%eax), %esi
-; X86-NEXT:    sarl $31, %esi
-; X86-NEXT:    xorl $-2147483648, %esi # imm = 0x80000000
-; X86-NEXT:    addl %eax, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmovol %esi, %ebx
+; X86-NEXT:    cmovol %ebx, %edi
 ; X86-NEXT:    movl %ecx, 12(%eax)
 ; X86-NEXT:    movl %edx, 8(%eax)
-; X86-NEXT:    movl %edi, 4(%eax)
-; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    movl %edi, (%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx

diff --git a/llvm/test/CodeGen/X86/scheduler-backtracking.ll b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
index c683c3e0a345c..e6115ab0b2ba1 100644
--- a/llvm/test/CodeGen/X86/scheduler-backtracking.ll
+++ b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
@@ -76,14 +76,14 @@ define i256 @test1(i256 %a) nounwind {
 ; HYBRID-NEXT:    movl %r10d, %ecx
 ; HYBRID-NEXT:    shldq %cl, %rdi, %r11
 ; HYBRID-NEXT:    addb $-125, %sil
-; HYBRID-NEXT:    xorl %edx, %edx
+; HYBRID-NEXT:    xorl %ebx, %ebx
 ; HYBRID-NEXT:    movl %esi, %ecx
-; HYBRID-NEXT:    shldq %cl, %rdi, %rdx
-; HYBRID-NEXT:    movl $1, %ebx
-; HYBRID-NEXT:    shlq %cl, %rbx
+; HYBRID-NEXT:    shldq %cl, %rdi, %rbx
+; HYBRID-NEXT:    movl $1, %edx
+; HYBRID-NEXT:    shlq %cl, %rdx
 ; HYBRID-NEXT:    testb $64, %sil
-; HYBRID-NEXT:    cmovneq %rbx, %rdx
-; HYBRID-NEXT:    cmovneq %r8, %rbx
+; HYBRID-NEXT:    cmovneq %rdx, %rbx
+; HYBRID-NEXT:    cmovneq %r8, %rdx
 ; HYBRID-NEXT:    movl %r10d, %ecx
 ; HYBRID-NEXT:    shlq %cl, %rdi
 ; HYBRID-NEXT:    testb $64, %r10b
@@ -94,12 +94,12 @@ define i256 @test1(i256 %a) nounwind {
 ; HYBRID-NEXT:    movq %r11, 8(%rax)
 ; HYBRID-NEXT:    cmovsq %r8, %rdi
 ; HYBRID-NEXT:    movq %rdi, (%rax)
-; HYBRID-NEXT:    cmovnsq %r8, %rdx
-; HYBRID-NEXT:    cmoveq %r8, %rdx
-; HYBRID-NEXT:    movq %rdx, 24(%rax)
-; HYBRID-NEXT:    cmovnsq %r9, %rbx
+; HYBRID-NEXT:    cmovnsq %r8, %rbx
 ; HYBRID-NEXT:    cmoveq %r8, %rbx
-; HYBRID-NEXT:    movq %rbx, 16(%rax)
+; HYBRID-NEXT:    movq %rbx, 24(%rax)
+; HYBRID-NEXT:    cmovnsq %r9, %rdx
+; HYBRID-NEXT:    cmoveq %r8, %rdx
+; HYBRID-NEXT:    movq %rdx, 16(%rax)
 ; HYBRID-NEXT:    popq %rbx
 ; HYBRID-NEXT:    retq
 ;
@@ -121,14 +121,14 @@ define i256 @test1(i256 %a) nounwind {
 ; BURR-NEXT:    movl %r10d, %ecx
 ; BURR-NEXT:    shldq %cl, %rdi, %r11
 ; BURR-NEXT:    addb $-125, %sil
-; BURR-NEXT:    xorl %edx, %edx
+; BURR-NEXT:    xorl %ebx, %ebx
 ; BURR-NEXT:    movl %esi, %ecx
-; BURR-NEXT:    shldq %cl, %rdi, %rdx
-; BURR-NEXT:    movl $1, %ebx
-; BURR-NEXT:    shlq %cl, %rbx
+; BURR-NEXT:    shldq %cl, %rdi, %rbx
+; BURR-NEXT:    movl $1, %edx
+; BURR-NEXT:    shlq %cl, %rdx
 ; BURR-NEXT:    testb $64, %sil
-; BURR-NEXT:    cmovneq %rbx, %rdx
-; BURR-NEXT:    cmovneq %r8, %rbx
+; BURR-NEXT:    cmovneq %rdx, %rbx
+; BURR-NEXT:    cmovneq %r8, %rdx
 ; BURR-NEXT:    movl %r10d, %ecx
 ; BURR-NEXT:    shlq %cl, %rdi
 ; BURR-NEXT:    testb $64, %r10b
@@ -139,12 +139,12 @@ define i256 @test1(i256 %a) nounwind {
 ; BURR-NEXT:    movq %r11, 8(%rax)
 ; BURR-NEXT:    cmovsq %r8, %rdi
 ; BURR-NEXT:    movq %rdi, (%rax)
-; BURR-NEXT:    cmovnsq %r8, %rdx
-; BURR-NEXT:    cmoveq %r8, %rdx
-; BURR-NEXT:    movq %rdx, 24(%rax)
-; BURR-NEXT:    cmovnsq %r9, %rbx
+; BURR-NEXT:    cmovnsq %r8, %rbx
 ; BURR-NEXT:    cmoveq %r8, %rbx
-; BURR-NEXT:    movq %rbx, 16(%rax)
+; BURR-NEXT:    movq %rbx, 24(%rax)
+; BURR-NEXT:    cmovnsq %r9, %rdx
+; BURR-NEXT:    cmoveq %r8, %rdx
+; BURR-NEXT:    movq %rdx, 16(%rax)
 ; BURR-NEXT:    popq %rbx
 ; BURR-NEXT:    retq
 ;

diff --git a/llvm/test/CodeGen/X86/sdiv_fix.ll b/llvm/test/CodeGen/X86/sdiv_fix.ll
index b6a2e641fd65b..b9f9a489ab6ec 100644
--- a/llvm/test/CodeGen/X86/sdiv_fix.ll
+++ b/llvm/test/CodeGen/X86/sdiv_fix.ll
@@ -95,14 +95,14 @@ define i16 @func2(i8 %x, i8 %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shll $14, %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    cltd
-; X86-NEXT:    idivl %esi
-; X86-NEXT:    leal -1(%eax), %edi
-; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    idivl %edi
+; X86-NEXT:    leal -1(%eax), %esi
+; X86-NEXT:    testl %edi, %edi
 ; X86-NEXT:    sets %bl
 ; X86-NEXT:    testl %ecx, %ecx
 ; X86-NEXT:    sets %cl
@@ -110,9 +110,9 @@ define i16 @func2(i8 %x, i8 %y) nounwind {
 ; X86-NEXT:    testl %edx, %edx
 ; X86-NEXT:    setne %dl
 ; X86-NEXT:    testb %cl, %dl
-; X86-NEXT:    cmovel %eax, %edi
-; X86-NEXT:    addl %edi, %edi
-; X86-NEXT:    movswl %di, %eax
+; X86-NEXT:    cmovel %eax, %esi
+; X86-NEXT:    addl %esi, %esi
+; X86-NEXT:    movswl %si, %eax
 ; X86-NEXT:    shrl %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    popl %esi
@@ -222,31 +222,33 @@ define i4 @func4(i4 %x, i4 %y) nounwind {
 ;
 ; X86-LABEL: func4:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %esi
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    shlb $4, %cl
+; X86-NEXT:    sarb $4, %cl
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %dl
 ; X86-NEXT:    shlb $4, %dl
 ; X86-NEXT:    sarb $4, %dl
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %dh
-; X86-NEXT:    shlb $4, %dh
-; X86-NEXT:    sarb $4, %dh
-; X86-NEXT:    shlb $2, %dh
-; X86-NEXT:    movsbl %dh, %eax
-; X86-NEXT:    idivb %dl
-; X86-NEXT:    movsbl %ah, %ecx
+; X86-NEXT:    shlb $2, %dl
+; X86-NEXT:    movsbl %dl, %eax
+; X86-NEXT:    idivb %cl
+; X86-NEXT:    movsbl %ah, %ebx
 ; X86-NEXT:    movzbl %al, %esi
 ; X86-NEXT:    decb %al
 ; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    testb %cl, %cl
+; X86-NEXT:    sets %cl
 ; X86-NEXT:    testb %dl, %dl
 ; X86-NEXT:    sets %dl
-; X86-NEXT:    testb %dh, %dh
-; X86-NEXT:    sets %dh
-; X86-NEXT:    xorb %dl, %dh
-; X86-NEXT:    testb %cl, %cl
+; X86-NEXT:    xorb %cl, %dl
+; X86-NEXT:    testb %bl, %bl
 ; X86-NEXT:    setne %cl
-; X86-NEXT:    testb %dh, %cl
+; X86-NEXT:    testb %dl, %cl
 ; X86-NEXT:    cmovel %esi, %eax
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
   %tmp = call i4 @llvm.sdiv.fix.i4(i4 %x, i4 %y, i32 2)
   ret i4 %tmp
@@ -616,26 +618,26 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    sarl $31, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %ebp
-; X86-NEXT:    shll $31, %ebp
 ; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    shrl $31, %esi
-; X86-NEXT:    shldl $31, %ecx, %esi
+; X86-NEXT:    shll $31, %esi
+; X86-NEXT:    movl %ecx, %ebp
+; X86-NEXT:    shrl $31, %ebp
+; X86-NEXT:    shldl $31, %ecx, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %eax
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    calll __moddi3
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    calll __divdi3
 ; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    testl %ebp, %ebp
 ; X86-NEXT:    sets %cl
 ; X86-NEXT:    testl %ebx, %ebx
 ; X86-NEXT:    sets %dl

diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
index 26cbe90d8759c..192a124dfe832 100644
--- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
@@ -45,14 +45,14 @@ define i16 @func(i16 %x, i16 %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movswl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movswl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movswl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shll $8, %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    cltd
-; X86-NEXT:    idivl %esi
-; X86-NEXT:    leal -1(%eax), %edi
-; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    idivl %edi
+; X86-NEXT:    leal -1(%eax), %esi
+; X86-NEXT:    testl %edi, %edi
 ; X86-NEXT:    sets %bl
 ; X86-NEXT:    testl %ecx, %ecx
 ; X86-NEXT:    sets %cl
@@ -60,10 +60,10 @@ define i16 @func(i16 %x, i16 %y) nounwind {
 ; X86-NEXT:    testl %edx, %edx
 ; X86-NEXT:    setne %dl
 ; X86-NEXT:    testb %cl, %dl
-; X86-NEXT:    cmovel %eax, %edi
-; X86-NEXT:    cmpl $65535, %edi # imm = 0xFFFF
+; X86-NEXT:    cmovel %eax, %esi
+; X86-NEXT:    cmpl $65535, %esi # imm = 0xFFFF
 ; X86-NEXT:    movl $65535, %ecx # imm = 0xFFFF
-; X86-NEXT:    cmovll %edi, %ecx
+; X86-NEXT:    cmovll %esi, %ecx
 ; X86-NEXT:    cmpl $-65535, %ecx # imm = 0xFFFF0001
 ; X86-NEXT:    movl $-65536, %eax # imm = 0xFFFF0000
 ; X86-NEXT:    cmovgel %ecx, %eax
@@ -114,14 +114,14 @@ define i16 @func2(i8 %x, i8 %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shll $14, %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    cltd
-; X86-NEXT:    idivl %esi
-; X86-NEXT:    leal -1(%eax), %edi
-; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    idivl %edi
+; X86-NEXT:    leal -1(%eax), %esi
+; X86-NEXT:    testl %edi, %edi
 ; X86-NEXT:    sets %bl
 ; X86-NEXT:    testl %ecx, %ecx
 ; X86-NEXT:    sets %cl
@@ -129,10 +129,10 @@ define i16 @func2(i8 %x, i8 %y) nounwind {
 ; X86-NEXT:    testl %edx, %edx
 ; X86-NEXT:    setne %dl
 ; X86-NEXT:    testb %cl, %dl
-; X86-NEXT:    cmovel %eax, %edi
-; X86-NEXT:    cmpl $16383, %edi # imm = 0x3FFF
+; X86-NEXT:    cmovel %eax, %esi
+; X86-NEXT:    cmpl $16383, %esi # imm = 0x3FFF
 ; X86-NEXT:    movl $16383, %ecx # imm = 0x3FFF
-; X86-NEXT:    cmovll %edi, %ecx
+; X86-NEXT:    cmovll %esi, %ecx
 ; X86-NEXT:    cmpl $-16383, %ecx # imm = 0xC001
 ; X86-NEXT:    movl $-16384, %eax # imm = 0xC000
 ; X86-NEXT:    cmovgel %ecx, %eax
@@ -188,27 +188,27 @@ define i16 @func3(i15 %x, i8 %y) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shll $8, %eax
-; X86-NEXT:    movswl %ax, %esi
+; X86-NEXT:    movswl %ax, %edi
 ; X86-NEXT:    addl %ecx, %ecx
-; X86-NEXT:    shrl $4, %esi
+; X86-NEXT:    shrl $4, %edi
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    cwtd
-; X86-NEXT:    idivw %si
+; X86-NEXT:    idivw %di
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
-; X86-NEXT:    leal -1(%eax), %edi
+; X86-NEXT:    leal -1(%eax), %esi
 ; X86-NEXT:    testw %cx, %cx
 ; X86-NEXT:    sets %cl
-; X86-NEXT:    testw %si, %si
+; X86-NEXT:    testw %di, %di
 ; X86-NEXT:    sets %ch
 ; X86-NEXT:    xorb %cl, %ch
 ; X86-NEXT:    testw %dx, %dx
 ; X86-NEXT:    setne %cl
 ; X86-NEXT:    testb %ch, %cl
-; X86-NEXT:    cmovel %eax, %edi
-; X86-NEXT:    movswl %di, %eax
+; X86-NEXT:    cmovel %eax, %esi
+; X86-NEXT:    movswl %si, %eax
 ; X86-NEXT:    cmpl $16383, %eax # imm = 0x3FFF
 ; X86-NEXT:    movl $16383, %ecx # imm = 0x3FFF
-; X86-NEXT:    cmovll %edi, %ecx
+; X86-NEXT:    cmovll %esi, %ecx
 ; X86-NEXT:    movswl %cx, %eax
 ; X86-NEXT:    cmpl $-16383, %eax # imm = 0xC001
 ; X86-NEXT:    movl $49152, %eax # imm = 0xC000
@@ -262,28 +262,29 @@ define i4 @func4(i4 %x, i4 %y) nounwind {
 ;
 ; X86-LABEL: func4:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %esi
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    shlb $4, %cl
+; X86-NEXT:    sarb $4, %cl
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %dl
 ; X86-NEXT:    shlb $4, %dl
 ; X86-NEXT:    sarb $4, %dl
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %dh
-; X86-NEXT:    shlb $4, %dh
-; X86-NEXT:    sarb $4, %dh
-; X86-NEXT:    shlb $2, %dh
-; X86-NEXT:    movsbl %dh, %eax
-; X86-NEXT:    idivb %dl
-; X86-NEXT:    movsbl %ah, %ecx
+; X86-NEXT:    shlb $2, %dl
+; X86-NEXT:    movsbl %dl, %eax
+; X86-NEXT:    idivb %cl
+; X86-NEXT:    movsbl %ah, %ebx
 ; X86-NEXT:    movzbl %al, %esi
 ; X86-NEXT:    decb %al
 ; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    testb %cl, %cl
+; X86-NEXT:    sets %cl
 ; X86-NEXT:    testb %dl, %dl
 ; X86-NEXT:    sets %dl
-; X86-NEXT:    testb %dh, %dh
-; X86-NEXT:    sets %dh
-; X86-NEXT:    xorb %dl, %dh
-; X86-NEXT:    testb %cl, %cl
+; X86-NEXT:    xorb %cl, %dl
+; X86-NEXT:    testb %bl, %bl
 ; X86-NEXT:    setne %cl
-; X86-NEXT:    testb %dh, %cl
+; X86-NEXT:    testb %dl, %cl
 ; X86-NEXT:    cmovel %esi, %eax
 ; X86-NEXT:    cmpb $7, %al
 ; X86-NEXT:    movl $7, %ecx
@@ -293,6 +294,7 @@ define i4 @func4(i4 %x, i4 %y) nounwind {
 ; X86-NEXT:    cmovgel %ecx, %eax
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
   %tmp = call i4 @llvm.sdiv.fix.sat.i4(i4 %x, i4 %y, i32 2)
   ret i4 %tmp
@@ -533,14 +535,14 @@ define i18 @func6(i16 %x, i16 %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movswl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movswl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movswl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shll $7, %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    cltd
-; X86-NEXT:    idivl %esi
-; X86-NEXT:    leal -1(%eax), %edi
-; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    idivl %edi
+; X86-NEXT:    leal -1(%eax), %esi
+; X86-NEXT:    testl %edi, %edi
 ; X86-NEXT:    sets %bl
 ; X86-NEXT:    testl %ecx, %ecx
 ; X86-NEXT:    sets %cl
@@ -548,10 +550,10 @@ define i18 @func6(i16 %x, i16 %y) nounwind {
 ; X86-NEXT:    testl %edx, %edx
 ; X86-NEXT:    setne %dl
 ; X86-NEXT:    testb %cl, %dl
-; X86-NEXT:    cmovel %eax, %edi
-; X86-NEXT:    cmpl $131071, %edi # imm = 0x1FFFF
+; X86-NEXT:    cmovel %eax, %esi
+; X86-NEXT:    cmpl $131071, %esi # imm = 0x1FFFF
 ; X86-NEXT:    movl $131071, %ecx # imm = 0x1FFFF
-; X86-NEXT:    cmovll %edi, %ecx
+; X86-NEXT:    cmovll %esi, %ecx
 ; X86-NEXT:    cmpl $-131071, %ecx # imm = 0xFFFE0001
 ; X86-NEXT:    movl $-131072, %eax # imm = 0xFFFE0000
 ; X86-NEXT:    cmovgel %ecx, %eax
@@ -983,10 +985,9 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    pushl 36(%ebp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    pushl %eax
@@ -997,30 +998,30 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    subl $1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %ecx
-; X86-NEXT:    sbbl $0, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl $0, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    sbbl $0, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl $0, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl $0, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    testl %edi, %edi
-; X86-NEXT:    sets %al
+; X86-NEXT:    sets %cl
 ; X86-NEXT:    cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    sets %ah
-; X86-NEXT:    xorb %al, %ah
+; X86-NEXT:    sets %ch
+; X86-NEXT:    xorb %cl, %ch
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    orl %edi, %edx
-; X86-NEXT:    setne %al
-; X86-NEXT:    testb %ah, %al
-; X86-NEXT:    cmovel %esi, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    setne %cl
+; X86-NEXT:    testb %ch, %cl
+; X86-NEXT:    cmovel %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -1034,15 +1035,15 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    subl $1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %ecx
 ; X86-NEXT:    sbbl $0, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    sbbl $0, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl $0, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    testl %ebx, %ebx
@@ -1050,23 +1051,23 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    sets %bh
 ; X86-NEXT:    xorb %bl, %bh
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    orl %edi, %eax
+; X86-NEXT:    orl %esi, %eax
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    testb %bh, %al
-; X86-NEXT:    cmovel %edx, %ecx
+; X86-NEXT:    cmovel %edi, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    cmovel %edx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    cmovel %esi, %eax
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill

diff --git a/llvm/test/CodeGen/X86/select.ll b/llvm/test/CodeGen/X86/select.ll
index 9fc5fd477c5e7..7b2e5a97fcd7c 100644
--- a/llvm/test/CodeGen/X86/select.ll
+++ b/llvm/test/CodeGen/X86/select.ll
@@ -295,39 +295,39 @@ define void @test6(i32 %C, <4 x float>* %A, <4 x float>* %B) nounwind {
 ;
 ; ATHLON-LABEL: test6:
 ; ATHLON:       ## %bb.0:
-; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; ATHLON-NEXT:    flds 12(%ecx)
-; ATHLON-NEXT:    flds 8(%ecx)
-; ATHLON-NEXT:    flds 4(%ecx)
-; ATHLON-NEXT:    flds (%ecx)
+; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; ATHLON-NEXT:    flds 12(%eax)
+; ATHLON-NEXT:    flds 8(%eax)
+; ATHLON-NEXT:    flds 4(%eax)
 ; ATHLON-NEXT:    flds (%eax)
+; ATHLON-NEXT:    flds (%ecx)
 ; ATHLON-NEXT:    fmul %st, %st(0)
 ; ATHLON-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
 ; ATHLON-NEXT:    fxch %st(1)
 ; ATHLON-NEXT:    fcmove %st(1), %st
 ; ATHLON-NEXT:    fstp %st(1)
-; ATHLON-NEXT:    flds 4(%eax)
+; ATHLON-NEXT:    flds 4(%ecx)
 ; ATHLON-NEXT:    fmul %st, %st(0)
 ; ATHLON-NEXT:    fxch %st(2)
 ; ATHLON-NEXT:    fcmove %st(2), %st
 ; ATHLON-NEXT:    fstp %st(2)
-; ATHLON-NEXT:    flds 8(%eax)
+; ATHLON-NEXT:    flds 8(%ecx)
 ; ATHLON-NEXT:    fmul %st, %st(0)
 ; ATHLON-NEXT:    fxch %st(3)
 ; ATHLON-NEXT:    fcmove %st(3), %st
 ; ATHLON-NEXT:    fstp %st(3)
-; ATHLON-NEXT:    flds 12(%eax)
+; ATHLON-NEXT:    flds 12(%ecx)
 ; ATHLON-NEXT:    fmul %st, %st(0)
 ; ATHLON-NEXT:    fxch %st(4)
 ; ATHLON-NEXT:    fcmove %st(4), %st
 ; ATHLON-NEXT:    fstp %st(4)
 ; ATHLON-NEXT:    fxch %st(3)
-; ATHLON-NEXT:    fstps 12(%ecx)
+; ATHLON-NEXT:    fstps 12(%eax)
 ; ATHLON-NEXT:    fxch %st(1)
-; ATHLON-NEXT:    fstps 8(%ecx)
-; ATHLON-NEXT:    fstps 4(%ecx)
-; ATHLON-NEXT:    fstps (%ecx)
+; ATHLON-NEXT:    fstps 8(%eax)
+; ATHLON-NEXT:    fstps 4(%eax)
+; ATHLON-NEXT:    fstps (%eax)
 ; ATHLON-NEXT:    retl
 ;
 ; MCU-LABEL: test6:
@@ -508,43 +508,43 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2)
 ; ATHLON-NEXT:    pushl %edi
 ; ATHLON-NEXT:    pushl %esi
 ; ATHLON-NEXT:    testb $1, {{[0-9]+}}(%esp)
-; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %ecx
-; ATHLON-NEXT:    cmovnel %eax, %ecx
 ; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; ATHLON-NEXT:    cmovnel %ecx, %eax
+; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; ATHLON-NEXT:    cmovnel %edx, %ecx
 ; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %edx
-; ATHLON-NEXT:    cmovnel %eax, %edx
-; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %esi
-; ATHLON-NEXT:    cmovnel %eax, %esi
-; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; ATHLON-NEXT:    cmovnel %edx, %esi
+; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %edx
 ; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %edi
-; ATHLON-NEXT:    cmovnel %eax, %edi
-; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; ATHLON-NEXT:    cmovnel %edx, %edi
+; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %edx
 ; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %ebx
-; ATHLON-NEXT:    cmovnel %eax, %ebx
-; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; ATHLON-NEXT:    cmovnel %edx, %ebx
+; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %edx
 ; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %ebp
-; ATHLON-NEXT:    cmovnel %eax, %ebp
-; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; ATHLON-NEXT:    cmovnel %edx, %ebp
+; ATHLON-NEXT:    movl (%eax), %eax
 ; ATHLON-NEXT:    movl (%ecx), %ecx
-; ATHLON-NEXT:    movl (%edx), %edx
-; ATHLON-NEXT:    movl (%esi), %esi
-; ATHLON-NEXT:    movl (%edi), %edi
+; ATHLON-NEXT:    movl (%esi), %edx
+; ATHLON-NEXT:    movl (%edi), %esi
 ; ATHLON-NEXT:    movl (%ebx), %ebx
-; ATHLON-NEXT:    movl (%ebp), %ebp
+; ATHLON-NEXT:    movl (%ebp), %edi
+; ATHLON-NEXT:    decl %eax
+; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; ATHLON-NEXT:    movl %eax, 20(%ebp)
 ; ATHLON-NEXT:    decl %ecx
-; ATHLON-NEXT:    movl %ecx, 20(%eax)
+; ATHLON-NEXT:    movl %ecx, 16(%ebp)
 ; ATHLON-NEXT:    decl %edx
-; ATHLON-NEXT:    movl %edx, 16(%eax)
+; ATHLON-NEXT:    movl %edx, 12(%ebp)
 ; ATHLON-NEXT:    decl %esi
-; ATHLON-NEXT:    movl %esi, 12(%eax)
-; ATHLON-NEXT:    decl %edi
-; ATHLON-NEXT:    movl %edi, 8(%eax)
+; ATHLON-NEXT:    movl %esi, 8(%ebp)
 ; ATHLON-NEXT:    decl %ebx
-; ATHLON-NEXT:    movl %ebx, 4(%eax)
-; ATHLON-NEXT:    decl %ebp
-; ATHLON-NEXT:    movl %ebp, (%eax)
+; ATHLON-NEXT:    movl %ebx, 4(%ebp)
+; ATHLON-NEXT:    decl %edi
+; ATHLON-NEXT:    movl %edi, (%ebp)
 ; ATHLON-NEXT:    popl %esi
 ; ATHLON-NEXT:    popl %edi
 ; ATHLON-NEXT:    popl %ebx

diff --git a/llvm/test/CodeGen/X86/setcc-wide-types.ll b/llvm/test/CodeGen/X86/setcc-wide-types.ll
index 8a788f41d5cc2..d0c0304b34629 100644
--- a/llvm/test/CodeGen/X86/setcc-wide-types.ll
+++ b/llvm/test/CodeGen/X86/setcc-wide-types.ll
@@ -236,45 +236,45 @@ define i32 @ne_i512(<8 x i64> %x, <8 x i64> %y) {
 ; SSE2-LABEL: ne_i512:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3]
-; SSE2-NEXT:    movq %xmm8, %rax
+; SSE2-NEXT:    movq %xmm8, %rdx
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm2[2,3,2,3]
-; SSE2-NEXT:    movq %xmm8, %rcx
+; SSE2-NEXT:    movq %xmm8, %rsi
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm1[2,3,2,3]
-; SSE2-NEXT:    movq %xmm8, %rdx
+; SSE2-NEXT:    movq %xmm8, %rdi
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3]
-; SSE2-NEXT:    movq %xmm8, %rsi
+; SSE2-NEXT:    movq %xmm8, %rax
 ; SSE2-NEXT:    movq %xmm0, %r11
-; SSE2-NEXT:    movq %xmm2, %r8
+; SSE2-NEXT:    movq %xmm2, %r10
 ; SSE2-NEXT:    movq %xmm1, %r9
-; SSE2-NEXT:    movq %xmm3, %r10
+; SSE2-NEXT:    movq %xmm3, %r8
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
-; SSE2-NEXT:    movq %xmm0, %rdi
-; SSE2-NEXT:    xorq %rax, %rdi
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
-; SSE2-NEXT:    movq %xmm0, %rax
-; SSE2-NEXT:    xorq %rcx, %rax
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
 ; SSE2-NEXT:    movq %xmm0, %rcx
 ; SSE2-NEXT:    xorq %rdx, %rcx
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
 ; SSE2-NEXT:    movq %xmm0, %rdx
 ; SSE2-NEXT:    xorq %rsi, %rdx
-; SSE2-NEXT:    orq %rcx, %rdx
-; SSE2-NEXT:    orq %rax, %rdx
-; SSE2-NEXT:    orq %rdi, %rdx
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %rsi
+; SSE2-NEXT:    xorq %rdi, %rsi
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %rdi
+; SSE2-NEXT:    xorq %rax, %rdi
+; SSE2-NEXT:    orq %rsi, %rdi
+; SSE2-NEXT:    orq %rdx, %rdi
+; SSE2-NEXT:    orq %rcx, %rdi
 ; SSE2-NEXT:    movq %xmm4, %rax
 ; SSE2-NEXT:    xorq %r11, %rax
 ; SSE2-NEXT:    movq %xmm6, %rcx
-; SSE2-NEXT:    xorq %r8, %rcx
-; SSE2-NEXT:    movq %xmm5, %rsi
-; SSE2-NEXT:    xorq %r9, %rsi
-; SSE2-NEXT:    movq %xmm7, %rdi
-; SSE2-NEXT:    xorq %r10, %rdi
-; SSE2-NEXT:    orq %rsi, %rdi
-; SSE2-NEXT:    orq %rcx, %rdi
-; SSE2-NEXT:    orq %rax, %rdi
+; SSE2-NEXT:    xorq %r10, %rcx
+; SSE2-NEXT:    movq %xmm5, %rdx
+; SSE2-NEXT:    xorq %r9, %rdx
+; SSE2-NEXT:    movq %xmm7, %rsi
+; SSE2-NEXT:    xorq %r8, %rsi
+; SSE2-NEXT:    orq %rdx, %rsi
+; SSE2-NEXT:    orq %rcx, %rsi
+; SSE2-NEXT:    orq %rax, %rsi
 ; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    orq %rdx, %rdi
+; SSE2-NEXT:    orq %rdi, %rsi
 ; SSE2-NEXT:    setne %al
 ; SSE2-NEXT:    retq
 ;
@@ -317,84 +317,84 @@ define i32 @ne_i512(<8 x i64> %x, <8 x i64> %y) {
 ;
 ; AVX1-LABEL: ne_i512:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovq %xmm0, %rax
-; AVX1-NEXT:    vmovq %xmm1, %rcx
+; AVX1-NEXT:    vmovq %xmm0, %rdx
+; AVX1-NEXT:    vmovq %xmm1, %rsi
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vmovq %xmm4, %rdx
+; AVX1-NEXT:    vmovq %xmm4, %rdi
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT:    vmovq %xmm5, %rsi
+; AVX1-NEXT:    vmovq %xmm5, %rax
 ; AVX1-NEXT:    vpextrq $1, %xmm0, %r11
-; AVX1-NEXT:    vpextrq $1, %xmm1, %r8
+; AVX1-NEXT:    vpextrq $1, %xmm1, %r10
 ; AVX1-NEXT:    vpextrq $1, %xmm4, %r9
-; AVX1-NEXT:    vpextrq $1, %xmm5, %r10
-; AVX1-NEXT:    vmovq %xmm2, %rdi
-; AVX1-NEXT:    xorq %rax, %rdi
-; AVX1-NEXT:    vmovq %xmm3, %rax
-; AVX1-NEXT:    xorq %rcx, %rax
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm0
-; AVX1-NEXT:    vmovq %xmm0, %rcx
+; AVX1-NEXT:    vpextrq $1, %xmm5, %r8
+; AVX1-NEXT:    vmovq %xmm2, %rcx
 ; AVX1-NEXT:    xorq %rdx, %rcx
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm1
-; AVX1-NEXT:    vmovq %xmm1, %rdx
+; AVX1-NEXT:    vmovq %xmm3, %rdx
 ; AVX1-NEXT:    xorq %rsi, %rdx
-; AVX1-NEXT:    orq %rcx, %rdx
-; AVX1-NEXT:    orq %rax, %rdx
-; AVX1-NEXT:    orq %rdi, %rdx
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %rsi
+; AVX1-NEXT:    xorq %rdi, %rsi
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm1
+; AVX1-NEXT:    vmovq %xmm1, %rdi
+; AVX1-NEXT:    xorq %rax, %rdi
+; AVX1-NEXT:    orq %rsi, %rdi
+; AVX1-NEXT:    orq %rdx, %rdi
+; AVX1-NEXT:    orq %rcx, %rdi
 ; AVX1-NEXT:    vpextrq $1, %xmm2, %rax
 ; AVX1-NEXT:    xorq %r11, %rax
 ; AVX1-NEXT:    vpextrq $1, %xmm3, %rcx
-; AVX1-NEXT:    xorq %r8, %rcx
-; AVX1-NEXT:    vpextrq $1, %xmm0, %rsi
-; AVX1-NEXT:    xorq %r9, %rsi
-; AVX1-NEXT:    vpextrq $1, %xmm1, %rdi
-; AVX1-NEXT:    xorq %r10, %rdi
-; AVX1-NEXT:    orq %rsi, %rdi
-; AVX1-NEXT:    orq %rcx, %rdi
-; AVX1-NEXT:    orq %rax, %rdi
+; AVX1-NEXT:    xorq %r10, %rcx
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX1-NEXT:    xorq %r9, %rdx
+; AVX1-NEXT:    vpextrq $1, %xmm1, %rsi
+; AVX1-NEXT:    xorq %r8, %rsi
+; AVX1-NEXT:    orq %rdx, %rsi
+; AVX1-NEXT:    orq %rcx, %rsi
+; AVX1-NEXT:    orq %rax, %rsi
 ; AVX1-NEXT:    xorl %eax, %eax
-; AVX1-NEXT:    orq %rdx, %rdi
+; AVX1-NEXT:    orq %rdi, %rsi
 ; AVX1-NEXT:    setne %al
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: ne_i512:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovq %xmm0, %rax
-; AVX2-NEXT:    vmovq %xmm1, %rcx
+; AVX2-NEXT:    vmovq %xmm0, %rdx
+; AVX2-NEXT:    vmovq %xmm1, %rsi
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
-; AVX2-NEXT:    vmovq %xmm4, %rdx
+; AVX2-NEXT:    vmovq %xmm4, %rdi
 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5
-; AVX2-NEXT:    vmovq %xmm5, %rsi
+; AVX2-NEXT:    vmovq %xmm5, %rax
 ; AVX2-NEXT:    vpextrq $1, %xmm0, %r11
-; AVX2-NEXT:    vpextrq $1, %xmm1, %r8
+; AVX2-NEXT:    vpextrq $1, %xmm1, %r10
 ; AVX2-NEXT:    vpextrq $1, %xmm4, %r9
-; AVX2-NEXT:    vpextrq $1, %xmm5, %r10
-; AVX2-NEXT:    vmovq %xmm2, %rdi
-; AVX2-NEXT:    xorq %rax, %rdi
-; AVX2-NEXT:    vmovq %xmm3, %rax
-; AVX2-NEXT:    xorq %rcx, %rax
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm0
-; AVX2-NEXT:    vmovq %xmm0, %rcx
+; AVX2-NEXT:    vpextrq $1, %xmm5, %r8
+; AVX2-NEXT:    vmovq %xmm2, %rcx
 ; AVX2-NEXT:    xorq %rdx, %rcx
-; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm1
-; AVX2-NEXT:    vmovq %xmm1, %rdx
+; AVX2-NEXT:    vmovq %xmm3, %rdx
 ; AVX2-NEXT:    xorq %rsi, %rdx
-; AVX2-NEXT:    orq %rcx, %rdx
-; AVX2-NEXT:    orq %rax, %rdx
-; AVX2-NEXT:    orq %rdi, %rdx
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rsi
+; AVX2-NEXT:    xorq %rdi, %rsi
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm1
+; AVX2-NEXT:    vmovq %xmm1, %rdi
+; AVX2-NEXT:    xorq %rax, %rdi
+; AVX2-NEXT:    orq %rsi, %rdi
+; AVX2-NEXT:    orq %rdx, %rdi
+; AVX2-NEXT:    orq %rcx, %rdi
 ; AVX2-NEXT:    vpextrq $1, %xmm2, %rax
 ; AVX2-NEXT:    xorq %r11, %rax
 ; AVX2-NEXT:    vpextrq $1, %xmm3, %rcx
-; AVX2-NEXT:    xorq %r8, %rcx
-; AVX2-NEXT:    vpextrq $1, %xmm0, %rsi
-; AVX2-NEXT:    xorq %r9, %rsi
-; AVX2-NEXT:    vpextrq $1, %xmm1, %rdi
-; AVX2-NEXT:    xorq %r10, %rdi
-; AVX2-NEXT:    orq %rsi, %rdi
-; AVX2-NEXT:    orq %rcx, %rdi
-; AVX2-NEXT:    orq %rax, %rdi
+; AVX2-NEXT:    xorq %r10, %rcx
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX2-NEXT:    xorq %r9, %rdx
+; AVX2-NEXT:    vpextrq $1, %xmm1, %rsi
+; AVX2-NEXT:    xorq %r8, %rsi
+; AVX2-NEXT:    orq %rdx, %rsi
+; AVX2-NEXT:    orq %rcx, %rsi
+; AVX2-NEXT:    orq %rax, %rsi
 ; AVX2-NEXT:    xorl %eax, %eax
-; AVX2-NEXT:    orq %rdx, %rdi
+; AVX2-NEXT:    orq %rdi, %rsi
 ; AVX2-NEXT:    setne %al
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -427,45 +427,45 @@ define i32 @eq_i512(<8 x i64> %x, <8 x i64> %y) {
 ; SSE2-LABEL: eq_i512:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3]
-; SSE2-NEXT:    movq %xmm8, %rax
+; SSE2-NEXT:    movq %xmm8, %rdx
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm2[2,3,2,3]
-; SSE2-NEXT:    movq %xmm8, %rcx
+; SSE2-NEXT:    movq %xmm8, %rsi
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm1[2,3,2,3]
-; SSE2-NEXT:    movq %xmm8, %rdx
+; SSE2-NEXT:    movq %xmm8, %rdi
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3]
-; SSE2-NEXT:    movq %xmm8, %rsi
+; SSE2-NEXT:    movq %xmm8, %rax
 ; SSE2-NEXT:    movq %xmm0, %r11
-; SSE2-NEXT:    movq %xmm2, %r8
+; SSE2-NEXT:    movq %xmm2, %r10
 ; SSE2-NEXT:    movq %xmm1, %r9
-; SSE2-NEXT:    movq %xmm3, %r10
+; SSE2-NEXT:    movq %xmm3, %r8
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
-; SSE2-NEXT:    movq %xmm0, %rdi
-; SSE2-NEXT:    xorq %rax, %rdi
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
-; SSE2-NEXT:    movq %xmm0, %rax
-; SSE2-NEXT:    xorq %rcx, %rax
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
 ; SSE2-NEXT:    movq %xmm0, %rcx
 ; SSE2-NEXT:    xorq %rdx, %rcx
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
 ; SSE2-NEXT:    movq %xmm0, %rdx
 ; SSE2-NEXT:    xorq %rsi, %rdx
-; SSE2-NEXT:    orq %rcx, %rdx
-; SSE2-NEXT:    orq %rax, %rdx
-; SSE2-NEXT:    orq %rdi, %rdx
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %rsi
+; SSE2-NEXT:    xorq %rdi, %rsi
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %rdi
+; SSE2-NEXT:    xorq %rax, %rdi
+; SSE2-NEXT:    orq %rsi, %rdi
+; SSE2-NEXT:    orq %rdx, %rdi
+; SSE2-NEXT:    orq %rcx, %rdi
 ; SSE2-NEXT:    movq %xmm4, %rax
 ; SSE2-NEXT:    xorq %r11, %rax
 ; SSE2-NEXT:    movq %xmm6, %rcx
-; SSE2-NEXT:    xorq %r8, %rcx
-; SSE2-NEXT:    movq %xmm5, %rsi
-; SSE2-NEXT:    xorq %r9, %rsi
-; SSE2-NEXT:    movq %xmm7, %rdi
-; SSE2-NEXT:    xorq %r10, %rdi
-; SSE2-NEXT:    orq %rsi, %rdi
-; SSE2-NEXT:    orq %rcx, %rdi
-; SSE2-NEXT:    orq %rax, %rdi
+; SSE2-NEXT:    xorq %r10, %rcx
+; SSE2-NEXT:    movq %xmm5, %rdx
+; SSE2-NEXT:    xorq %r9, %rdx
+; SSE2-NEXT:    movq %xmm7, %rsi
+; SSE2-NEXT:    xorq %r8, %rsi
+; SSE2-NEXT:    orq %rdx, %rsi
+; SSE2-NEXT:    orq %rcx, %rsi
+; SSE2-NEXT:    orq %rax, %rsi
 ; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    orq %rdx, %rdi
+; SSE2-NEXT:    orq %rdi, %rsi
 ; SSE2-NEXT:    sete %al
 ; SSE2-NEXT:    retq
 ;
@@ -508,84 +508,84 @@ define i32 @eq_i512(<8 x i64> %x, <8 x i64> %y) {
 ;
 ; AVX1-LABEL: eq_i512:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovq %xmm0, %rax
-; AVX1-NEXT:    vmovq %xmm1, %rcx
+; AVX1-NEXT:    vmovq %xmm0, %rdx
+; AVX1-NEXT:    vmovq %xmm1, %rsi
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vmovq %xmm4, %rdx
+; AVX1-NEXT:    vmovq %xmm4, %rdi
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT:    vmovq %xmm5, %rsi
+; AVX1-NEXT:    vmovq %xmm5, %rax
 ; AVX1-NEXT:    vpextrq $1, %xmm0, %r11
-; AVX1-NEXT:    vpextrq $1, %xmm1, %r8
+; AVX1-NEXT:    vpextrq $1, %xmm1, %r10
 ; AVX1-NEXT:    vpextrq $1, %xmm4, %r9
-; AVX1-NEXT:    vpextrq $1, %xmm5, %r10
-; AVX1-NEXT:    vmovq %xmm2, %rdi
-; AVX1-NEXT:    xorq %rax, %rdi
-; AVX1-NEXT:    vmovq %xmm3, %rax
-; AVX1-NEXT:    xorq %rcx, %rax
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm0
-; AVX1-NEXT:    vmovq %xmm0, %rcx
+; AVX1-NEXT:    vpextrq $1, %xmm5, %r8
+; AVX1-NEXT:    vmovq %xmm2, %rcx
 ; AVX1-NEXT:    xorq %rdx, %rcx
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm1
-; AVX1-NEXT:    vmovq %xmm1, %rdx
+; AVX1-NEXT:    vmovq %xmm3, %rdx
 ; AVX1-NEXT:    xorq %rsi, %rdx
-; AVX1-NEXT:    orq %rcx, %rdx
-; AVX1-NEXT:    orq %rax, %rdx
-; AVX1-NEXT:    orq %rdi, %rdx
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %rsi
+; AVX1-NEXT:    xorq %rdi, %rsi
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm1
+; AVX1-NEXT:    vmovq %xmm1, %rdi
+; AVX1-NEXT:    xorq %rax, %rdi
+; AVX1-NEXT:    orq %rsi, %rdi
+; AVX1-NEXT:    orq %rdx, %rdi
+; AVX1-NEXT:    orq %rcx, %rdi
 ; AVX1-NEXT:    vpextrq $1, %xmm2, %rax
 ; AVX1-NEXT:    xorq %r11, %rax
 ; AVX1-NEXT:    vpextrq $1, %xmm3, %rcx
-; AVX1-NEXT:    xorq %r8, %rcx
-; AVX1-NEXT:    vpextrq $1, %xmm0, %rsi
-; AVX1-NEXT:    xorq %r9, %rsi
-; AVX1-NEXT:    vpextrq $1, %xmm1, %rdi
-; AVX1-NEXT:    xorq %r10, %rdi
-; AVX1-NEXT:    orq %rsi, %rdi
-; AVX1-NEXT:    orq %rcx, %rdi
-; AVX1-NEXT:    orq %rax, %rdi
+; AVX1-NEXT:    xorq %r10, %rcx
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX1-NEXT:    xorq %r9, %rdx
+; AVX1-NEXT:    vpextrq $1, %xmm1, %rsi
+; AVX1-NEXT:    xorq %r8, %rsi
+; AVX1-NEXT:    orq %rdx, %rsi
+; AVX1-NEXT:    orq %rcx, %rsi
+; AVX1-NEXT:    orq %rax, %rsi
 ; AVX1-NEXT:    xorl %eax, %eax
-; AVX1-NEXT:    orq %rdx, %rdi
+; AVX1-NEXT:    orq %rdi, %rsi
 ; AVX1-NEXT:    sete %al
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: eq_i512:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovq %xmm0, %rax
-; AVX2-NEXT:    vmovq %xmm1, %rcx
+; AVX2-NEXT:    vmovq %xmm0, %rdx
+; AVX2-NEXT:    vmovq %xmm1, %rsi
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
-; AVX2-NEXT:    vmovq %xmm4, %rdx
+; AVX2-NEXT:    vmovq %xmm4, %rdi
 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5
-; AVX2-NEXT:    vmovq %xmm5, %rsi
+; AVX2-NEXT:    vmovq %xmm5, %rax
 ; AVX2-NEXT:    vpextrq $1, %xmm0, %r11
-; AVX2-NEXT:    vpextrq $1, %xmm1, %r8
+; AVX2-NEXT:    vpextrq $1, %xmm1, %r10
 ; AVX2-NEXT:    vpextrq $1, %xmm4, %r9
-; AVX2-NEXT:    vpextrq $1, %xmm5, %r10
-; AVX2-NEXT:    vmovq %xmm2, %rdi
-; AVX2-NEXT:    xorq %rax, %rdi
-; AVX2-NEXT:    vmovq %xmm3, %rax
-; AVX2-NEXT:    xorq %rcx, %rax
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm0
-; AVX2-NEXT:    vmovq %xmm0, %rcx
+; AVX2-NEXT:    vpextrq $1, %xmm5, %r8
+; AVX2-NEXT:    vmovq %xmm2, %rcx
 ; AVX2-NEXT:    xorq %rdx, %rcx
-; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm1
-; AVX2-NEXT:    vmovq %xmm1, %rdx
+; AVX2-NEXT:    vmovq %xmm3, %rdx
 ; AVX2-NEXT:    xorq %rsi, %rdx
-; AVX2-NEXT:    orq %rcx, %rdx
-; AVX2-NEXT:    orq %rax, %rdx
-; AVX2-NEXT:    orq %rdi, %rdx
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rsi
+; AVX2-NEXT:    xorq %rdi, %rsi
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm1
+; AVX2-NEXT:    vmovq %xmm1, %rdi
+; AVX2-NEXT:    xorq %rax, %rdi
+; AVX2-NEXT:    orq %rsi, %rdi
+; AVX2-NEXT:    orq %rdx, %rdi
+; AVX2-NEXT:    orq %rcx, %rdi
 ; AVX2-NEXT:    vpextrq $1, %xmm2, %rax
 ; AVX2-NEXT:    xorq %r11, %rax
 ; AVX2-NEXT:    vpextrq $1, %xmm3, %rcx
-; AVX2-NEXT:    xorq %r8, %rcx
-; AVX2-NEXT:    vpextrq $1, %xmm0, %rsi
-; AVX2-NEXT:    xorq %r9, %rsi
-; AVX2-NEXT:    vpextrq $1, %xmm1, %rdi
-; AVX2-NEXT:    xorq %r10, %rdi
-; AVX2-NEXT:    orq %rsi, %rdi
-; AVX2-NEXT:    orq %rcx, %rdi
-; AVX2-NEXT:    orq %rax, %rdi
+; AVX2-NEXT:    xorq %r10, %rcx
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX2-NEXT:    xorq %r9, %rdx
+; AVX2-NEXT:    vpextrq $1, %xmm1, %rsi
+; AVX2-NEXT:    xorq %r8, %rsi
+; AVX2-NEXT:    orq %rdx, %rsi
+; AVX2-NEXT:    orq %rcx, %rsi
+; AVX2-NEXT:    orq %rax, %rsi
 ; AVX2-NEXT:    xorl %eax, %eax
-; AVX2-NEXT:    orq %rdx, %rdi
+; AVX2-NEXT:    orq %rdi, %rsi
 ; AVX2-NEXT:    sete %al
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -744,22 +744,22 @@ define i32 @ne_i256_pair(i256* %a, i256* %b) {
 ; SSE2-NEXT:    xorq 24(%rsi), %r11
 ; SSE2-NEXT:    xorq (%rsi), %r8
 ; SSE2-NEXT:    xorq 16(%rsi), %r9
-; SSE2-NEXT:    movq 48(%rdi), %rdx
+; SSE2-NEXT:    movq 48(%rdi), %rcx
 ; SSE2-NEXT:    movq 32(%rdi), %rax
-; SSE2-NEXT:    movq 56(%rdi), %rcx
+; SSE2-NEXT:    movq 56(%rdi), %rdx
 ; SSE2-NEXT:    movq 40(%rdi), %rdi
 ; SSE2-NEXT:    xorq 40(%rsi), %rdi
-; SSE2-NEXT:    xorq 56(%rsi), %rcx
-; SSE2-NEXT:    orq %r11, %rcx
-; SSE2-NEXT:    orq %rdi, %rcx
-; SSE2-NEXT:    orq %r10, %rcx
+; SSE2-NEXT:    xorq 56(%rsi), %rdx
+; SSE2-NEXT:    orq %r11, %rdx
+; SSE2-NEXT:    orq %rdi, %rdx
+; SSE2-NEXT:    orq %r10, %rdx
 ; SSE2-NEXT:    xorq 32(%rsi), %rax
-; SSE2-NEXT:    xorq 48(%rsi), %rdx
-; SSE2-NEXT:    orq %r9, %rdx
-; SSE2-NEXT:    orq %rax, %rdx
-; SSE2-NEXT:    orq %r8, %rdx
+; SSE2-NEXT:    xorq 48(%rsi), %rcx
+; SSE2-NEXT:    orq %r9, %rcx
+; SSE2-NEXT:    orq %rax, %rcx
+; SSE2-NEXT:    orq %r8, %rcx
 ; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    orq %rcx, %rdx
+; SSE2-NEXT:    orq %rdx, %rcx
 ; SSE2-NEXT:    setne %al
 ; SSE2-NEXT:    retq
 ;
@@ -773,22 +773,22 @@ define i32 @ne_i256_pair(i256* %a, i256* %b) {
 ; SSE41-NEXT:    xorq 24(%rsi), %r11
 ; SSE41-NEXT:    xorq (%rsi), %r8
 ; SSE41-NEXT:    xorq 16(%rsi), %r9
-; SSE41-NEXT:    movq 48(%rdi), %rdx
+; SSE41-NEXT:    movq 48(%rdi), %rcx
 ; SSE41-NEXT:    movq 32(%rdi), %rax
-; SSE41-NEXT:    movq 56(%rdi), %rcx
+; SSE41-NEXT:    movq 56(%rdi), %rdx
 ; SSE41-NEXT:    movq 40(%rdi), %rdi
 ; SSE41-NEXT:    xorq 40(%rsi), %rdi
-; SSE41-NEXT:    xorq 56(%rsi), %rcx
-; SSE41-NEXT:    orq %r11, %rcx
-; SSE41-NEXT:    orq %rdi, %rcx
-; SSE41-NEXT:    orq %r10, %rcx
+; SSE41-NEXT:    xorq 56(%rsi), %rdx
+; SSE41-NEXT:    orq %r11, %rdx
+; SSE41-NEXT:    orq %rdi, %rdx
+; SSE41-NEXT:    orq %r10, %rdx
 ; SSE41-NEXT:    xorq 32(%rsi), %rax
-; SSE41-NEXT:    xorq 48(%rsi), %rdx
-; SSE41-NEXT:    orq %r9, %rdx
-; SSE41-NEXT:    orq %rax, %rdx
-; SSE41-NEXT:    orq %r8, %rdx
+; SSE41-NEXT:    xorq 48(%rsi), %rcx
+; SSE41-NEXT:    orq %r9, %rcx
+; SSE41-NEXT:    orq %rax, %rcx
+; SSE41-NEXT:    orq %r8, %rcx
 ; SSE41-NEXT:    xorl %eax, %eax
-; SSE41-NEXT:    orq %rcx, %rdx
+; SSE41-NEXT:    orq %rdx, %rcx
 ; SSE41-NEXT:    setne %al
 ; SSE41-NEXT:    retq
 ;
@@ -858,22 +858,22 @@ define i32 @eq_i256_pair(i256* %a, i256* %b) {
 ; SSE2-NEXT:    xorq 24(%rsi), %r11
 ; SSE2-NEXT:    xorq (%rsi), %r8
 ; SSE2-NEXT:    xorq 16(%rsi), %r9
-; SSE2-NEXT:    movq 48(%rdi), %rdx
+; SSE2-NEXT:    movq 48(%rdi), %rcx
 ; SSE2-NEXT:    movq 32(%rdi), %rax
-; SSE2-NEXT:    movq 56(%rdi), %rcx
+; SSE2-NEXT:    movq 56(%rdi), %rdx
 ; SSE2-NEXT:    movq 40(%rdi), %rdi
 ; SSE2-NEXT:    xorq 40(%rsi), %rdi
-; SSE2-NEXT:    xorq 56(%rsi), %rcx
-; SSE2-NEXT:    orq %r11, %rcx
-; SSE2-NEXT:    orq %rdi, %rcx
-; SSE2-NEXT:    orq %r10, %rcx
+; SSE2-NEXT:    xorq 56(%rsi), %rdx
+; SSE2-NEXT:    orq %r11, %rdx
+; SSE2-NEXT:    orq %rdi, %rdx
+; SSE2-NEXT:    orq %r10, %rdx
 ; SSE2-NEXT:    xorq 32(%rsi), %rax
-; SSE2-NEXT:    xorq 48(%rsi), %rdx
-; SSE2-NEXT:    orq %r9, %rdx
-; SSE2-NEXT:    orq %rax, %rdx
-; SSE2-NEXT:    orq %r8, %rdx
+; SSE2-NEXT:    xorq 48(%rsi), %rcx
+; SSE2-NEXT:    orq %r9, %rcx
+; SSE2-NEXT:    orq %rax, %rcx
+; SSE2-NEXT:    orq %r8, %rcx
 ; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    orq %rcx, %rdx
+; SSE2-NEXT:    orq %rdx, %rcx
 ; SSE2-NEXT:    sete %al
 ; SSE2-NEXT:    retq
 ;
@@ -887,22 +887,22 @@ define i32 @eq_i256_pair(i256* %a, i256* %b) {
 ; SSE41-NEXT:    xorq 24(%rsi), %r11
 ; SSE41-NEXT:    xorq (%rsi), %r8
 ; SSE41-NEXT:    xorq 16(%rsi), %r9
-; SSE41-NEXT:    movq 48(%rdi), %rdx
+; SSE41-NEXT:    movq 48(%rdi), %rcx
 ; SSE41-NEXT:    movq 32(%rdi), %rax
-; SSE41-NEXT:    movq 56(%rdi), %rcx
+; SSE41-NEXT:    movq 56(%rdi), %rdx
 ; SSE41-NEXT:    movq 40(%rdi), %rdi
 ; SSE41-NEXT:    xorq 40(%rsi), %rdi
-; SSE41-NEXT:    xorq 56(%rsi), %rcx
-; SSE41-NEXT:    orq %r11, %rcx
-; SSE41-NEXT:    orq %rdi, %rcx
-; SSE41-NEXT:    orq %r10, %rcx
+; SSE41-NEXT:    xorq 56(%rsi), %rdx
+; SSE41-NEXT:    orq %r11, %rdx
+; SSE41-NEXT:    orq %rdi, %rdx
+; SSE41-NEXT:    orq %r10, %rdx
 ; SSE41-NEXT:    xorq 32(%rsi), %rax
-; SSE41-NEXT:    xorq 48(%rsi), %rdx
-; SSE41-NEXT:    orq %r9, %rdx
-; SSE41-NEXT:    orq %rax, %rdx
-; SSE41-NEXT:    orq %r8, %rdx
+; SSE41-NEXT:    xorq 48(%rsi), %rcx
+; SSE41-NEXT:    orq %r9, %rcx
+; SSE41-NEXT:    orq %rax, %rcx
+; SSE41-NEXT:    orq %r8, %rcx
 ; SSE41-NEXT:    xorl %eax, %eax
-; SSE41-NEXT:    orq %rcx, %rdx
+; SSE41-NEXT:    orq %rdx, %rcx
 ; SSE41-NEXT:    sete %al
 ; SSE41-NEXT:    retq
 ;

diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll
index f09a9fc4a4be4..900f6a08076a5 100644
--- a/llvm/test/CodeGen/X86/shrink_vmul.ll
+++ b/llvm/test/CodeGen/X86/shrink_vmul.ll
@@ -18,20 +18,20 @@ define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-SSE-LABEL: mul_2xi8:
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-SSE-NEXT:    movl c, %esi
-; X86-SSE-NEXT:    movzwl (%edx,%ecx), %edx
+; X86-SSE-NEXT:    movzwl (%edx,%eax), %edx
 ; X86-SSE-NEXT:    movd %edx, %xmm0
-; X86-SSE-NEXT:    movzwl (%eax,%ecx), %eax
-; X86-SSE-NEXT:    movd %eax, %xmm1
+; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
+; X86-SSE-NEXT:    movd %ecx, %xmm1
 ; X86-SSE-NEXT:    pxor %xmm2, %xmm2
 ; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
 ; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-SSE-NEXT:    movq %xmm1, (%esi,%ecx,4)
+; X86-SSE-NEXT:    movq %xmm1, (%esi,%eax,4)
 ; X86-SSE-NEXT:    popl %esi
 ; X86-SSE-NEXT:    retl
 ;
@@ -184,10 +184,10 @@ define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-SSE-LABEL: mul_8xi8:
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    movl c, %esi
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT:    movl c, %ecx
 ; X86-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X86-SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X86-SSE-NEXT:    pxor %xmm2, %xmm2
@@ -197,8 +197,8 @@ define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
 ; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-SSE-NEXT:    movdqu %xmm1, 16(%esi,%ecx,4)
-; X86-SSE-NEXT:    movdqu %xmm0, (%esi,%ecx,4)
+; X86-SSE-NEXT:    movdqu %xmm1, 16(%ecx,%eax,4)
+; X86-SSE-NEXT:    movdqu %xmm0, (%ecx,%eax,4)
 ; X86-SSE-NEXT:    popl %esi
 ; X86-SSE-NEXT:    retl
 ;
@@ -300,41 +300,41 @@ define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-SSE-LABEL: mul_16xi8:
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    movl c, %esi
-; X86-SSE-NEXT:    movdqu (%edx,%ecx), %xmm0
-; X86-SSE-NEXT:    movdqu (%eax,%ecx), %xmm1
-; X86-SSE-NEXT:    pxor %xmm2, %xmm2
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT:    movl c, %ecx
+; X86-SSE-NEXT:    movdqu (%esi,%eax), %xmm3
+; X86-SSE-NEXT:    movdqu (%edx,%eax), %xmm0
+; X86-SSE-NEXT:    pxor %xmm1, %xmm1
+; X86-SSE-NEXT:    movdqa %xmm3, %xmm4
+; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; X86-SSE-NEXT:    pmullw %xmm4, %xmm2
+; X86-SSE-NEXT:    movdqa %xmm2, %xmm4
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; X86-SSE-NEXT:    pmullw %xmm3, %xmm0
 ; X86-SSE-NEXT:    movdqa %xmm0, %xmm3
-; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; X86-SSE-NEXT:    movdqa %xmm1, %xmm4
-; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; X86-SSE-NEXT:    pmullw %xmm3, %xmm4
-; X86-SSE-NEXT:    movdqa %xmm4, %xmm3
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
-; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-SSE-NEXT:    movdqu %xmm1, 48(%esi,%ecx,4)
-; X86-SSE-NEXT:    movdqu %xmm0, 32(%esi,%ecx,4)
-; X86-SSE-NEXT:    movdqu %xmm4, 16(%esi,%ecx,4)
-; X86-SSE-NEXT:    movdqu %xmm3, (%esi,%ecx,4)
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X86-SSE-NEXT:    movdqu %xmm0, 48(%ecx,%eax,4)
+; X86-SSE-NEXT:    movdqu %xmm3, 32(%ecx,%eax,4)
+; X86-SSE-NEXT:    movdqu %xmm2, 16(%ecx,%eax,4)
+; X86-SSE-NEXT:    movdqu %xmm4, (%ecx,%eax,4)
 ; X86-SSE-NEXT:    popl %esi
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: mul_16xi8:
 ; X86-AVX1:       # %bb.0: # %entry
 ; X86-AVX1-NEXT:    pushl %esi
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX1-NEXT:    movl c, %esi
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-AVX1-NEXT:    movl c, %ecx
 ; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
@@ -347,10 +347,10 @@ define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-AVX1-NEXT:    vpmaddwd %xmm2, %xmm4, %xmm2
 ; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; X86-AVX1-NEXT:    vpmaddwd %xmm3, %xmm4, %xmm3
-; X86-AVX1-NEXT:    vmovdqu %xmm0, 48(%esi,%ecx,4)
-; X86-AVX1-NEXT:    vmovdqu %xmm1, 32(%esi,%ecx,4)
-; X86-AVX1-NEXT:    vmovdqu %xmm2, 16(%esi,%ecx,4)
-; X86-AVX1-NEXT:    vmovdqu %xmm3, (%esi,%ecx,4)
+; X86-AVX1-NEXT:    vmovdqu %xmm0, 48(%ecx,%eax,4)
+; X86-AVX1-NEXT:    vmovdqu %xmm1, 32(%ecx,%eax,4)
+; X86-AVX1-NEXT:    vmovdqu %xmm2, 16(%ecx,%eax,4)
+; X86-AVX1-NEXT:    vmovdqu %xmm3, (%ecx,%eax,4)
 ; X86-AVX1-NEXT:    popl %esi
 ; X86-AVX1-NEXT:    retl
 ;
@@ -728,40 +728,40 @@ define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i6
 ; X86-SSE-LABEL: mul_16xi16:
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    movl c, %esi
-; X86-SSE-NEXT:    movdqu (%edx,%ecx), %xmm0
-; X86-SSE-NEXT:    movdqu 16(%edx,%ecx), %xmm1
-; X86-SSE-NEXT:    movdqu (%eax,%ecx), %xmm2
-; X86-SSE-NEXT:    movdqu 16(%eax,%ecx), %xmm3
-; X86-SSE-NEXT:    movdqa %xmm2, %xmm4
-; X86-SSE-NEXT:    pmulhuw %xmm0, %xmm4
-; X86-SSE-NEXT:    pmullw %xmm0, %xmm2
-; X86-SSE-NEXT:    movdqa %xmm2, %xmm0
-; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; X86-SSE-NEXT:    movdqa %xmm3, %xmm4
-; X86-SSE-NEXT:    pmulhuw %xmm1, %xmm4
-; X86-SSE-NEXT:    pmullw %xmm1, %xmm3
-; X86-SSE-NEXT:    movdqa %xmm3, %xmm1
-; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; X86-SSE-NEXT:    movdqu %xmm3, 32(%esi,%ecx,4)
-; X86-SSE-NEXT:    movdqu %xmm1, 48(%esi,%ecx,4)
-; X86-SSE-NEXT:    movdqu %xmm2, (%esi,%ecx,4)
-; X86-SSE-NEXT:    movdqu %xmm0, 16(%esi,%ecx,4)
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT:    movl c, %ecx
+; X86-SSE-NEXT:    movdqu (%esi,%eax), %xmm2
+; X86-SSE-NEXT:    movdqu 16(%esi,%eax), %xmm3
+; X86-SSE-NEXT:    movdqu (%edx,%eax), %xmm0
+; X86-SSE-NEXT:    movdqu 16(%edx,%eax), %xmm1
+; X86-SSE-NEXT:    movdqa %xmm0, %xmm4
+; X86-SSE-NEXT:    pmulhuw %xmm2, %xmm4
+; X86-SSE-NEXT:    pmullw %xmm2, %xmm0
+; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; X86-SSE-NEXT:    movdqa %xmm1, %xmm4
+; X86-SSE-NEXT:    pmulhuw %xmm3, %xmm4
+; X86-SSE-NEXT:    pmullw %xmm3, %xmm1
+; X86-SSE-NEXT:    movdqa %xmm1, %xmm3
+; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; X86-SSE-NEXT:    movdqu %xmm1, 32(%ecx,%eax,4)
+; X86-SSE-NEXT:    movdqu %xmm3, 48(%ecx,%eax,4)
+; X86-SSE-NEXT:    movdqu %xmm0, (%ecx,%eax,4)
+; X86-SSE-NEXT:    movdqu %xmm2, 16(%ecx,%eax,4)
 ; X86-SSE-NEXT:    popl %esi
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: mul_16xi16:
 ; X86-AVX1:       # %bb.0: # %entry
 ; X86-AVX1-NEXT:    pushl %esi
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX1-NEXT:    movl c, %esi
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-AVX1-NEXT:    movl c, %ecx
 ; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
 ; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
 ; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
@@ -774,10 +774,10 @@ define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i6
 ; X86-AVX1-NEXT:    vpmulld %xmm2, %xmm4, %xmm2
 ; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
 ; X86-AVX1-NEXT:    vpmulld %xmm3, %xmm4, %xmm3
-; X86-AVX1-NEXT:    vmovdqu %xmm0, 48(%esi,%ecx,4)
-; X86-AVX1-NEXT:    vmovdqu %xmm1, 32(%esi,%ecx,4)
-; X86-AVX1-NEXT:    vmovdqu %xmm2, 16(%esi,%ecx,4)
-; X86-AVX1-NEXT:    vmovdqu %xmm3, (%esi,%ecx,4)
+; X86-AVX1-NEXT:    vmovdqu %xmm0, 48(%ecx,%eax,4)
+; X86-AVX1-NEXT:    vmovdqu %xmm1, 32(%ecx,%eax,4)
+; X86-AVX1-NEXT:    vmovdqu %xmm2, 16(%ecx,%eax,4)
+; X86-AVX1-NEXT:    vmovdqu %xmm3, (%ecx,%eax,4)
 ; X86-AVX1-NEXT:    popl %esi
 ; X86-AVX1-NEXT:    retl
 ;
@@ -886,14 +886,14 @@ define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b,
 ; X86-SSE-LABEL: mul_2xi8_sext:
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    movl c, %esi
-; X86-SSE-NEXT:    movzwl (%edx,%ecx), %edx
-; X86-SSE-NEXT:    movd %edx, %xmm0
-; X86-SSE-NEXT:    movzwl (%eax,%ecx), %eax
-; X86-SSE-NEXT:    movd %eax, %xmm1
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT:    movl c, %ecx
+; X86-SSE-NEXT:    movzwl (%esi,%eax), %esi
+; X86-SSE-NEXT:    movd %esi, %xmm0
+; X86-SSE-NEXT:    movzwl (%edx,%eax), %edx
+; X86-SSE-NEXT:    movd %edx, %xmm1
 ; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X86-SSE-NEXT:    psraw $8, %xmm0
 ; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -901,7 +901,7 @@ define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b,
 ; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
 ; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7]
 ; X86-SSE-NEXT:    psrad $16, %xmm0
-; X86-SSE-NEXT:    movq %xmm0, (%esi,%ecx,4)
+; X86-SSE-NEXT:    movq %xmm0, (%ecx,%eax,4)
 ; X86-SSE-NEXT:    popl %esi
 ; X86-SSE-NEXT:    retl
 ;
@@ -979,22 +979,22 @@ define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonl
 ; X86-SSE-LABEL: mul_2xi8_sext_zext:
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    movl c, %esi
-; X86-SSE-NEXT:    movzwl (%edx,%ecx), %edx
-; X86-SSE-NEXT:    movd %edx, %xmm0
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT:    movl c, %ecx
+; X86-SSE-NEXT:    movzwl (%esi,%eax), %esi
+; X86-SSE-NEXT:    movd %esi, %xmm0
 ; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
 ; X86-SSE-NEXT:    psrad $24, %xmm0
-; X86-SSE-NEXT:    movzwl (%eax,%ecx), %eax
-; X86-SSE-NEXT:    movd %eax, %xmm1
+; X86-SSE-NEXT:    movzwl (%edx,%eax), %edx
+; X86-SSE-NEXT:    movd %edx, %xmm1
 ; X86-SSE-NEXT:    pxor %xmm2, %xmm2
 ; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
 ; X86-SSE-NEXT:    pmaddwd %xmm0, %xmm1
-; X86-SSE-NEXT:    movq %xmm1, (%esi,%ecx,4)
+; X86-SSE-NEXT:    movq %xmm1, (%ecx,%eax,4)
 ; X86-SSE-NEXT:    popl %esi
 ; X86-SSE-NEXT:    retl
 ;
@@ -1151,10 +1151,10 @@ define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readon
 ; X86-SSE-LABEL: mul_2xi16_sext_zext:
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    movl c, %esi
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT:    movl c, %ecx
 ; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
 ; X86-SSE-NEXT:    psrad $16, %xmm0
@@ -1166,7 +1166,7 @@ define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readon
 ; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; X86-SSE-NEXT:    pmuludq %xmm2, %xmm0
 ; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-SSE-NEXT:    movq %xmm1, (%esi,%ecx,4)
+; X86-SSE-NEXT:    movq %xmm1, (%ecx,%eax,4)
 ; X86-SSE-NEXT:    popl %esi
 ; X86-SSE-NEXT:    retl
 ;
@@ -1240,56 +1240,56 @@ define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %
 ; X86-SSE-LABEL: mul_16xi16_sext:
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    movl c, %esi
-; X86-SSE-NEXT:    movdqu (%edx,%ecx), %xmm0
-; X86-SSE-NEXT:    movdqu 16(%edx,%ecx), %xmm1
-; X86-SSE-NEXT:    movdqu (%eax,%ecx), %xmm2
-; X86-SSE-NEXT:    movdqu 16(%eax,%ecx), %xmm3
-; X86-SSE-NEXT:    movdqa %xmm2, %xmm4
-; X86-SSE-NEXT:    pmulhw %xmm0, %xmm4
-; X86-SSE-NEXT:    pmullw %xmm0, %xmm2
-; X86-SSE-NEXT:    movdqa %xmm2, %xmm0
-; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; X86-SSE-NEXT:    movdqa %xmm3, %xmm4
-; X86-SSE-NEXT:    pmulhw %xmm1, %xmm4
-; X86-SSE-NEXT:    pmullw %xmm1, %xmm3
-; X86-SSE-NEXT:    movdqa %xmm3, %xmm1
-; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; X86-SSE-NEXT:    movdqu %xmm3, 32(%esi,%ecx,4)
-; X86-SSE-NEXT:    movdqu %xmm1, 48(%esi,%ecx,4)
-; X86-SSE-NEXT:    movdqu %xmm2, (%esi,%ecx,4)
-; X86-SSE-NEXT:    movdqu %xmm0, 16(%esi,%ecx,4)
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT:    movl c, %ecx
+; X86-SSE-NEXT:    movdqu (%esi,%eax), %xmm2
+; X86-SSE-NEXT:    movdqu 16(%esi,%eax), %xmm3
+; X86-SSE-NEXT:    movdqu (%edx,%eax), %xmm0
+; X86-SSE-NEXT:    movdqu 16(%edx,%eax), %xmm1
+; X86-SSE-NEXT:    movdqa %xmm0, %xmm4
+; X86-SSE-NEXT:    pmulhw %xmm2, %xmm4
+; X86-SSE-NEXT:    pmullw %xmm2, %xmm0
+; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; X86-SSE-NEXT:    movdqa %xmm1, %xmm4
+; X86-SSE-NEXT:    pmulhw %xmm3, %xmm4
+; X86-SSE-NEXT:    pmullw %xmm3, %xmm1
+; X86-SSE-NEXT:    movdqa %xmm1, %xmm3
+; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; X86-SSE-NEXT:    movdqu %xmm1, 32(%ecx,%eax,4)
+; X86-SSE-NEXT:    movdqu %xmm3, 48(%ecx,%eax,4)
+; X86-SSE-NEXT:    movdqu %xmm0, (%ecx,%eax,4)
+; X86-SSE-NEXT:    movdqu %xmm2, 16(%ecx,%eax,4)
 ; X86-SSE-NEXT:    popl %esi
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: mul_16xi16_sext:
 ; X86-AVX1:       # %bb.0: # %entry
 ; X86-AVX1-NEXT:    pushl %esi
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX1-NEXT:    movl c, %esi
-; X86-AVX1-NEXT:    vpmovsxwd 24(%edx,%ecx), %xmm0
-; X86-AVX1-NEXT:    vpmovsxwd 16(%edx,%ecx), %xmm1
-; X86-AVX1-NEXT:    vpmovsxwd 8(%edx,%ecx), %xmm2
-; X86-AVX1-NEXT:    vpmovsxwd (%edx,%ecx), %xmm3
-; X86-AVX1-NEXT:    vpmovsxwd 24(%eax,%ecx), %xmm4
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-AVX1-NEXT:    movl c, %ecx
+; X86-AVX1-NEXT:    vpmovsxwd 24(%esi,%eax), %xmm0
+; X86-AVX1-NEXT:    vpmovsxwd 16(%esi,%eax), %xmm1
+; X86-AVX1-NEXT:    vpmovsxwd 8(%esi,%eax), %xmm2
+; X86-AVX1-NEXT:    vpmovsxwd (%esi,%eax), %xmm3
+; X86-AVX1-NEXT:    vpmovsxwd 24(%edx,%eax), %xmm4
 ; X86-AVX1-NEXT:    vpmulld %xmm0, %xmm4, %xmm0
-; X86-AVX1-NEXT:    vpmovsxwd 16(%eax,%ecx), %xmm4
+; X86-AVX1-NEXT:    vpmovsxwd 16(%edx,%eax), %xmm4
 ; X86-AVX1-NEXT:    vpmulld %xmm1, %xmm4, %xmm1
-; X86-AVX1-NEXT:    vpmovsxwd 8(%eax,%ecx), %xmm4
+; X86-AVX1-NEXT:    vpmovsxwd 8(%edx,%eax), %xmm4
 ; X86-AVX1-NEXT:    vpmulld %xmm2, %xmm4, %xmm2
-; X86-AVX1-NEXT:    vpmovsxwd (%eax,%ecx), %xmm4
+; X86-AVX1-NEXT:    vpmovsxwd (%edx,%eax), %xmm4
 ; X86-AVX1-NEXT:    vpmulld %xmm3, %xmm4, %xmm3
-; X86-AVX1-NEXT:    vmovdqu %xmm0, 48(%esi,%ecx,4)
-; X86-AVX1-NEXT:    vmovdqu %xmm1, 32(%esi,%ecx,4)
-; X86-AVX1-NEXT:    vmovdqu %xmm2, 16(%esi,%ecx,4)
-; X86-AVX1-NEXT:    vmovdqu %xmm3, (%esi,%ecx,4)
+; X86-AVX1-NEXT:    vmovdqu %xmm0, 48(%ecx,%eax,4)
+; X86-AVX1-NEXT:    vmovdqu %xmm1, 32(%ecx,%eax,4)
+; X86-AVX1-NEXT:    vmovdqu %xmm2, 16(%ecx,%eax,4)
+; X86-AVX1-NEXT:    vmovdqu %xmm3, (%ecx,%eax,4)
 ; X86-AVX1-NEXT:    popl %esi
 ; X86-AVX1-NEXT:    retl
 ;

diff  --git a/llvm/test/CodeGen/X86/smax.ll b/llvm/test/CodeGen/X86/smax.ll
index 31d0822f8090a..1cc525d6ad9bd 100644
--- a/llvm/test/CodeGen/X86/smax.ll
+++ b/llvm/test/CodeGen/X86/smax.ll
@@ -158,50 +158,50 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    pushl %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl %ebx, %edx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    cmoval %edx, %eax
-; X86-NEXT:    cmpl %esi, %ecx
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    cmoval %edx, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    cmpl %ecx, %esi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    cmoval %esi, %eax
+; X86-NEXT:    cmpl %edx, %edi
+; X86-NEXT:    movl %ecx, %ebp
+; X86-NEXT:    cmoval %esi, %ebp
 ; X86-NEXT:    cmovel %eax, %ebp
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    cmoval %ecx, %eax
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    cmoval %edi, %eax
 ; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    sbbl %edi, %ecx
-; X86-NEXT:    cmovll {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmovll {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    xorl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    sbbl %ebx, %edi
+; X86-NEXT:    cmovll {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmovll {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ebx, %edi
 ; X86-NEXT:    xorl %eax, %edi
-; X86-NEXT:    orl %ecx, %edi
-; X86-NEXT:    cmovel %ebp, %ebx
-; X86-NEXT:    cmovel (%esp), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmpl %eax, %edi
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    cmoval %edi, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    xorl %esi, %ebx
+; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:    cmovel %ebp, %ecx
+; X86-NEXT:    cmovel (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    cmpl %esi, %ebx
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    cmoval %ebx, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    cmpl %edx, %ebp
-; X86-NEXT:    cmovgl %edi, %eax
-; X86-NEXT:    cmovel %ecx, %eax
-; X86-NEXT:    cmovgl %ebp, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %edx, 12(%ecx)
-; X86-NEXT:    movl %eax, 8(%ecx)
-; X86-NEXT:    movl %esi, 4(%ecx)
-; X86-NEXT:    movl %ebx, (%ecx)
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    cmpl %eax, %ebp
+; X86-NEXT:    cmovgl %ebx, %esi
+; X86-NEXT:    cmovel %edi, %esi
+; X86-NEXT:    cmovgl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %eax, 12(%edi)
+; X86-NEXT:    movl %esi, 8(%edi)
+; X86-NEXT:    movl %edx, 4(%edi)
+; X86-NEXT:    movl %ecx, (%edi)
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    addl $4, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi

diff  --git a/llvm/test/CodeGen/X86/smin.ll b/llvm/test/CodeGen/X86/smin.ll
index 70391534f544c..bbaf0c3054a89 100644
--- a/llvm/test/CodeGen/X86/smin.ll
+++ b/llvm/test/CodeGen/X86/smin.ll
@@ -161,47 +161,46 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmpl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    cmpl %ecx, %edi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    cmovbl %edi, %eax
+; X86-NEXT:    cmpl %esi, %ebp
 ; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    cmovbl %eax, %ebx
-; X86-NEXT:    cmpl %esi, %edi
-; X86-NEXT:    movl %ecx, %ebp
-; X86-NEXT:    cmovbl %eax, %ebp
-; X86-NEXT:    cmovel %ebx, %ebp
+; X86-NEXT:    cmovbl %edi, %ebx
+; X86-NEXT:    cmovel %eax, %ebx
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    cmovbl %edi, %eax
+; X86-NEXT:    cmovbl %ebp, %eax
 ; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    cmpl %edx, %edi
 ; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    cmovbl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:    sbbl %eax, %edi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    sbbl %edi, %ebp
 ; X86-NEXT:    cmovll {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    cmovll {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:    xorl %eax, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    xorl %edx, %ebx
-; X86-NEXT:    orl %edi, %ebx
-; X86-NEXT:    cmovel %ebp, %ecx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    xorl %edi, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    xorl %edx, %eax
+; X86-NEXT:    orl %ebp, %eax
+; X86-NEXT:    cmovel %ebx, %ecx
 ; X86-NEXT:    cmovel (%esp), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmpl %eax, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl %edi, %eax
 ; X86-NEXT:    cmovll {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    cmovll %edi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %eax, 12(%edi)
-; X86-NEXT:    movl %edx, 8(%edi)
-; X86-NEXT:    movl %esi, 4(%edi)
-; X86-NEXT:    movl %ecx, (%edi)
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    cmovll %eax, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %edi, 12(%eax)
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi

diff  --git a/llvm/test/CodeGen/X86/smul_fix.ll b/llvm/test/CodeGen/X86/smul_fix.ll
index c34730b1125a5..45d089d28f1d6 100644
--- a/llvm/test/CodeGen/X86/smul_fix.ll
+++ b/llvm/test/CodeGen/X86/smul_fix.ll
@@ -51,36 +51,36 @@ define i64 @func2(i64 %x, i64 %y) {
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %edi, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    adcl %esi, %edx
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    addl %edx, %esi
-; X86-NEXT:    movl %esi, %ebp
+; X86-NEXT:    adcl %edi, %edx
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    addl %edx, %edi
+; X86-NEXT:    movl %edi, %ebp
 ; X86-NEXT:    subl %ecx, %ebp
-; X86-NEXT:    testl %edi, %edi
-; X86-NEXT:    cmovnsl %esi, %ebp
+; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    cmovnsl %edi, %ebp
 ; X86-NEXT:    movl %ebp, %edx
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    cmovnsl %ebp, %edx
 ; X86-NEXT:    shldl $30, %eax, %edx
-; X86-NEXT:    shldl $30, %ebx, %eax
+; X86-NEXT:    shldl $30, %esi, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    popl %edi
@@ -165,28 +165,28 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    imull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    shldl $30, %eax, %ebp
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    shldl $30, %eax, %esi
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    imull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    shldl $30, %eax, %ebx
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    imull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    shldl $30, %eax, %edi
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    shldl $30, %eax, %ebp
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    imull {{[0-9]+}}(%esp)
 ; X86-NEXT:    shldl $30, %eax, %edx
 ; X86-NEXT:    movl %edx, 12(%ecx)
-; X86-NEXT:    movl %edi, 8(%ecx)
+; X86-NEXT:    movl %ebp, 8(%ecx)
 ; X86-NEXT:    movl %ebx, 4(%ecx)
-; X86-NEXT:    movl %ebp, (%ecx)
+; X86-NEXT:    movl %esi, (%ecx)
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -318,26 +318,26 @@ define i64 @func7(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    addl %edx, %ebx
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    adcl %edi, %edx
-; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    movl %ebp, %edi
 ; X86-NEXT:    imull {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    addl %edx, %edi
 ; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    subl %ebp, %ebx
-; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    subl %esi, %ebx
+; X86-NEXT:    testl %ebp, %ebp
 ; X86-NEXT:    cmovnsl %edi, %ebx
 ; X86-NEXT:    movl %ebx, %edx
 ; X86-NEXT:    subl %ecx, %edx
@@ -368,31 +368,32 @@ define i64 @func8(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %edx, %ebx
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    imull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    mull %esi
+; X86-NEXT:    addl %edx, %ebp
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull %edi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    adcl %edi, %edx
-; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    adcl %ebx, %edx
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ebp, %esi
+; X86-NEXT:    subl %esi, %ecx
+; X86-NEXT:    movl %edi, %esi
 ; X86-NEXT:    sbbl $0, %esi
 ; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    cmovnsl %ebp, %esi
+; X86-NEXT:    cmovnsl %edi, %esi
 ; X86-NEXT:    cmovnsl %edx, %ecx
 ; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %edi

diff  --git a/llvm/test/CodeGen/X86/smul_fix_sat.ll b/llvm/test/CodeGen/X86/smul_fix_sat.ll
index 74123e090f21f..44306c8f960de 100644
--- a/llvm/test/CodeGen/X86/smul_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/smul_fix_sat.ll
@@ -60,69 +60,75 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %ebx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    imull %esi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    addl %edi, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    imull %ebx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    adcl %edi, %edx
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    addl %ebx, %edx
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ecx, %esi
+; X86-NEXT:    adcl %ebp, %edx
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    addl %edi, %edx
+; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %esi, %ebx
-; X86-NEXT:    sbbl $0, %ebx
+; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    sbbl $0, %ebp
 ; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    cmovnsl %esi, %ebx
+; X86-NEXT:    cmovnsl %ebx, %ebp
 ; X86-NEXT:    cmovnsl %edx, %edi
-; X86-NEXT:    movl %edi, %ebp
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    sbbl $0, %esi
+; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ebp, %edx
+; X86-NEXT:    sbbl $0, %edx
 ; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    cmovnsl %ebx, %esi
-; X86-NEXT:    cmovnsl %edi, %ebp
-; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    cmovnsl %ebp, %edx
+; X86-NEXT:    cmovnsl %edi, %ecx
+; X86-NEXT:    testl %edx, %edx
 ; X86-NEXT:    setg %bl
 ; X86-NEXT:    sete %bh
-; X86-NEXT:    cmpl $2, %ebp
-; X86-NEXT:    setae %dl
-; X86-NEXT:    andb %bh, %dl
-; X86-NEXT:    orb %bl, %dl
-; X86-NEXT:    shrdl $2, %eax, %ecx
-; X86-NEXT:    shrdl $2, %ebp, %eax
-; X86-NEXT:    testb %dl, %dl
+; X86-NEXT:    cmpl $2, %ecx
+; X86-NEXT:    setae %al
+; X86-NEXT:    andb %bh, %al
+; X86-NEXT:    orb %bl, %al
+; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X86-NEXT:    shrdl $2, %esi, %ebx
+; X86-NEXT:    shrdl $2, %ecx, %esi
+; X86-NEXT:    testb %al, %al
 ; X86-NEXT:    movl $2147483647, %edi # imm = 0x7FFFFFFF
-; X86-NEXT:    cmovel %eax, %edi
+; X86-NEXT:    cmovel %esi, %edi
 ; X86-NEXT:    movl $-1, %eax
-; X86-NEXT:    cmovnel %eax, %ecx
-; X86-NEXT:    cmpl $-1, %esi
-; X86-NEXT:    setl %al
+; X86-NEXT:    cmovnel %eax, %ebx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    cmpl $-1, %edx
+; X86-NEXT:    setl %bl
 ; X86-NEXT:    sete %dl
-; X86-NEXT:    cmpl $-2, %ebp
-; X86-NEXT:    setb %ah
-; X86-NEXT:    andb %dl, %ah
+; X86-NEXT:    cmpl $-2, %ecx
+; X86-NEXT:    setb %cl
+; X86-NEXT:    andb %dl, %cl
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    orb %al, %ah
-; X86-NEXT:    cmovnel %edx, %ecx
+; X86-NEXT:    orb %bl, %cl
+; X86-NEXT:    cmovnel %edx, %eax
 ; X86-NEXT:    movl $-2147483648, %edx # imm = 0x80000000
 ; X86-NEXT:    cmovel %edi, %edx
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    addl $4, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -403,16 +409,16 @@ define i64 @func5(i64 %x, i64 %y) {
 ; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ebp, %ebx
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ebp, %esi
+; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %ebp
+; X86-NEXT:    adcl %ebx, %ebp
 ; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -474,19 +480,19 @@ define i4 @func6(i4 %x, i4 %y) nounwind {
 ;
 ; X86-LABEL: func6:
 ; X86:       # %bb.0:
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %dl
-; X86-NEXT:    shlb $4, %dl
-; X86-NEXT:    sarb $4, %dl
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    shlb $4, %cl
+; X86-NEXT:    sarb $4, %cl
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    shlb $4, %al
-; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    movb %al, %ah
-; X86-NEXT:    xorb %dl, %ah
-; X86-NEXT:    sets %cl
-; X86-NEXT:    addl $127, %ecx
-; X86-NEXT:    imulb %dl
+; X86-NEXT:    xorb %cl, %ah
+; X86-NEXT:    sets %dl
+; X86-NEXT:    addl $127, %edx
+; X86-NEXT:    imulb %cl
 ; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    cmovol %ecx, %eax
+; X86-NEXT:    cmovol %edx, %eax
 ; X86-NEXT:    sarb $4, %al
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl
@@ -555,44 +561,44 @@ define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    xorl %edi, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sets %bl
-; X86-NEXT:    addl $2147483647, %ebx # imm = 0x7FFFFFFF
+; X86-NEXT:    sets %al
+; X86-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
 ; X86-NEXT:    imull %edi, %ecx
-; X86-NEXT:    cmovol %ebx, %ecx
-; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    cmovol %eax, %ecx
+; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    xorl %ebp, %edi
+; X86-NEXT:    xorl %ebx, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sets %bl
-; X86-NEXT:    addl $2147483647, %ebx # imm = 0x7FFFFFFF
-; X86-NEXT:    imull %ebp, %edx
-; X86-NEXT:    cmovol %ebx, %edx
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    movl %edi, %ebp
-; X86-NEXT:    xorl %esi, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    sets %bl
-; X86-NEXT:    addl $2147483647, %ebx # imm = 0x7FFFFFFF
+; X86-NEXT:    sets %al
+; X86-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    imull %ebx, %edx
+; X86-NEXT:    cmovol %eax, %edx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    xorl %esi, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    sets %al
+; X86-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
 ; X86-NEXT:    imull %esi, %edi
-; X86-NEXT:    cmovol %ebx, %edi
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    movl %ebp, %esi
-; X86-NEXT:    xorl %eax, %esi
-; X86-NEXT:    sets %bl
-; X86-NEXT:    addl $2147483647, %ebx # imm = 0x7FFFFFFF
-; X86-NEXT:    imull %eax, %ebp
-; X86-NEXT:    cmovol %ebx, %ebp
+; X86-NEXT:    cmovol %eax, %edi
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    xorl %ebp, %esi
+; X86-NEXT:    sets %al
+; X86-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    imull %ebp, %ebx
+; X86-NEXT:    cmovol %eax, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ebp, 12(%eax)
+; X86-NEXT:    movl %ebx, 12(%eax)
 ; X86-NEXT:    movl %edi, 8(%eax)
 ; X86-NEXT:    movl %edx, 4(%eax)
 ; X86-NEXT:    movl %ecx, (%eax)
@@ -625,62 +631,65 @@ define i64 @func7(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %edx, %ebx
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    imull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    mull %esi
+; X86-NEXT:    addl %edx, %ebp
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull %edi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    adcl %edi, %edx
-; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    adcl %ebx, %edx
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ebp, %esi
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    subl %esi, %ebx
+; X86-NEXT:    movl %edi, %esi
 ; X86-NEXT:    sbbl $0, %esi
 ; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    cmovnsl %ebp, %esi
-; X86-NEXT:    cmovnsl %edx, %ecx
-; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    cmovnsl %edi, %esi
+; X86-NEXT:    cmovnsl %edx, %ebx
+; X86-NEXT:    movl %ebx, %edx
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    sbbl $0, %edi
+; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    sbbl $0, %ecx
 ; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    cmovnsl %esi, %edi
-; X86-NEXT:    cmovnsl %ecx, %edx
+; X86-NEXT:    cmovnsl %esi, %ecx
+; X86-NEXT:    cmovnsl %ebx, %edx
 ; X86-NEXT:    testl %edx, %edx
-; X86-NEXT:    setns %cl
-; X86-NEXT:    sets %ch
-; X86-NEXT:    testl %edi, %edi
-; X86-NEXT:    setg %bl
-; X86-NEXT:    sete %bh
-; X86-NEXT:    andb %ch, %bh
-; X86-NEXT:    orb %bl, %bh
+; X86-NEXT:    setns {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    sets %bh
+; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    setg {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    sete %bl
+; X86-NEXT:    andb %bh, %bl
+; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload
 ; X86-NEXT:    movl $2147483647, %esi # imm = 0x7FFFFFFF
 ; X86-NEXT:    cmovnel %esi, %edx
 ; X86-NEXT:    movl $-1, %esi
 ; X86-NEXT:    cmovnel %esi, %eax
-; X86-NEXT:    cmpl $-1, %edi
-; X86-NEXT:    setl %ch
-; X86-NEXT:    sete %bl
-; X86-NEXT:    andb %cl, %bl
+; X86-NEXT:    cmpl $-1, %ecx
+; X86-NEXT:    setl %cl
+; X86-NEXT:    sete %ch
+; X86-NEXT:    andb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Folded Reload
 ; X86-NEXT:    xorl %esi, %esi
-; X86-NEXT:    orb %ch, %bl
+; X86-NEXT:    orb %cl, %ch
 ; X86-NEXT:    cmovnel %esi, %eax
 ; X86-NEXT:    movl $-2147483648, %ecx # imm = 0x80000000
 ; X86-NEXT:    cmovnel %ecx, %edx
+; X86-NEXT:    addl $4, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -714,49 +723,50 @@ define i64 @func8(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %edx, %ebx
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    imull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    mull %esi
+; X86-NEXT:    addl %edx, %ebp
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull %edi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    adcl %edi, %edx
-; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    adcl %ebx, %edx
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ebp, %esi
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    subl %esi, %ebx
+; X86-NEXT:    movl %edi, %esi
 ; X86-NEXT:    sbbl $0, %esi
 ; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    cmovnsl %ebp, %esi
-; X86-NEXT:    cmovnsl %edx, %ecx
-; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    cmovnsl %edi, %esi
+; X86-NEXT:    cmovnsl %edx, %ebx
+; X86-NEXT:    movl %ebx, %edx
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    sbbl $0, %edi
+; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    sbbl $0, %ecx
 ; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    cmovnsl %esi, %edi
-; X86-NEXT:    cmovnsl %ecx, %edx
+; X86-NEXT:    cmovnsl %esi, %ecx
+; X86-NEXT:    cmovnsl %ebx, %edx
 ; X86-NEXT:    shrdl $31, %edx, %eax
-; X86-NEXT:    shrdl $31, %edi, %edx
-; X86-NEXT:    cmpl $1073741824, %edi # imm = 0x40000000
-; X86-NEXT:    movl $2147483647, %ecx # imm = 0x7FFFFFFF
-; X86-NEXT:    cmovgel %ecx, %edx
-; X86-NEXT:    movl $-1, %ecx
-; X86-NEXT:    cmovgel %ecx, %eax
-; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    cmpl $-1073741824, %edi # imm = 0xC0000000
-; X86-NEXT:    cmovll %ecx, %eax
+; X86-NEXT:    shrdl $31, %ecx, %edx
+; X86-NEXT:    cmpl $1073741824, %ecx # imm = 0x40000000
+; X86-NEXT:    movl $2147483647, %esi # imm = 0x7FFFFFFF
+; X86-NEXT:    cmovgel %esi, %edx
+; X86-NEXT:    movl $-1, %esi
+; X86-NEXT:    cmovgel %esi, %eax
+; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    cmpl $-1073741824, %ecx # imm = 0xC0000000
+; X86-NEXT:    cmovll %esi, %eax
 ; X86-NEXT:    movl $-2147483648, %ecx # imm = 0x80000000
 ; X86-NEXT:    cmovll %ecx, %edx
 ; X86-NEXT:    popl %esi

diff  --git a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
index 7e90650b500d3..f3aa44f59ba54 100644
--- a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
@@ -53,23 +53,23 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, i128* %res) {
 ; X64-NEXT:    adcq $0, %rbx
 ; X64-NEXT:    movq %r15, %rax
 ; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rdx, %r14
-; X64-NEXT:    movq %rax, %r15
-; X64-NEXT:    addq %rsi, %r15
-; X64-NEXT:    adcq %rbx, %r14
+; X64-NEXT:    movq %rdx, %r15
+; X64-NEXT:    movq %rax, %r14
+; X64-NEXT:    addq %rsi, %r14
+; X64-NEXT:    adcq %rbx, %r15
 ; X64-NEXT:    setb %al
 ; X64-NEXT:    movzbl %al, %esi
 ; X64-NEXT:    movq %r10, %rax
 ; X64-NEXT:    mulq %rcx
-; X64-NEXT:    addq %r14, %rax
+; X64-NEXT:    addq %r15, %rax
 ; X64-NEXT:    adcq %rsi, %rdx
 ; X64-NEXT:    addq %r11, %rax
 ; X64-NEXT:    adcq %rdi, %rdx
-; X64-NEXT:    movq %r15, 8(%r8)
-; X64-NEXT:    sarq $63, %r15
-; X64-NEXT:    xorq %r15, %rdx
-; X64-NEXT:    xorq %rax, %r15
-; X64-NEXT:    orq %rdx, %r15
+; X64-NEXT:    movq %r14, 8(%r8)
+; X64-NEXT:    sarq $63, %r14
+; X64-NEXT:    xorq %r14, %rdx
+; X64-NEXT:    xorq %rax, %r14
+; X64-NEXT:    orq %rdx, %r14
 ; X64-NEXT:    setne %al
 ; X64-NEXT:    movq %r9, (%r8)
 ; X64-NEXT:    popq %rbx
@@ -225,30 +225,30 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, i128* %res) {
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    sarl $31, %ebx
-; X86-NEXT:    movl %ebx, %ecx
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    imull %ebx, %esi
-; X86-NEXT:    addl %edx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %ebx, %ecx
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    imull %ebx, %ecx
+; X86-NEXT:    addl %edx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    addl %edi, %edx
 ; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    imull %edi, %ebp
+; X86-NEXT:    imull %ecx, %ebp
 ; X86-NEXT:    addl %edx, %ebp
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ecx
@@ -258,6 +258,7 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, i128* %res) {
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    addl %ecx, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
@@ -511,28 +512,28 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, i256* %res) {
 ; X64-NEXT:    imulq %rsi, %rcx
 ; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rax, %r9
+; X64-NEXT:    movq %rax, %rbx
 ; X64-NEXT:    addq %rcx, %rdx
 ; X64-NEXT:    imulq %rdi, %r8
 ; X64-NEXT:    addq %rdx, %r8
-; X64-NEXT:    movq %rdi, %rbp
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 ## 8-byte Reload
-; X64-NEXT:    imulq %r10, %rbp
+; X64-NEXT:    movq %rdi, %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 ## 8-byte Reload
+; X64-NEXT:    imulq %r9, %rcx
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx ## 8-byte Reload
-; X64-NEXT:    mulq %rbx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp ## 8-byte Reload
+; X64-NEXT:    mulq %rbp
 ; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    addq %rbp, %rdx
+; X64-NEXT:    addq %rcx, %rdx
 ; X64-NEXT:    movq %rdi, %rcx
-; X64-NEXT:    imulq %rbx, %rcx
-; X64-NEXT:    movq %rbx, %rax
+; X64-NEXT:    imulq %rbp, %rcx
+; X64-NEXT:    movq %rbp, %rax
 ; X64-NEXT:    addq %rdx, %rcx
-; X64-NEXT:    addq %r9, %r13
+; X64-NEXT:    addq %rbx, %r13
 ; X64-NEXT:    adcq %r8, %rcx
 ; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rdx, %rbx
 ; X64-NEXT:    movq %rax, %r14
-; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    movq %r9, %rax
 ; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rax, %r8
 ; X64-NEXT:    addq %rbx, %r8
@@ -1302,24 +1303,24 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, i256* %res) {
 ; X86-NEXT:    adcl %edx, %edi
 ; X86-NEXT:    addl %esi, %ebp
 ; X86-NEXT:    adcl %ecx, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    imull %ebx, %eax
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    imull %ebx, %ecx
-; X86-NEXT:    addl %eax, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    imull %ebx, %eax
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    imull %ebx, %eax
-; X86-NEXT:    addl %edx, %eax
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    imull %ebx, %ecx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    imull %ebx, %ecx
+; X86-NEXT:    addl %edx, %ecx
 ; X86-NEXT:    movl (%esp), %edx ## 4-byte Reload
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, (%esp) ## 4-byte Spill
-; X86-NEXT:    adcl %ecx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %esi
@@ -1462,16 +1463,15 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, i256* %res) {
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl %edi, %eax
 ; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    imull %ebp, %esi
-; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    addl %esi, %edx
 ; X86-NEXT:    imull %ebp, %ecx
-; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    imull %ebp, %edi
+; X86-NEXT:    addl %edx, %edi
 ; X86-NEXT:    movl %ebp, %ecx
 ; X86-NEXT:    imull {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ebp, %eax

diff  --git a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
index a8d9b6bf6ec27..9e4902830a34d 100644
--- a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
@@ -3136,34 +3136,34 @@ define void @test_MM_TRANSPOSE4_PS(<4 x float>* %a0, <4 x float>* %a1, <4 x floa
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
-; X86-SSE-NEXT:    movaps (%esi), %xmm0 # encoding: [0x0f,0x28,0x06]
-; X86-SSE-NEXT:    movaps (%edx), %xmm1 # encoding: [0x0f,0x28,0x0a]
-; X86-SSE-NEXT:    movaps (%ecx), %xmm2 # encoding: [0x0f,0x28,0x11]
+; X86-SSE-NEXT:    movaps (%esi), %xmm1 # encoding: [0x0f,0x28,0x0e]
+; X86-SSE-NEXT:    movaps (%edx), %xmm2 # encoding: [0x0f,0x28,0x12]
+; X86-SSE-NEXT:    movaps (%ecx), %xmm0 # encoding: [0x0f,0x28,0x01]
 ; X86-SSE-NEXT:    movaps (%eax), %xmm3 # encoding: [0x0f,0x28,0x18]
-; X86-SSE-NEXT:    movaps %xmm0, %xmm4 # encoding: [0x0f,0x28,0xe0]
-; X86-SSE-NEXT:    unpcklps %xmm1, %xmm4 # encoding: [0x0f,0x14,0xe1]
-; X86-SSE-NEXT:    # xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; X86-SSE-NEXT:    movaps %xmm2, %xmm5 # encoding: [0x0f,0x28,0xea]
+; X86-SSE-NEXT:    movaps %xmm1, %xmm4 # encoding: [0x0f,0x28,0xe1]
+; X86-SSE-NEXT:    unpcklps %xmm2, %xmm4 # encoding: [0x0f,0x14,0xe2]
+; X86-SSE-NEXT:    # xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; X86-SSE-NEXT:    movaps %xmm0, %xmm5 # encoding: [0x0f,0x28,0xe8]
 ; X86-SSE-NEXT:    unpcklps %xmm3, %xmm5 # encoding: [0x0f,0x14,0xeb]
 ; X86-SSE-NEXT:    # xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; X86-SSE-NEXT:    unpckhps %xmm1, %xmm0 # encoding: [0x0f,0x15,0xc1]
-; X86-SSE-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X86-SSE-NEXT:    unpckhps %xmm3, %xmm2 # encoding: [0x0f,0x15,0xd3]
-; X86-SSE-NEXT:    # xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; X86-SSE-NEXT:    movaps %xmm4, %xmm1 # encoding: [0x0f,0x28,0xcc]
-; X86-SSE-NEXT:    movlhps %xmm5, %xmm1 # encoding: [0x0f,0x16,0xcd]
-; X86-SSE-NEXT:    # xmm1 = xmm1[0],xmm5[0]
+; X86-SSE-NEXT:    unpckhps %xmm2, %xmm1 # encoding: [0x0f,0x15,0xca]
+; X86-SSE-NEXT:    # xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X86-SSE-NEXT:    unpckhps %xmm3, %xmm0 # encoding: [0x0f,0x15,0xc3]
+; X86-SSE-NEXT:    # xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; X86-SSE-NEXT:    movaps %xmm4, %xmm2 # encoding: [0x0f,0x28,0xd4]
+; X86-SSE-NEXT:    movlhps %xmm5, %xmm2 # encoding: [0x0f,0x16,0xd5]
+; X86-SSE-NEXT:    # xmm2 = xmm2[0],xmm5[0]
 ; X86-SSE-NEXT:    movhlps %xmm4, %xmm5 # encoding: [0x0f,0x12,0xec]
 ; X86-SSE-NEXT:    # xmm5 = xmm4[1],xmm5[1]
-; X86-SSE-NEXT:    movaps %xmm0, %xmm3 # encoding: [0x0f,0x28,0xd8]
-; X86-SSE-NEXT:    movlhps %xmm2, %xmm3 # encoding: [0x0f,0x16,0xda]
-; X86-SSE-NEXT:    # xmm3 = xmm3[0],xmm2[0]
-; X86-SSE-NEXT:    movhlps %xmm0, %xmm2 # encoding: [0x0f,0x12,0xd0]
-; X86-SSE-NEXT:    # xmm2 = xmm0[1],xmm2[1]
-; X86-SSE-NEXT:    movaps %xmm1, (%esi) # encoding: [0x0f,0x29,0x0e]
+; X86-SSE-NEXT:    movaps %xmm1, %xmm3 # encoding: [0x0f,0x28,0xd9]
+; X86-SSE-NEXT:    movlhps %xmm0, %xmm3 # encoding: [0x0f,0x16,0xd8]
+; X86-SSE-NEXT:    # xmm3 = xmm3[0],xmm0[0]
+; X86-SSE-NEXT:    movhlps %xmm1, %xmm0 # encoding: [0x0f,0x12,0xc1]
+; X86-SSE-NEXT:    # xmm0 = xmm1[1],xmm0[1]
+; X86-SSE-NEXT:    movaps %xmm2, (%esi) # encoding: [0x0f,0x29,0x16]
 ; X86-SSE-NEXT:    movaps %xmm5, (%edx) # encoding: [0x0f,0x29,0x2a]
 ; X86-SSE-NEXT:    movaps %xmm3, (%ecx) # encoding: [0x0f,0x29,0x19]
-; X86-SSE-NEXT:    movaps %xmm2, (%eax) # encoding: [0x0f,0x29,0x10]
+; X86-SSE-NEXT:    movaps %xmm0, (%eax) # encoding: [0x0f,0x29,0x00]
 ; X86-SSE-NEXT:    popl %esi # encoding: [0x5e]
 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
 ;

diff  --git a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
index eaed72299bce2..76463190ee0cf 100644
--- a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -3343,59 +3343,59 @@ define <2 x i64> @test_mm_set_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a
 ; X86-SSE-NEXT:    punpcklbw %xmm0, %xmm1 # encoding: [0x66,0x0f,0x60,0xc8]
 ; X86-SSE-NEXT:    # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c]
-; X86-SSE-NEXT:    movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0]
-; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x10]
 ; X86-SSE-NEXT:    movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0]
-; X86-SSE-NEXT:    punpcklbw %xmm0, %xmm2 # encoding: [0x66,0x0f,0x60,0xd0]
-; X86-SSE-NEXT:    # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; X86-SSE-NEXT:    punpcklwd %xmm1, %xmm2 # encoding: [0x66,0x0f,0x61,0xd1]
-; X86-SSE-NEXT:    # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x14]
+; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x10]
 ; X86-SSE-NEXT:    movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0]
+; X86-SSE-NEXT:    punpcklbw %xmm2, %xmm0 # encoding: [0x66,0x0f,0x60,0xc2]
+; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X86-SSE-NEXT:    punpcklwd %xmm1, %xmm0 # encoding: [0x66,0x0f,0x61,0xc1]
+; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x14]
+; X86-SSE-NEXT:    movd %eax, %xmm1 # encoding: [0x66,0x0f,0x6e,0xc8]
 ; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x18]
-; X86-SSE-NEXT:    movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8]
-; X86-SSE-NEXT:    punpcklbw %xmm0, %xmm3 # encoding: [0x66,0x0f,0x60,0xd8]
-; X86-SSE-NEXT:    # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; X86-SSE-NEXT:    movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0]
+; X86-SSE-NEXT:    punpcklbw %xmm1, %xmm2 # encoding: [0x66,0x0f,0x60,0xd1]
+; X86-SSE-NEXT:    # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
 ; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x1c]
-; X86-SSE-NEXT:    movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0]
+; X86-SSE-NEXT:    movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8]
 ; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x20]
 ; X86-SSE-NEXT:    movd %eax, %xmm1 # encoding: [0x66,0x0f,0x6e,0xc8]
-; X86-SSE-NEXT:    punpcklbw %xmm0, %xmm1 # encoding: [0x66,0x0f,0x60,0xc8]
-; X86-SSE-NEXT:    # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; X86-SSE-NEXT:    punpcklwd %xmm3, %xmm1 # encoding: [0x66,0x0f,0x61,0xcb]
-; X86-SSE-NEXT:    # xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; X86-SSE-NEXT:    punpckldq %xmm2, %xmm1 # encoding: [0x66,0x0f,0x62,0xca]
-; X86-SSE-NEXT:    # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X86-SSE-NEXT:    punpcklbw %xmm3, %xmm1 # encoding: [0x66,0x0f,0x60,0xcb]
+; X86-SSE-NEXT:    # xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; X86-SSE-NEXT:    punpcklwd %xmm2, %xmm1 # encoding: [0x66,0x0f,0x61,0xca]
+; X86-SSE-NEXT:    # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X86-SSE-NEXT:    punpckldq %xmm0, %xmm1 # encoding: [0x66,0x0f,0x62,0xc8]
+; X86-SSE-NEXT:    # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x24]
 ; X86-SSE-NEXT:    movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0]
 ; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x28]
-; X86-SSE-NEXT:    movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0]
-; X86-SSE-NEXT:    punpcklbw %xmm0, %xmm2 # encoding: [0x66,0x0f,0x60,0xd0]
-; X86-SSE-NEXT:    # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x2c]
-; X86-SSE-NEXT:    movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0]
-; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x30]
 ; X86-SSE-NEXT:    movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8]
 ; X86-SSE-NEXT:    punpcklbw %xmm0, %xmm3 # encoding: [0x66,0x0f,0x60,0xd8]
 ; X86-SSE-NEXT:    # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; X86-SSE-NEXT:    punpcklwd %xmm2, %xmm3 # encoding: [0x66,0x0f,0x61,0xda]
-; X86-SSE-NEXT:    # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x34]
+; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x2c]
 ; X86-SSE-NEXT:    movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0]
-; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x38]
+; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x30]
 ; X86-SSE-NEXT:    movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0]
 ; X86-SSE-NEXT:    punpcklbw %xmm0, %xmm2 # encoding: [0x66,0x0f,0x60,0xd0]
 ; X86-SSE-NEXT:    # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X86-SSE-NEXT:    punpcklwd %xmm3, %xmm2 # encoding: [0x66,0x0f,0x61,0xd3]
+; X86-SSE-NEXT:    # xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x34]
+; X86-SSE-NEXT:    movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0]
+; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x38]
+; X86-SSE-NEXT:    movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8]
+; X86-SSE-NEXT:    punpcklbw %xmm0, %xmm3 # encoding: [0x66,0x0f,0x60,0xd8]
+; X86-SSE-NEXT:    # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
 ; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x3c]
 ; X86-SSE-NEXT:    movd %eax, %xmm4 # encoding: [0x66,0x0f,0x6e,0xe0]
 ; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x40]
 ; X86-SSE-NEXT:    movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0]
 ; X86-SSE-NEXT:    punpcklbw %xmm4, %xmm0 # encoding: [0x66,0x0f,0x60,0xc4]
 ; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; X86-SSE-NEXT:    punpcklwd %xmm2, %xmm0 # encoding: [0x66,0x0f,0x61,0xc2]
-; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-SSE-NEXT:    punpckldq %xmm3, %xmm0 # encoding: [0x66,0x0f,0x62,0xc3]
-; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; X86-SSE-NEXT:    punpcklwd %xmm3, %xmm0 # encoding: [0x66,0x0f,0x61,0xc3]
+; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; X86-SSE-NEXT:    punpckldq %xmm2, %xmm0 # encoding: [0x66,0x0f,0x62,0xc2]
+; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; X86-SSE-NEXT:    punpcklqdq %xmm1, %xmm0 # encoding: [0x66,0x0f,0x6c,0xc1]
 ; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0]
 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
@@ -3775,9 +3775,9 @@ define <2 x i64> @test_mm_set_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4,
 ; X86-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x08]
 ; X86-SSE-NEXT:    movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0]
 ; X86-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x0c]
-; X86-SSE-NEXT:    movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8]
-; X86-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x10]
 ; X86-SSE-NEXT:    movd %eax, %xmm4 # encoding: [0x66,0x0f,0x6e,0xe0]
+; X86-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x10]
+; X86-SSE-NEXT:    movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8]
 ; X86-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x14]
 ; X86-SSE-NEXT:    movd %eax, %xmm5 # encoding: [0x66,0x0f,0x6e,0xe8]
 ; X86-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x18]
@@ -3788,18 +3788,18 @@ define <2 x i64> @test_mm_set_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4,
 ; X86-SSE-NEXT:    movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0]
 ; X86-SSE-NEXT:    punpcklwd %xmm1, %xmm2 # encoding: [0x66,0x0f,0x61,0xd1]
 ; X86-SSE-NEXT:    # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; X86-SSE-NEXT:    punpcklwd %xmm3, %xmm4 # encoding: [0x66,0x0f,0x61,0xe3]
-; X86-SSE-NEXT:    # xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; X86-SSE-NEXT:    punpckldq %xmm2, %xmm4 # encoding: [0x66,0x0f,0x62,0xe2]
-; X86-SSE-NEXT:    # xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; X86-SSE-NEXT:    punpcklwd %xmm4, %xmm3 # encoding: [0x66,0x0f,0x61,0xdc]
+; X86-SSE-NEXT:    # xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; X86-SSE-NEXT:    punpckldq %xmm2, %xmm3 # encoding: [0x66,0x0f,0x62,0xda]
+; X86-SSE-NEXT:    # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
 ; X86-SSE-NEXT:    punpcklwd %xmm5, %xmm6 # encoding: [0x66,0x0f,0x61,0xf5]
 ; X86-SSE-NEXT:    # xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
 ; X86-SSE-NEXT:    punpcklwd %xmm7, %xmm0 # encoding: [0x66,0x0f,0x61,0xc7]
 ; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
 ; X86-SSE-NEXT:    punpckldq %xmm6, %xmm0 # encoding: [0x66,0x0f,0x62,0xc6]
 ; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
-; X86-SSE-NEXT:    punpcklqdq %xmm4, %xmm0 # encoding: [0x66,0x0f,0x6c,0xc4]
-; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm4[0]
+; X86-SSE-NEXT:    punpcklqdq %xmm3, %xmm0 # encoding: [0x66,0x0f,0x6c,0xc3]
+; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm3[0]
 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
 ;
 ; X86-AVX1-LABEL: test_mm_set_epi16:
@@ -4760,59 +4760,59 @@ define <2 x i64> @test_mm_setr_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %
 ; X86-SSE-NEXT:    punpcklbw %xmm0, %xmm1 # encoding: [0x66,0x0f,0x60,0xc8]
 ; X86-SSE-NEXT:    # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x38]
-; X86-SSE-NEXT:    movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0]
-; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x34]
 ; X86-SSE-NEXT:    movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0]
-; X86-SSE-NEXT:    punpcklbw %xmm0, %xmm2 # encoding: [0x66,0x0f,0x60,0xd0]
-; X86-SSE-NEXT:    # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; X86-SSE-NEXT:    punpcklwd %xmm1, %xmm2 # encoding: [0x66,0x0f,0x61,0xd1]
-; X86-SSE-NEXT:    # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x30]
+; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x34]
 ; X86-SSE-NEXT:    movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0]
+; X86-SSE-NEXT:    punpcklbw %xmm2, %xmm0 # encoding: [0x66,0x0f,0x60,0xc2]
+; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X86-SSE-NEXT:    punpcklwd %xmm1, %xmm0 # encoding: [0x66,0x0f,0x61,0xc1]
+; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x30]
+; X86-SSE-NEXT:    movd %eax, %xmm1 # encoding: [0x66,0x0f,0x6e,0xc8]
 ; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x2c]
-; X86-SSE-NEXT:    movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8]
-; X86-SSE-NEXT:    punpcklbw %xmm0, %xmm3 # encoding: [0x66,0x0f,0x60,0xd8]
-; X86-SSE-NEXT:    # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; X86-SSE-NEXT:    movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0]
+; X86-SSE-NEXT:    punpcklbw %xmm1, %xmm2 # encoding: [0x66,0x0f,0x60,0xd1]
+; X86-SSE-NEXT:    # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
 ; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x28]
-; X86-SSE-NEXT:    movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0]
+; X86-SSE-NEXT:    movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8]
 ; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x24]
 ; X86-SSE-NEXT:    movd %eax, %xmm1 # encoding: [0x66,0x0f,0x6e,0xc8]
-; X86-SSE-NEXT:    punpcklbw %xmm0, %xmm1 # encoding: [0x66,0x0f,0x60,0xc8]
-; X86-SSE-NEXT:    # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; X86-SSE-NEXT:    punpcklwd %xmm3, %xmm1 # encoding: [0x66,0x0f,0x61,0xcb]
-; X86-SSE-NEXT:    # xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; X86-SSE-NEXT:    punpckldq %xmm2, %xmm1 # encoding: [0x66,0x0f,0x62,0xca]
-; X86-SSE-NEXT:    # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X86-SSE-NEXT:    punpcklbw %xmm3, %xmm1 # encoding: [0x66,0x0f,0x60,0xcb]
+; X86-SSE-NEXT:    # xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; X86-SSE-NEXT:    punpcklwd %xmm2, %xmm1 # encoding: [0x66,0x0f,0x61,0xca]
+; X86-SSE-NEXT:    # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X86-SSE-NEXT:    punpckldq %xmm0, %xmm1 # encoding: [0x66,0x0f,0x62,0xc8]
+; X86-SSE-NEXT:    # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x20]
 ; X86-SSE-NEXT:    movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0]
 ; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x1c]
-; X86-SSE-NEXT:    movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0]
-; X86-SSE-NEXT:    punpcklbw %xmm0, %xmm2 # encoding: [0x66,0x0f,0x60,0xd0]
-; X86-SSE-NEXT:    # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x18]
-; X86-SSE-NEXT:    movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0]
-; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x14]
 ; X86-SSE-NEXT:    movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8]
 ; X86-SSE-NEXT:    punpcklbw %xmm0, %xmm3 # encoding: [0x66,0x0f,0x60,0xd8]
 ; X86-SSE-NEXT:    # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; X86-SSE-NEXT:    punpcklwd %xmm2, %xmm3 # encoding: [0x66,0x0f,0x61,0xda]
-; X86-SSE-NEXT:    # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x10]
+; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x18]
 ; X86-SSE-NEXT:    movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0]
-; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c]
+; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x14]
 ; X86-SSE-NEXT:    movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0]
 ; X86-SSE-NEXT:    punpcklbw %xmm0, %xmm2 # encoding: [0x66,0x0f,0x60,0xd0]
 ; X86-SSE-NEXT:    # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X86-SSE-NEXT:    punpcklwd %xmm3, %xmm2 # encoding: [0x66,0x0f,0x61,0xd3]
+; X86-SSE-NEXT:    # xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x10]
+; X86-SSE-NEXT:    movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0]
+; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c]
+; X86-SSE-NEXT:    movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8]
+; X86-SSE-NEXT:    punpcklbw %xmm0, %xmm3 # encoding: [0x66,0x0f,0x60,0xd8]
+; X86-SSE-NEXT:    # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
 ; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
 ; X86-SSE-NEXT:    movd %eax, %xmm4 # encoding: [0x66,0x0f,0x6e,0xe0]
 ; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-SSE-NEXT:    movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0]
 ; X86-SSE-NEXT:    punpcklbw %xmm4, %xmm0 # encoding: [0x66,0x0f,0x60,0xc4]
 ; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; X86-SSE-NEXT:    punpcklwd %xmm2, %xmm0 # encoding: [0x66,0x0f,0x61,0xc2]
-; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-SSE-NEXT:    punpckldq %xmm3, %xmm0 # encoding: [0x66,0x0f,0x62,0xc3]
-; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; X86-SSE-NEXT:    punpcklwd %xmm3, %xmm0 # encoding: [0x66,0x0f,0x61,0xc3]
+; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; X86-SSE-NEXT:    punpckldq %xmm2, %xmm0 # encoding: [0x66,0x0f,0x62,0xc2]
+; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; X86-SSE-NEXT:    punpcklqdq %xmm1, %xmm0 # encoding: [0x66,0x0f,0x6c,0xc1]
 ; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0]
 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
@@ -5192,9 +5192,9 @@ define <2 x i64> @test_mm_setr_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4
 ; X86-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x1c]
 ; X86-SSE-NEXT:    movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0]
 ; X86-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x18]
-; X86-SSE-NEXT:    movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8]
-; X86-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x14]
 ; X86-SSE-NEXT:    movd %eax, %xmm4 # encoding: [0x66,0x0f,0x6e,0xe0]
+; X86-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x14]
+; X86-SSE-NEXT:    movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8]
 ; X86-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x10]
 ; X86-SSE-NEXT:    movd %eax, %xmm5 # encoding: [0x66,0x0f,0x6e,0xe8]
 ; X86-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x0c]
@@ -5205,18 +5205,18 @@ define <2 x i64> @test_mm_setr_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4
 ; X86-SSE-NEXT:    movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0]
 ; X86-SSE-NEXT:    punpcklwd %xmm1, %xmm2 # encoding: [0x66,0x0f,0x61,0xd1]
 ; X86-SSE-NEXT:    # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; X86-SSE-NEXT:    punpcklwd %xmm3, %xmm4 # encoding: [0x66,0x0f,0x61,0xe3]
-; X86-SSE-NEXT:    # xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; X86-SSE-NEXT:    punpckldq %xmm2, %xmm4 # encoding: [0x66,0x0f,0x62,0xe2]
-; X86-SSE-NEXT:    # xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; X86-SSE-NEXT:    punpcklwd %xmm4, %xmm3 # encoding: [0x66,0x0f,0x61,0xdc]
+; X86-SSE-NEXT:    # xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; X86-SSE-NEXT:    punpckldq %xmm2, %xmm3 # encoding: [0x66,0x0f,0x62,0xda]
+; X86-SSE-NEXT:    # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
 ; X86-SSE-NEXT:    punpcklwd %xmm5, %xmm6 # encoding: [0x66,0x0f,0x61,0xf5]
 ; X86-SSE-NEXT:    # xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
 ; X86-SSE-NEXT:    punpcklwd %xmm7, %xmm0 # encoding: [0x66,0x0f,0x61,0xc7]
 ; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
 ; X86-SSE-NEXT:    punpckldq %xmm6, %xmm0 # encoding: [0x66,0x0f,0x62,0xc6]
 ; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
-; X86-SSE-NEXT:    punpcklqdq %xmm4, %xmm0 # encoding: [0x66,0x0f,0x6c,0xc4]
-; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm4[0]
+; X86-SSE-NEXT:    punpcklqdq %xmm3, %xmm0 # encoding: [0x66,0x0f,0x6c,0xc3]
+; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm3[0]
 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
 ;
 ; X86-AVX1-LABEL: test_mm_setr_epi16:

diff  --git a/llvm/test/CodeGen/X86/sshl_sat.ll b/llvm/test/CodeGen/X86/sshl_sat.ll
index 040d2ce6429b5..c0d757fdd5dad 100644
--- a/llvm/test/CodeGen/X86/sshl_sat.ll
+++ b/llvm/test/CodeGen/X86/sshl_sat.ll
@@ -76,26 +76,24 @@ define i16 @func2(i8 %x, i8 %y) nounwind {
 ;
 ; X86-LABEL: func2:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    addl %edx, %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movswl %si, %edi
-; X86-NEXT:    sarl %cl, %edi
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    testw %dx, %dx
-; X86-NEXT:    sets %al
-; X86-NEXT:    addl $32767, %eax # imm = 0x7FFF
-; X86-NEXT:    cmpw %di, %dx
-; X86-NEXT:    cmovel %esi, %eax
-; X86-NEXT:    cwtl
+; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl %eax, %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movswl %dx, %esi
+; X86-NEXT:    sarl %cl, %esi
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    testw %ax, %ax
+; X86-NEXT:    sets %cl
+; X86-NEXT:    addl $32767, %ecx # imm = 0x7FFF
+; X86-NEXT:    cmpw %si, %ax
+; X86-NEXT:    cmovel %edx, %ecx
+; X86-NEXT:    movswl %cx, %eax
 ; X86-NEXT:    shrl %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
   %x2 = sext i8 %x to i15
   %y2 = sext i8 %y to i15
@@ -128,28 +126,26 @@ define i16 @func3(i15 %x, i8 %y) nounwind {
 ;
 ; X86-LABEL: func3:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shll $7, %ecx
-; X86-NEXT:    addl %edx, %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movswl %si, %edi
+; X86-NEXT:    addl %eax, %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movswl %dx, %esi
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    sarl %cl, %edi
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    testw %dx, %dx
-; X86-NEXT:    sets %al
-; X86-NEXT:    addl $32767, %eax # imm = 0x7FFF
-; X86-NEXT:    cmpw %di, %dx
-; X86-NEXT:    cmovel %esi, %eax
-; X86-NEXT:    cwtl
+; X86-NEXT:    sarl %cl, %esi
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    testw %ax, %ax
+; X86-NEXT:    sets %cl
+; X86-NEXT:    addl $32767, %ecx # imm = 0x7FFF
+; X86-NEXT:    cmpw %si, %ax
+; X86-NEXT:    cmovel %edx, %ecx
+; X86-NEXT:    movswl %cx, %eax
 ; X86-NEXT:    shrl %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
   %y2 = sext i8 %y to i15
   %y3 = shl i15 %y2, 7
@@ -229,7 +225,6 @@ define i64 @func5(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %eax
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -241,26 +236,25 @@ define i64 @func5(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    testb $32, %cl
 ; X86-NEXT:    cmovnel %ebx, %esi
 ; X86-NEXT:    cmovel %ebx, %edi
-; X86-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    sarl %cl, %edx
 ; X86-NEXT:    movl %esi, %ebx
-; X86-NEXT:    sarl %cl, %ebx
-; X86-NEXT:    movl %esi, %ebp
-; X86-NEXT:    sarl $31, %ebp
+; X86-NEXT:    sarl $31, %ebx
 ; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    cmovel %ebx, %ebp
-; X86-NEXT:    shrdl %cl, %esi, %edi
+; X86-NEXT:    cmovel %edx, %ebx
+; X86-NEXT:    movl %edi, %ebp
+; X86-NEXT:    shrdl %cl, %esi, %ebp
 ; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    cmovnel %ebx, %edi
-; X86-NEXT:    xorl %eax, %ebp
-; X86-NEXT:    xorl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    cmovnel %edx, %ebp
+; X86-NEXT:    xorl %eax, %ebx
+; X86-NEXT:    xorl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    sarl $31, %eax
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    xorl $2147483647, %edx # imm = 0x7FFFFFFF
-; X86-NEXT:    orl %ebp, %edi
+; X86-NEXT:    orl %ebx, %ebp
 ; X86-NEXT:    notl %eax
-; X86-NEXT:    cmovel (%esp), %eax # 4-byte Folded Reload
+; X86-NEXT:    cmovel %edi, %eax
 ; X86-NEXT:    cmovel %esi, %edx
-; X86-NEXT:    addl $4, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx

diff  --git a/llvm/test/CodeGen/X86/sshl_sat_vec.ll b/llvm/test/CodeGen/X86/sshl_sat_vec.ll
index ad92cb9bb7334..a0d175fe9de93 100644
--- a/llvm/test/CodeGen/X86/sshl_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/sshl_sat_vec.ll
@@ -85,28 +85,28 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    shll %cl, %edi
-; X86-NEXT:    movl %edi, %ebp
-; X86-NEXT:    sarl %cl, %ebp
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    shll %cl, %ebp
+; X86-NEXT:    movl %ebp, %edi
+; X86-NEXT:    sarl %cl, %edi
 ; X86-NEXT:    xorl %ebx, %ebx
 ; X86-NEXT:    testl %edx, %edx
 ; X86-NEXT:    sets %bl
 ; X86-NEXT:    addl $2147483647, %ebx # imm = 0x7FFFFFFF
-; X86-NEXT:    cmpl %ebp, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    cmovel %edi, %ebx
-; X86-NEXT:    movl %ebp, %edi
+; X86-NEXT:    cmpl %edi, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    cmovel %ebp, %ebx
+; X86-NEXT:    movl %edi, %ebp
 ; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shll %cl, %edi
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    shll %cl, %ebp
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    sarl %cl, %eax
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    testl %ebp, %ebp
+; X86-NEXT:    testl %edi, %edi
 ; X86-NEXT:    sets %dl
 ; X86-NEXT:    addl $2147483647, %edx # imm = 0x7FFFFFFF
-; X86-NEXT:    cmpl %eax, %ebp
-; X86-NEXT:    cmovel %edi, %edx
+; X86-NEXT:    cmpl %eax, %edi
+; X86-NEXT:    cmovel %ebp, %edx
 ; X86-NEXT:    movl %esi, %edi
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    shll %cl, %edi

diff  --git a/llvm/test/CodeGen/X86/stack-align-memcpy.ll b/llvm/test/CodeGen/X86/stack-align-memcpy.ll
index 0dbab832156f8..e8ffbab21f2aa 100644
--- a/llvm/test/CodeGen/X86/stack-align-memcpy.ll
+++ b/llvm/test/CodeGen/X86/stack-align-memcpy.ll
@@ -27,10 +27,10 @@ define void @test1(%struct.foo* nocapture %x, i32 %y) nounwind {
 ; CHECK-NEXT:    subl %edx, %eax
 ; CHECK-NEXT:    movl %eax, %esp
 ; CHECK-NEXT:    subl $4, %esp
-; CHECK-NEXT:    movl 84(%ecx), %edx
+; CHECK-NEXT:    movl 84(%ecx), %edi
+; CHECK-NEXT:    movl 80(%ecx), %ebx
+; CHECK-NEXT:    movl 76(%ecx), %edx
 ; CHECK-NEXT:    movl %edx, 68(%esi) ## 4-byte Spill
-; CHECK-NEXT:    movl 80(%ecx), %edi
-; CHECK-NEXT:    movl 76(%ecx), %ebx
 ; CHECK-NEXT:    movl 72(%ecx), %edx
 ; CHECK-NEXT:    movl %edx, 64(%esi) ## 4-byte Spill
 ; CHECK-NEXT:    movl 68(%ecx), %edx
@@ -68,9 +68,9 @@ define void @test1(%struct.foo* nocapture %x, i32 %y) nounwind {
 ; CHECK-NEXT:    movl (%ecx), %edx
 ; CHECK-NEXT:    movl %edx, 72(%esi) ## 4-byte Spill
 ; CHECK-NEXT:    movl 4(%ecx), %ecx
-; CHECK-NEXT:    pushl 68(%esi) ## 4-byte Folded Reload
 ; CHECK-NEXT:    pushl %edi
 ; CHECK-NEXT:    pushl %ebx
+; CHECK-NEXT:    pushl 68(%esi) ## 4-byte Folded Reload
 ; CHECK-NEXT:    pushl 64(%esi) ## 4-byte Folded Reload
 ; CHECK-NEXT:    pushl 60(%esi) ## 4-byte Folded Reload
 ; CHECK-NEXT:    pushl 56(%esi) ## 4-byte Folded Reload
@@ -119,10 +119,10 @@ define void @test2(%struct.foo* nocapture %x, i32 %y, i8* %z) nounwind {
 ; CHECK-NEXT:    movl 12(%ebp), %edi
 ; CHECK-NEXT:    movl 8(%ebp), %eax
 ; CHECK-NEXT:    subl $4, %esp
-; CHECK-NEXT:    movl 84(%eax), %ecx
+; CHECK-NEXT:    movl 84(%eax), %edx
+; CHECK-NEXT:    movl 80(%eax), %ebx
+; CHECK-NEXT:    movl 76(%eax), %ecx
 ; CHECK-NEXT:    movl %ecx, 68(%esi) ## 4-byte Spill
-; CHECK-NEXT:    movl 80(%eax), %edx
-; CHECK-NEXT:    movl 76(%eax), %ebx
 ; CHECK-NEXT:    movl 72(%eax), %ecx
 ; CHECK-NEXT:    movl %ecx, 64(%esi) ## 4-byte Spill
 ; CHECK-NEXT:    movl 68(%eax), %ecx
@@ -160,9 +160,9 @@ define void @test2(%struct.foo* nocapture %x, i32 %y, i8* %z) nounwind {
 ; CHECK-NEXT:    movl (%eax), %ecx
 ; CHECK-NEXT:    movl %ecx, 72(%esi) ## 4-byte Spill
 ; CHECK-NEXT:    movl 4(%eax), %eax
-; CHECK-NEXT:    pushl 68(%esi) ## 4-byte Folded Reload
 ; CHECK-NEXT:    pushl %edx
 ; CHECK-NEXT:    pushl %ebx
+; CHECK-NEXT:    pushl 68(%esi) ## 4-byte Folded Reload
 ; CHECK-NEXT:    pushl 64(%esi) ## 4-byte Folded Reload
 ; CHECK-NEXT:    pushl 60(%esi) ## 4-byte Folded Reload
 ; CHECK-NEXT:    pushl 56(%esi) ## 4-byte Folded Reload

diff  --git a/llvm/test/CodeGen/X86/statepoint-vreg-unlimited-tied-opnds.ll b/llvm/test/CodeGen/X86/statepoint-vreg-unlimited-tied-opnds.ll
index 3749741bb28db..8957429f5a8e0 100644
--- a/llvm/test/CodeGen/X86/statepoint-vreg-unlimited-tied-opnds.ll
+++ b/llvm/test/CodeGen/X86/statepoint-vreg-unlimited-tied-opnds.ll
@@ -53,9 +53,9 @@ define i32 @test_spill(
 
 ; CHECK-PREG:     renamable $rbx = COPY $r9
 ; CHECK-PREG:     MOV64mr %stack.6, 1, $noreg, 0, $noreg, $r8 :: (store (s64) into %stack.6)
-; CHECK-PREG:     renamable $r15 = COPY $rcx
-; CHECK-PREG:     renamable $r12 = COPY $rdx
-; CHECK-PREG:     renamable $r14 = COPY $rsi
+; CHECK-PREG:     renamable $r12 = COPY $rcx
+; CHECK-PREG:     renamable $r14 = COPY $rdx
+; CHECK-PREG:     renamable $r15 = COPY $rsi
 ; CHECK-PREG:     renamable $r13 = COPY $rdi
 ; CHECK-PREG:     renamable $rax = MOV64rm %fixed-stack.11, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.11, align 16)
 ; CHECK-PREG:     MOV64mr %stack.7, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.7)
@@ -80,38 +80,38 @@ define i32 @test_spill(
 ; CHECK-PREG:     MOV64mr %stack.9, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.9)
 ; CHECK-PREG:     renamable $rax = MOV64rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.0)
 ; CHECK-PREG:     MOV64mr %stack.10, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.10)
-; CHECK-PREG:     renamable $rbp, renamable $rbx, renamable $r15, renamable $r12, renamable $r14, renamable $r13 = STATEPOINT 0, 0, 0, @func, 2, 0, 2, 0, 2, 0, 2, 18, 1, 8, %stack.10, 0, 1, 8, %stack.9, 0, 1, 8, %stack.8, 0, 1, 8, %stack.0, 0, 1, 8, %stack.1, 0, 1, 8, %stack.2, 0, 1, 8, %stack.3, 0, 1, 8, %stack.4, 0, 1, 8, %stack.5, 0, 1, 8, %stack.11, 0, killed renamable $rbp(tied-def 0), 1, 8, %stack.7, 0, killed renamable $rbx(tied-def 1), 1, 8, %stack.6, 0, killed renamable $r15(tied-def 2), killed renamable $r12(tied-def 3), killed renamable $r14(tied-def 4), killed renamable $r13(tied-def 5), 2, 0, 2, 18, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, csr_64, implicit-def $rsp, implicit-def $ssp :: (load store (s64) on %stack.0), (load store (s64) on %stack.1), (load store (s64) on %stack.2), (load store (s64) on %stack.3), (load store (s64) on %stack.4), (load store (s64) on %stack.5), (load store (s64) on %stack.6), (load store (s64) on %stack.7), (load store (s64) on %stack.8), (load store (s64) on %stack.9), (load store (s64) on %stack.10), (load store (s64) on %stack.11)
+; CHECK-PREG:     renamable $rbp, renamable $rbx, renamable $r12, renamable $r14, renamable $r15, renamable $r13 = STATEPOINT 0, 0, 0, @func, 2, 0, 2, 0, 2, 0, 2, 18, 1, 8, %stack.10, 0, 1, 8, %stack.9, 0, 1, 8, %stack.8, 0, 1, 8, %stack.0, 0, 1, 8, %stack.1, 0, 1, 8, %stack.2, 0, 1, 8, %stack.3, 0, 1, 8, %stack.4, 0, 1, 8, %stack.5, 0, 1, 8, %stack.11, 0, killed renamable $rbp(tied-def 0), 1, 8, %stack.7, 0, killed renamable $rbx(tied-def 1), 1, 8, %stack.6, 0, killed renamable $r12(tied-def 2), killed renamable $r14(tied-def 3), killed renamable $r15(tied-def 4), killed renamable $r13(tied-def 5), 2, 0, 2, 18, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, csr_64, implicit-def $rsp, implicit-def $ssp :: (load store (s64) on %stack.0), (load store (s64) on %stack.1), (load store (s64) on %stack.2), (load store (s64) on %stack.3), (load store (s64) on %stack.4), (load store (s64) on %stack.5), (load store (s64) on %stack.6), (load store (s64) on %stack.7), (load store (s64) on %stack.8), (load store (s64) on %stack.9), (load store (s64) on %stack.10), (load store (s64) on %stack.11)
 ; CHECK-PREG:     renamable $eax = MOV32rm killed renamable $r13, 1, $noreg, 4, $noreg :: (load (s32) from %ir.gep00, addrspace 1)
-; CHECK-PREG:     renamable $eax = ADD32rm killed renamable $eax, killed renamable $r14, 1, $noreg, 8, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep01, addrspace 1)
-; CHECK-PREG:     renamable $eax = ADD32rm killed renamable $eax, killed renamable $r12, 1, $noreg, 12, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep02, addrspace 1)
-; CHECK-PREG:     renamable $eax = ADD32rm killed renamable $eax, killed renamable $r15, 1, $noreg, 16, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep03, addrspace 1)
-; CHECK-PREG:     renamable $rdx = MOV64rm %stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %stack.6)
-; CHECK-PREG:     renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdx, 1, $noreg, 20, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep04, addrspace 1)
+; CHECK-PREG:     renamable $eax = ADD32rm killed renamable $eax, killed renamable $r15, 1, $noreg, 8, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep01, addrspace 1)
+; CHECK-PREG:     renamable $eax = ADD32rm killed renamable $eax, killed renamable $r14, 1, $noreg, 12, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep02, addrspace 1)
+; CHECK-PREG:     renamable $eax = ADD32rm killed renamable $eax, killed renamable $r12, 1, $noreg, 16, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep03, addrspace 1)
+; CHECK-PREG:     renamable $rdi = MOV64rm %stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %stack.6)
+; CHECK-PREG:     renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 20, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep04, addrspace 1)
 ; CHECK-PREG:     renamable $eax = ADD32rm killed renamable $eax, killed renamable $rbx, 1, $noreg, 24, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep05, addrspace 1)
-; CHECK-PREG:     renamable $rdx = MOV64rm %stack.7, 1, $noreg, 0, $noreg :: (load (s64) from %stack.7)
-; CHECK-PREG:     renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdx, 1, $noreg, 28, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep06, addrspace 1)
+; CHECK-PREG:     renamable $rdi = MOV64rm %stack.7, 1, $noreg, 0, $noreg :: (load (s64) from %stack.7)
+; CHECK-PREG:     renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 28, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep06, addrspace 1)
 ; CHECK-PREG:     renamable $eax = ADD32rm killed renamable $eax, killed renamable $rbp, 1, $noreg, 32, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep07, addrspace 1)
 ; CHECK-PREG:     renamable $rcx = MOV64rm %stack.11, 1, $noreg, 0, $noreg :: (load (s64) from %stack.11)
 ; CHECK-PREG:     renamable $eax = ADD32rm killed renamable $eax, killed renamable $rcx, 1, $noreg, 36, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep08, addrspace 1)
-; CHECK-PREG:     renamable $rdx = MOV64rm %stack.5, 1, $noreg, 0, $noreg :: (load (s64) from %stack.5)
-; CHECK-PREG:     renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdx, 1, $noreg, 40, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep09, addrspace 1)
-; CHECK-PREG:     renamable $rdx = MOV64rm %stack.4, 1, $noreg, 0, $noreg :: (load (s64) from %stack.4)
-; CHECK-PREG:     renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdx, 1, $noreg, 44, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep10, addrspace 1)
-; CHECK-PREG:     renamable $rdx = MOV64rm %stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %stack.3)
-; CHECK-PREG:     renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdx, 1, $noreg, 48, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep11, addrspace 1)
-; CHECK-PREG:     renamable $rdx = MOV64rm %stack.2, 1, $noreg, 0, $noreg :: (load (s64) from %stack.2)
-; CHECK-PREG:     renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdx, 1, $noreg, 52, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep12, addrspace 1)
-; CHECK-PREG:     renamable $rdx = MOV64rm %stack.1, 1, $noreg, 0, $noreg :: (load (s64) from %stack.1)
-; CHECK-PREG:     renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdx, 1, $noreg, 56, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep13, addrspace 1)
-; CHECK-PREG:     renamable $rdx = MOV64rm %stack.0, 1, $noreg, 0, $noreg :: (load (s64) from %stack.0)
-; CHECK-PREG:     renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdx, 1, $noreg, 60, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep14, addrspace 1)
-; CHECK-PREG:     renamable $rdx = MOV64rm %stack.8, 1, $noreg, 0, $noreg :: (load (s64) from %stack.8)
-; CHECK-PREG:     renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdx, 1, $noreg, 64, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep15, addrspace 1)
+; CHECK-PREG:     renamable $rdi = MOV64rm %stack.5, 1, $noreg, 0, $noreg :: (load (s64) from %stack.5)
+; CHECK-PREG:     renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 40, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep09, addrspace 1)
+; CHECK-PREG:     renamable $rdi = MOV64rm %stack.4, 1, $noreg, 0, $noreg :: (load (s64) from %stack.4)
+; CHECK-PREG:     renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 44, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep10, addrspace 1)
+; CHECK-PREG:     renamable $rdi = MOV64rm %stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %stack.3)
+; CHECK-PREG:     renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 48, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep11, addrspace 1)
+; CHECK-PREG:     renamable $rdi = MOV64rm %stack.2, 1, $noreg, 0, $noreg :: (load (s64) from %stack.2)
+; CHECK-PREG:     renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 52, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep12, addrspace 1)
+; CHECK-PREG:     renamable $rdi = MOV64rm %stack.1, 1, $noreg, 0, $noreg :: (load (s64) from %stack.1)
+; CHECK-PREG:     renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 56, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep13, addrspace 1)
+; CHECK-PREG:     renamable $rdi = MOV64rm %stack.0, 1, $noreg, 0, $noreg :: (load (s64) from %stack.0)
+; CHECK-PREG:     renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 60, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep14, addrspace 1)
+; CHECK-PREG:     renamable $rsi = MOV64rm %stack.8, 1, $noreg, 0, $noreg :: (load (s64) from %stack.8)
+; CHECK-PREG:     renamable $eax = ADD32rm killed renamable $eax, killed renamable $rsi, 1, $noreg, 64, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep15, addrspace 1)
 ; CHECK-PREG:     renamable $rdx = MOV64rm %stack.9, 1, $noreg, 0, $noreg :: (load (s64) from %stack.9)
 ; CHECK-PREG:     renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdx, 1, $noreg, 68, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep16, addrspace 1)
 ; CHECK-PREG:     renamable $rcx = MOV64rm %stack.10, 1, $noreg, 0, $noreg :: (load (s64) from %stack.10)
 ; CHECK-PREG:     renamable $eax = ADD32rm killed renamable $eax, killed renamable $rcx, 1, $noreg, 72, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep17, addrspace 1)
- 
+
     %token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @func, i32 0, i32 0, i32 0, i32 0) [ "gc-live"(i32 addrspace(1)* %arg00, i32 addrspace(1)* %arg01, i32 addrspace(1)* %arg02, i32 addrspace(1)* %arg03, i32 addrspace(1)* %arg04, i32 addrspace(1)* %arg05, i32 addrspace(1)* %arg06, i32 addrspace(1)* %arg07, i32 addrspace(1)* %arg08,
     i32 addrspace(1)* %arg09, i32 addrspace(1)* %arg10, i32 addrspace(1)* %arg11, i32 addrspace(1)* %arg12, i32 addrspace(1)* %arg13, i32 addrspace(1)* %arg14, i32 addrspace(1)* %arg15, i32 addrspace(1)* %arg16, i32 addrspace(1)* %arg17) ]
     %rel00 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %token, i32 0, i32 0) ; (%arg00, %arg00)

diff  --git a/llvm/test/CodeGen/X86/subvector-broadcast.ll b/llvm/test/CodeGen/X86/subvector-broadcast.ll
index 231d9618e6afc..bd807c9e76c64 100644
--- a/llvm/test/CodeGen/X86/subvector-broadcast.ll
+++ b/llvm/test/CodeGen/X86/subvector-broadcast.ll
@@ -803,24 +803,24 @@ define <16 x i32> @test_broadcast_4i32_16i32_chain(<4 x i32>* %p0, <4 x float>*
 define dso_local void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> %b) {
 ; X86-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64:
 ; X86-AVX1:       # %bb.0: # %entry
-; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,0,2,0]
-; X86-AVX1-NEXT:    vpaddq %xmm3, %xmm0, %xmm4
+; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [1,0,2,0]
+; X86-AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm3
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [3,0,4,0]
 ; X86-AVX1-NEXT:    vpaddq %xmm5, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vmovaps {{.*#+}} ymm6 = [1,0,2,0,3,0,4,0]
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
 ; X86-AVX1-NEXT:    vpaddq %xmm5, %xmm7, %xmm7
-; X86-AVX1-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
+; X86-AVX1-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
 ; X86-AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm2, %ymm2
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
 ; X86-AVX1-NEXT:    vpaddq %xmm5, %xmm7, %xmm5
-; X86-AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vpaddq %xmm4, %xmm1, %xmm1
 ; X86-AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm1, %ymm1
 ; X86-AVX1-NEXT:    vandps %ymm6, %ymm1, %ymm1
 ; X86-AVX1-NEXT:    vandps %ymm6, %ymm2, %ymm2
 ; X86-AVX1-NEXT:    vmovdqu %xmm0, ga4+16
-; X86-AVX1-NEXT:    vmovdqu %xmm4, ga4
+; X86-AVX1-NEXT:    vmovdqu %xmm3, ga4
 ; X86-AVX1-NEXT:    vmovups %ymm2, gb4+32
 ; X86-AVX1-NEXT:    vmovups %ymm1, gb4
 ; X86-AVX1-NEXT:    vzeroupper

diff  --git a/llvm/test/CodeGen/X86/uadd_sat.ll b/llvm/test/CodeGen/X86/uadd_sat.ll
index 62d098472f2ea..39ad8ab47bcd7 100644
--- a/llvm/test/CodeGen/X86/uadd_sat.ll
+++ b/llvm/test/CodeGen/X86/uadd_sat.ll
@@ -126,23 +126,23 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl $-1, %ebx
-; X86-NEXT:    cmovbl %ebx, %edi
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmovbl %ebx, %esi
+; X86-NEXT:    cmovbl %ebx, %ecx
 ; X86-NEXT:    addl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    cmovbl %ebx, %edx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmovbl %ebx, %ecx
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl %edx, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edi, (%eax)
+; X86-NEXT:    addl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    cmovbl %ebx, %esi
+; X86-NEXT:    addl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    cmovbl %ebx, %edi
+; X86-NEXT:    movl %edi, 12(%eax)
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx

diff  --git a/llvm/test/CodeGen/X86/udiv_fix_sat.ll b/llvm/test/CodeGen/X86/udiv_fix_sat.ll
index 91fa594f65543..2c0aee95b18b5 100644
--- a/llvm/test/CodeGen/X86/udiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/udiv_fix_sat.ll
@@ -392,16 +392,16 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    addl %ecx, %ecx
-; X86-NEXT:    setb %al
-; X86-NEXT:    shldl $31, %ecx, %eax
-; X86-NEXT:    shll $31, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    addl %eax, %eax
+; X86-NEXT:    setb %cl
+; X86-NEXT:    shldl $31, %eax, %ecx
+; X86-NEXT:    shll $31, %eax
 ; X86-NEXT:    pushl $0
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
 ; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll __udivdi3
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -444,32 +444,32 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    calll __udivdi3
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    cmpl $2, %edx
-; X86-NEXT:    movl $-1, %ecx
-; X86-NEXT:    cmovael %ecx, %eax
+; X86-NEXT:    movl $-1, %esi
+; X86-NEXT:    cmovael %esi, %eax
 ; X86-NEXT:    cmpl $1, %edx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    cmovael %esi, %edx
+; X86-NEXT:    movl $1, %ecx
+; X86-NEXT:    cmovael %ecx, %edx
 ; X86-NEXT:    shldl $31, %eax, %edx
 ; X86-NEXT:    cmpl $2, %edi
-; X86-NEXT:    cmovael %ecx, %ebx
+; X86-NEXT:    cmovael %esi, %ebx
 ; X86-NEXT:    cmpl $1, %edi
-; X86-NEXT:    cmovael %esi, %edi
+; X86-NEXT:    cmovael %ecx, %edi
 ; X86-NEXT:    shldl $31, %ebx, %edi
 ; X86-NEXT:    cmpl $2, %ebp
 ; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NEXT:    cmovael %ecx, %eax
+; X86-NEXT:    cmovael %esi, %eax
 ; X86-NEXT:    cmpl $1, %ebp
-; X86-NEXT:    cmovael %esi, %ebp
+; X86-NEXT:    cmovael %ecx, %ebp
 ; X86-NEXT:    shldl $31, %eax, %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    cmpl $2, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    cmovael %ecx, %eax
+; X86-NEXT:    cmovael %esi, %eax
 ; X86-NEXT:    cmpl $1, %ebx
-; X86-NEXT:    cmovbl %ebx, %esi
-; X86-NEXT:    shldl $31, %eax, %esi
+; X86-NEXT:    cmovbl %ebx, %ecx
+; X86-NEXT:    shldl $31, %eax, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %esi, 12(%eax)
+; X86-NEXT:    movl %ecx, 12(%eax)
 ; X86-NEXT:    movl %ebp, 8(%eax)
 ; X86-NEXT:    movl %edi, 4(%eax)
 ; X86-NEXT:    movl %edx, (%eax)

diff  --git a/llvm/test/CodeGen/X86/umax.ll b/llvm/test/CodeGen/X86/umax.ll
index 38052f339af7c..c0bff04f401b0 100644
--- a/llvm/test/CodeGen/X86/umax.ll
+++ b/llvm/test/CodeGen/X86/umax.ll
@@ -154,50 +154,50 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    pushl %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl %ebx, %edx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    cmoval %edx, %eax
-; X86-NEXT:    cmpl %esi, %ecx
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    cmoval %edx, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    cmpl %ecx, %esi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    cmoval %esi, %eax
+; X86-NEXT:    cmpl %edx, %edi
+; X86-NEXT:    movl %ecx, %ebp
+; X86-NEXT:    cmoval %esi, %ebp
 ; X86-NEXT:    cmovel %eax, %ebp
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    cmoval %ecx, %eax
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    cmoval %edi, %eax
 ; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    sbbl %edi, %ecx
-; X86-NEXT:    cmovbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmovbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    xorl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    sbbl %ebx, %edi
+; X86-NEXT:    cmovbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmovbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ebx, %edi
 ; X86-NEXT:    xorl %eax, %edi
-; X86-NEXT:    orl %ecx, %edi
-; X86-NEXT:    cmovel %ebp, %ebx
-; X86-NEXT:    cmovel (%esp), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmpl %eax, %edi
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    cmoval %edi, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    xorl %esi, %ebx
+; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:    cmovel %ebp, %ecx
+; X86-NEXT:    cmovel (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    cmpl %esi, %ebx
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    cmoval %ebx, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    cmpl %edx, %ebp
-; X86-NEXT:    cmoval %edi, %eax
-; X86-NEXT:    cmovel %ecx, %eax
-; X86-NEXT:    cmoval %ebp, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %edx, 12(%ecx)
-; X86-NEXT:    movl %eax, 8(%ecx)
-; X86-NEXT:    movl %esi, 4(%ecx)
-; X86-NEXT:    movl %ebx, (%ecx)
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    cmpl %eax, %ebp
+; X86-NEXT:    cmoval %ebx, %esi
+; X86-NEXT:    cmovel %edi, %esi
+; X86-NEXT:    cmoval %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %eax, 12(%edi)
+; X86-NEXT:    movl %esi, 8(%edi)
+; X86-NEXT:    movl %edx, 4(%edi)
+; X86-NEXT:    movl %ecx, (%edi)
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    addl $4, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi

diff  --git a/llvm/test/CodeGen/X86/umin.ll b/llvm/test/CodeGen/X86/umin.ll
index 84170d75f67b7..45647bf1b42af 100644
--- a/llvm/test/CodeGen/X86/umin.ll
+++ b/llvm/test/CodeGen/X86/umin.ll
@@ -157,47 +157,46 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmpl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    cmpl %ecx, %edi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    cmovbl %edi, %eax
+; X86-NEXT:    cmpl %esi, %ebp
 ; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    cmovbl %eax, %ebx
-; X86-NEXT:    cmpl %esi, %edi
-; X86-NEXT:    movl %ecx, %ebp
-; X86-NEXT:    cmovbl %eax, %ebp
-; X86-NEXT:    cmovel %ebx, %ebp
+; X86-NEXT:    cmovbl %edi, %ebx
+; X86-NEXT:    cmovel %eax, %ebx
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    cmovbl %edi, %eax
+; X86-NEXT:    cmovbl %ebp, %eax
 ; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    cmpl %edx, %edi
 ; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    cmovbl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:    sbbl %eax, %edi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    sbbl %edi, %ebp
 ; X86-NEXT:    cmovbl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    cmovbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:    xorl %eax, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    xorl %edx, %ebx
-; X86-NEXT:    orl %edi, %ebx
-; X86-NEXT:    cmovel %ebp, %ecx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    xorl %edi, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    xorl %edx, %eax
+; X86-NEXT:    orl %ebp, %eax
+; X86-NEXT:    cmovel %ebx, %ecx
 ; X86-NEXT:    cmovel (%esp), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmpl %eax, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl %edi, %eax
 ; X86-NEXT:    cmovbl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    cmovbl %edi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %eax, 12(%edi)
-; X86-NEXT:    movl %edx, 8(%edi)
-; X86-NEXT:    movl %esi, 4(%edi)
-; X86-NEXT:    movl %ecx, (%edi)
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    cmovbl %eax, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %edi, 12(%eax)
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi

diff  --git a/llvm/test/CodeGen/X86/umul-with-overflow.ll b/llvm/test/CodeGen/X86/umul-with-overflow.ll
index e5c61c418a705..3afddbb7f0e4d 100644
--- a/llvm/test/CodeGen/X86/umul-with-overflow.ll
+++ b/llvm/test/CodeGen/X86/umul-with-overflow.ll
@@ -92,7 +92,7 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %edi
@@ -105,7 +105,7 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    movl %ecx, %ebx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    adcl %edi, %esi
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -141,12 +141,12 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    addl %ebx, %esi
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl (%esp), %esi # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl %ebp, %eax
@@ -175,13 +175,13 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %ecx
 ; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    adcl (%esp), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    adcl (%esp), %ecx # 4-byte Folded Reload
-; X86-NEXT:    setb (%esp) # 1-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    mull %edi
@@ -200,16 +200,16 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    adcl %edi, %esi
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    setb (%esp) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X86-NEXT:    movzbl (%esp), %esi # 1-byte Folded Reload
 ; X86-NEXT:    adcl %esi, %edx
 ; X86-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl %ecx, %ebp
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X86-NEXT:    adcl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
@@ -220,7 +220,8 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebx
@@ -229,75 +230,76 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %ebp
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    adcl %esi, %ecx
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    adcl %eax, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %esi, %ebp
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ebx, %esi
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movzbl %cl, %ecx
+; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %edi, %ebp
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %esi, %ecx
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movzbl %bl, %ecx
 ; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %esi, %edi
-; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    adcl %ecx, %ebp
-; X86-NEXT:    setb %cl
+; X86-NEXT:    adcl %esi, %ebp
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %ebp, %esi
-; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
@@ -365,7 +367,7 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ebx
@@ -377,7 +379,7 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    adcl %ebx, %edi
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -406,44 +408,43 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %esi, %ebp
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    adcl %edi, %esi
+; X86-NEXT:    adcl %edi, %ebp
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ebp, %esi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edi
 ; X86-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    imull %ecx, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    imull %ecx, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %edx, %ebx
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    addl %ebx, %esi
-; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    adcl %edi, %esi
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    addl %edx, %ebp
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    addl %ebp, %ebx
+; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    adcl %edi, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -451,66 +452,66 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    imull %edx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %edx
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    addl %edx, %ebp
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    addl %ebp, %ebx
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    addl %ebp, %esi
 ; X86-NEXT:    addl %ecx, %edi
-; X86-NEXT:    adcl %esi, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    adcl %ebx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    imull %edx, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    imull %edx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %edx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edx, %ebx
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %edx, %ecx
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    addl %ecx, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    imull %edx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull %edx
 ; X86-NEXT:    addl %edx, %ebp
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    addl %ebp, %ebx
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    adcl %ecx, %ebx
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    adcl %esi, %ecx
 ; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, (%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, 4(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, 8(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, 12(%ecx)
-; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, 16(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, 20(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, 24(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, 28(%ecx)
-; X86-NEXT:    movl %eax, 32(%ecx)
-; X86-NEXT:    andl $4095, %ebx # imm = 0xFFF
-; X86-NEXT:    movw %bx, 36(%ecx)
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, (%edx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, 4(%edx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, 8(%edx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, 12(%edx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, 16(%edx)
+; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, 20(%edx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, 24(%edx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, 28(%edx)
+; X86-NEXT:    movl %eax, 32(%edx)
+; X86-NEXT:    andl $4095, %ecx # imm = 0xFFF
+; X86-NEXT:    movw %cx, 36(%edx)
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    addl $76, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi

diff --git a/llvm/test/CodeGen/X86/umul_fix.ll b/llvm/test/CodeGen/X86/umul_fix.ll
index 37cdc49bdfad6..ad2450a19e006 100644
--- a/llvm/test/CodeGen/X86/umul_fix.ll
+++ b/llvm/test/CodeGen/X86/umul_fix.ll
@@ -43,27 +43,27 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %ebx, %ebp
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    adcl %edi, %edx
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    addl %edx, %esi
-; X86-NEXT:    shldl $30, %eax, %esi
-; X86-NEXT:    shldl $30, %ecx, %eax
-; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addl %edx, %ecx
+; X86-NEXT:    shldl $30, %eax, %ecx
+; X86-NEXT:    shldl $30, %esi, %eax
+; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -122,28 +122,28 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    shldl $30, %eax, %ebp
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    shldl $30, %eax, %esi
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    shldl $30, %eax, %ebx
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    shldl $30, %eax, %edi
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    shldl $30, %eax, %ebp
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    shldl $30, %eax, %edx
 ; X86-NEXT:    movl %edx, 12(%ecx)
-; X86-NEXT:    movl %edi, 8(%ecx)
+; X86-NEXT:    movl %ebp, 8(%ecx)
 ; X86-NEXT:    movl %ebx, 4(%ecx)
-; X86-NEXT:    movl %ebp, (%ecx)
+; X86-NEXT:    movl %esi, (%ecx)
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -267,23 +267,23 @@ define i64 @func7(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull %ebp
 ; X86-NEXT:    addl %edx, %ebx
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull %ebp
 ; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    adcl %edi, %edx
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -308,24 +308,24 @@ define i64 @func8(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    addl %edx, %ebx
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    addl %edx, %edi
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    adcl %edi, %edx
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    adcl %esi, %edx
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    addl %ebp, %edx
 ; X86-NEXT:    adcl $0, %ecx
@@ -355,31 +355,30 @@ define i64 @func9(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %ebx, %ecx
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    adcl %edi, %edx
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    addl %edx, %ebp
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    addl %edx, %ebx
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %edi, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx

diff --git a/llvm/test/CodeGen/X86/umul_fix_sat.ll b/llvm/test/CodeGen/X86/umul_fix_sat.ll
index ad980b961bc6a..ad022545e51e7 100644
--- a/llvm/test/CodeGen/X86/umul_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/umul_fix_sat.ll
@@ -52,32 +52,32 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    addl %esi, %ebp
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    adcl %edi, %edx
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    addl %ebx, %edx
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    shrdl $2, %eax, %ecx
 ; X86-NEXT:    shrdl $2, %edx, %eax
 ; X86-NEXT:    shrl $2, %edx
-; X86-NEXT:    orl %ebx, %edx
+; X86-NEXT:    orl %esi, %edx
 ; X86-NEXT:    movl $-1, %edx
 ; X86-NEXT:    cmovnel %edx, %ecx
 ; X86-NEXT:    cmovel %eax, %edx
@@ -195,8 +195,8 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, %esi
@@ -204,26 +204,26 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    cmpl $4, %edx
 ; X86-NEXT:    movl $-1, %ecx
 ; X86-NEXT:    cmovael %ecx, %esi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    shrdl $2, %edx, %ebp
-; X86-NEXT:    cmpl $4, %edx
-; X86-NEXT:    cmovael %ecx, %ebp
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    shrdl $2, %edx, %ebx
 ; X86-NEXT:    cmpl $4, %edx
 ; X86-NEXT:    cmovael %ecx, %ebx
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    shrdl $2, %edx, %ebp
+; X86-NEXT:    cmpl $4, %edx
+; X86-NEXT:    cmovael %ecx, %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    shrdl $2, %edx, %eax
 ; X86-NEXT:    cmpl $4, %edx
 ; X86-NEXT:    cmovael %ecx, %eax
 ; X86-NEXT:    movl %eax, 12(%edi)
-; X86-NEXT:    movl %ebx, 8(%edi)
-; X86-NEXT:    movl %ebp, 4(%edi)
+; X86-NEXT:    movl %ebp, 8(%edi)
+; X86-NEXT:    movl %ebx, 4(%edi)
 ; X86-NEXT:    movl %esi, (%edi)
 ; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    popl %esi
@@ -279,29 +279,31 @@ define i64 @func5(i64 %x, i64 %y) {
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    testl %esi, %esi
 ; X86-NEXT:    setne %dl
 ; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %cl
-; X86-NEXT:    andb %dl, %cl
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    setne %bl
+; X86-NEXT:    andb %dl, %bl
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    seto %bl
+; X86-NEXT:    seto %bh
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    seto %ch
-; X86-NEXT:    orb %bl, %ch
+; X86-NEXT:    seto %cl
+; X86-NEXT:    orb %bh, %cl
 ; X86-NEXT:    addl %edi, %esi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    setb %bl
-; X86-NEXT:    orb %ch, %bl
-; X86-NEXT:    orb %cl, %bl
+; X86-NEXT:    setb %ch
+; X86-NEXT:    orb %cl, %ch
+; X86-NEXT:    orb %bl, %ch
 ; X86-NEXT:    movl $-1, %ecx
 ; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    cmovnel %ecx, %edx
@@ -393,28 +395,28 @@ define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    movl $-1, %esi
-; X86-NEXT:    cmovol %esi, %ebp
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl $-1, %edi
+; X86-NEXT:    cmovol %edi, %esi
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    cmovol %esi, %ebx
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    cmovol %edi, %ebx
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    cmovol %esi, %edi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    cmovol %edi, %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    cmovol %esi, %eax
+; X86-NEXT:    cmovol %edi, %eax
 ; X86-NEXT:    movl %eax, 12(%ecx)
-; X86-NEXT:    movl %edi, 8(%ecx)
+; X86-NEXT:    movl %ebp, 8(%ecx)
 ; X86-NEXT:    movl %ebx, 4(%ecx)
-; X86-NEXT:    movl %ebp, (%ecx)
+; X86-NEXT:    movl %esi, (%ecx)
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -443,31 +445,30 @@ define i64 @func7(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %ebx, %ecx
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    adcl %edi, %edx
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    addl %ebp, %edx
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    addl %ebx, %edx
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    cmpl $1, %ebx
+; X86-NEXT:    cmpl $1, %edi
 ; X86-NEXT:    sbbl %ecx, %ecx
 ; X86-NEXT:    notl %ecx
 ; X86-NEXT:    orl %ecx, %eax
@@ -500,24 +501,24 @@ define i64 @func8(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %ebx, %ebp
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    adcl %edi, %edx
+; X86-NEXT:    adcl %esi, %edx
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    addl %ebx, %edx
 ; X86-NEXT:    adcl $0, %ecx

diff --git a/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll b/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll
index 044e5f2da288b..f5b5d9886353f 100644
--- a/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll
@@ -16,25 +16,27 @@ define { i64, i8 } @mulodi_test(i64 %l, i64 %r) unnamed_addr #0 {
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    testl %esi, %esi
 ; X86-NEXT:    setne %dl
 ; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    setne %bl
 ; X86-NEXT:    andb %dl, %bl
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    seto %cl
+; X86-NEXT:    seto %bh
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    seto %ch
-; X86-NEXT:    orb %cl, %ch
+; X86-NEXT:    orb %bh, %ch
 ; X86-NEXT:    addl %edi, %esi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    addl %esi, %edx
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    orb %ch, %cl

diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
index e52d4a419cb51..e69901d3f602d 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
@@ -993,16 +993,16 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin
 ; CHECK-BASELINE-NEXT:    notl %r14d
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r14w
 ; CHECK-BASELINE-NEXT:    orl %ebx, %r14d
-; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
-; CHECK-BASELINE-NEXT:    andw %r11w, %bx
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
+; CHECK-BASELINE-NEXT:    andw %r11w, %di
 ; CHECK-BASELINE-NEXT:    notl %r11d
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r11w
-; CHECK-BASELINE-NEXT:    orl %ebx, %r11d
-; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
-; CHECK-BASELINE-NEXT:    andw %r10w, %bx
+; CHECK-BASELINE-NEXT:    orl %edi, %r11d
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
+; CHECK-BASELINE-NEXT:    andw %r10w, %di
 ; CHECK-BASELINE-NEXT:    notl %r10d
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r10w
-; CHECK-BASELINE-NEXT:    orl %ebx, %r10d
+; CHECK-BASELINE-NEXT:    orl %edi, %r10d
 ; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
 ; CHECK-BASELINE-NEXT:    andl %ebx, %r9d
 ; CHECK-BASELINE-NEXT:    notl %ebx
@@ -1055,16 +1055,16 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin
 ; CHECK-SSE1-NEXT:    notl %r14d
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r14w
 ; CHECK-SSE1-NEXT:    orl %ebx, %r14d
-; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
-; CHECK-SSE1-NEXT:    andw %r11w, %bx
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
+; CHECK-SSE1-NEXT:    andw %r11w, %di
 ; CHECK-SSE1-NEXT:    notl %r11d
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r11w
-; CHECK-SSE1-NEXT:    orl %ebx, %r11d
-; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
-; CHECK-SSE1-NEXT:    andw %r10w, %bx
+; CHECK-SSE1-NEXT:    orl %edi, %r11d
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
+; CHECK-SSE1-NEXT:    andw %r10w, %di
 ; CHECK-SSE1-NEXT:    notl %r10d
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r10w
-; CHECK-SSE1-NEXT:    orl %ebx, %r10d
+; CHECK-SSE1-NEXT:    orl %edi, %r10d
 ; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
 ; CHECK-SSE1-NEXT:    andl %ebx, %r9d
 ; CHECK-SSE1-NEXT:    notl %ebx
@@ -1320,7 +1320,7 @@ define <32 x i8> @out_v32i8(<32 x i8> *%px, <32 x i8> *%py, <32 x i8> *%pmask) n
 ; CHECK-BASELINE-NEXT:    pushq %r12
 ; CHECK-BASELINE-NEXT:    pushq %rbx
 ; CHECK-BASELINE-NEXT:    movq %rcx, %r15
-; CHECK-BASELINE-NEXT:    movq %rsi, %r14
+; CHECK-BASELINE-NEXT:    movq %rsi, %r12
 ; CHECK-BASELINE-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-BASELINE-NEXT:    movb 16(%rcx), %al
 ; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
@@ -1332,7 +1332,7 @@ define <32 x i8> @out_v32i8(<32 x i8> *%px, <32 x i8> *%py, <32 x i8> *%pmask) n
 ; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; CHECK-BASELINE-NEXT:    movb 20(%rcx), %al
 ; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 21(%rcx), %r12b
+; CHECK-BASELINE-NEXT:    movb 21(%rcx), %r14b
 ; CHECK-BASELINE-NEXT:    movb 22(%rcx), %r9b
 ; CHECK-BASELINE-NEXT:    movb 23(%rcx), %r10b
 ; CHECK-BASELINE-NEXT:    movb 24(%rcx), %r11b
@@ -1343,94 +1343,94 @@ define <32 x i8> @out_v32i8(<32 x i8> *%px, <32 x i8> *%py, <32 x i8> *%pmask) n
 ; CHECK-BASELINE-NEXT:    movb 29(%rcx), %sil
 ; CHECK-BASELINE-NEXT:    movb 30(%rcx), %bl
 ; CHECK-BASELINE-NEXT:    movb 31(%rcx), %al
-; CHECK-BASELINE-NEXT:    movb 31(%r14), %cl
+; CHECK-BASELINE-NEXT:    movb 31(%r12), %cl
 ; CHECK-BASELINE-NEXT:    andb %al, %cl
 ; CHECK-BASELINE-NEXT:    notb %al
 ; CHECK-BASELINE-NEXT:    andb 31(%rdx), %al
 ; CHECK-BASELINE-NEXT:    orb %cl, %al
 ; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 30(%r14), %al
+; CHECK-BASELINE-NEXT:    movb 30(%r12), %al
 ; CHECK-BASELINE-NEXT:    andb %bl, %al
 ; CHECK-BASELINE-NEXT:    notb %bl
 ; CHECK-BASELINE-NEXT:    andb 30(%rdx), %bl
 ; CHECK-BASELINE-NEXT:    orb %al, %bl
 ; CHECK-BASELINE-NEXT:    movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 29(%r14), %al
+; CHECK-BASELINE-NEXT:    movb 29(%r12), %al
 ; CHECK-BASELINE-NEXT:    andb %sil, %al
 ; CHECK-BASELINE-NEXT:    notb %sil
 ; CHECK-BASELINE-NEXT:    andb 29(%rdx), %sil
 ; CHECK-BASELINE-NEXT:    orb %al, %sil
 ; CHECK-BASELINE-NEXT:    movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 28(%r14), %al
+; CHECK-BASELINE-NEXT:    movb 28(%r12), %al
 ; CHECK-BASELINE-NEXT:    andb %dil, %al
 ; CHECK-BASELINE-NEXT:    notb %dil
 ; CHECK-BASELINE-NEXT:    andb 28(%rdx), %dil
 ; CHECK-BASELINE-NEXT:    orb %al, %dil
 ; CHECK-BASELINE-NEXT:    movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 27(%r14), %al
+; CHECK-BASELINE-NEXT:    movb 27(%r12), %al
 ; CHECK-BASELINE-NEXT:    andb %r8b, %al
 ; CHECK-BASELINE-NEXT:    notb %r8b
 ; CHECK-BASELINE-NEXT:    andb 27(%rdx), %r8b
 ; CHECK-BASELINE-NEXT:    orb %al, %r8b
 ; CHECK-BASELINE-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 26(%r14), %al
+; CHECK-BASELINE-NEXT:    movb 26(%r12), %al
 ; CHECK-BASELINE-NEXT:    andb %r13b, %al
 ; CHECK-BASELINE-NEXT:    notb %r13b
 ; CHECK-BASELINE-NEXT:    andb 26(%rdx), %r13b
 ; CHECK-BASELINE-NEXT:    orb %al, %r13b
 ; CHECK-BASELINE-NEXT:    movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 25(%r14), %al
+; CHECK-BASELINE-NEXT:    movb 25(%r12), %al
 ; CHECK-BASELINE-NEXT:    andb %bpl, %al
 ; CHECK-BASELINE-NEXT:    notb %bpl
 ; CHECK-BASELINE-NEXT:    andb 25(%rdx), %bpl
 ; CHECK-BASELINE-NEXT:    orb %al, %bpl
 ; CHECK-BASELINE-NEXT:    movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 24(%r14), %al
+; CHECK-BASELINE-NEXT:    movb 24(%r12), %al
 ; CHECK-BASELINE-NEXT:    andb %r11b, %al
 ; CHECK-BASELINE-NEXT:    notb %r11b
 ; CHECK-BASELINE-NEXT:    andb 24(%rdx), %r11b
 ; CHECK-BASELINE-NEXT:    orb %al, %r11b
 ; CHECK-BASELINE-NEXT:    movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 23(%r14), %al
+; CHECK-BASELINE-NEXT:    movb 23(%r12), %al
 ; CHECK-BASELINE-NEXT:    andb %r10b, %al
 ; CHECK-BASELINE-NEXT:    notb %r10b
 ; CHECK-BASELINE-NEXT:    andb 23(%rdx), %r10b
 ; CHECK-BASELINE-NEXT:    orb %al, %r10b
 ; CHECK-BASELINE-NEXT:    movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 22(%r14), %al
+; CHECK-BASELINE-NEXT:    movb 22(%r12), %al
 ; CHECK-BASELINE-NEXT:    andb %r9b, %al
 ; CHECK-BASELINE-NEXT:    notb %r9b
 ; CHECK-BASELINE-NEXT:    andb 22(%rdx), %r9b
 ; CHECK-BASELINE-NEXT:    orb %al, %r9b
 ; CHECK-BASELINE-NEXT:    movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 21(%r14), %al
-; CHECK-BASELINE-NEXT:    andb %r12b, %al
-; CHECK-BASELINE-NEXT:    notb %r12b
-; CHECK-BASELINE-NEXT:    andb 21(%rdx), %r12b
-; CHECK-BASELINE-NEXT:    orb %al, %r12b
-; CHECK-BASELINE-NEXT:    movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 20(%r14), %al
+; CHECK-BASELINE-NEXT:    movb 21(%r12), %al
+; CHECK-BASELINE-NEXT:    andb %r14b, %al
+; CHECK-BASELINE-NEXT:    notb %r14b
+; CHECK-BASELINE-NEXT:    andb 21(%rdx), %r14b
+; CHECK-BASELINE-NEXT:    orb %al, %r14b
+; CHECK-BASELINE-NEXT:    movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 20(%r12), %al
 ; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
 ; CHECK-BASELINE-NEXT:    andb %cl, %al
 ; CHECK-BASELINE-NEXT:    notb %cl
 ; CHECK-BASELINE-NEXT:    andb 20(%rdx), %cl
 ; CHECK-BASELINE-NEXT:    orb %al, %cl
 ; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 19(%r14), %al
+; CHECK-BASELINE-NEXT:    movb 19(%r12), %al
 ; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
 ; CHECK-BASELINE-NEXT:    andb %cl, %al
 ; CHECK-BASELINE-NEXT:    notb %cl
 ; CHECK-BASELINE-NEXT:    andb 19(%rdx), %cl
 ; CHECK-BASELINE-NEXT:    orb %al, %cl
 ; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 18(%r14), %al
+; CHECK-BASELINE-NEXT:    movb 18(%r12), %al
 ; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
 ; CHECK-BASELINE-NEXT:    andb %cl, %al
 ; CHECK-BASELINE-NEXT:    notb %cl
 ; CHECK-BASELINE-NEXT:    andb 18(%rdx), %cl
 ; CHECK-BASELINE-NEXT:    orb %al, %cl
 ; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 17(%r14), %al
+; CHECK-BASELINE-NEXT:    movb 17(%r12), %al
 ; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
 ; CHECK-BASELINE-NEXT:    andb %cl, %al
 ; CHECK-BASELINE-NEXT:    notb %cl
@@ -1438,7 +1438,7 @@ define <32 x i8> @out_v32i8(<32 x i8> *%px, <32 x i8> *%py, <32 x i8> *%pmask) n
 ; CHECK-BASELINE-NEXT:    andb 17(%rdx), %cl
 ; CHECK-BASELINE-NEXT:    orb %al, %cl
 ; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 16(%r14), %al
+; CHECK-BASELINE-NEXT:    movb 16(%r12), %al
 ; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
 ; CHECK-BASELINE-NEXT:    andb %cl, %al
 ; CHECK-BASELINE-NEXT:    notb %cl
@@ -1446,105 +1446,105 @@ define <32 x i8> @out_v32i8(<32 x i8> *%px, <32 x i8> *%py, <32 x i8> *%pmask) n
 ; CHECK-BASELINE-NEXT:    orb %al, %cl
 ; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; CHECK-BASELINE-NEXT:    movb 15(%r15), %cl
-; CHECK-BASELINE-NEXT:    movb 15(%r14), %al
+; CHECK-BASELINE-NEXT:    movb 15(%r12), %al
 ; CHECK-BASELINE-NEXT:    andb %cl, %al
 ; CHECK-BASELINE-NEXT:    notb %cl
 ; CHECK-BASELINE-NEXT:    andb 15(%rdx), %cl
 ; CHECK-BASELINE-NEXT:    orb %al, %cl
 ; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; CHECK-BASELINE-NEXT:    movb 14(%r15), %cl
-; CHECK-BASELINE-NEXT:    movb 14(%r14), %al
+; CHECK-BASELINE-NEXT:    movb 14(%r12), %al
 ; CHECK-BASELINE-NEXT:    andb %cl, %al
 ; CHECK-BASELINE-NEXT:    notb %cl
 ; CHECK-BASELINE-NEXT:    andb 14(%rdx), %cl
 ; CHECK-BASELINE-NEXT:    orb %al, %cl
 ; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; CHECK-BASELINE-NEXT:    movb 13(%r15), %cl
-; CHECK-BASELINE-NEXT:    movb 13(%r14), %al
+; CHECK-BASELINE-NEXT:    movb 13(%r12), %al
 ; CHECK-BASELINE-NEXT:    andb %cl, %al
 ; CHECK-BASELINE-NEXT:    notb %cl
 ; CHECK-BASELINE-NEXT:    andb 13(%rdx), %cl
 ; CHECK-BASELINE-NEXT:    orb %al, %cl
 ; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; CHECK-BASELINE-NEXT:    movb 12(%r15), %cl
-; CHECK-BASELINE-NEXT:    movb 12(%r14), %al
+; CHECK-BASELINE-NEXT:    movb 12(%r12), %al
 ; CHECK-BASELINE-NEXT:    andb %cl, %al
 ; CHECK-BASELINE-NEXT:    notb %cl
 ; CHECK-BASELINE-NEXT:    andb 12(%rdx), %cl
 ; CHECK-BASELINE-NEXT:    orb %al, %cl
 ; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; CHECK-BASELINE-NEXT:    movb 11(%r15), %r13b
-; CHECK-BASELINE-NEXT:    movb 11(%r14), %al
+; CHECK-BASELINE-NEXT:    movb 11(%r12), %al
 ; CHECK-BASELINE-NEXT:    andb %r13b, %al
 ; CHECK-BASELINE-NEXT:    notb %r13b
 ; CHECK-BASELINE-NEXT:    andb 11(%rdx), %r13b
 ; CHECK-BASELINE-NEXT:    orb %al, %r13b
-; CHECK-BASELINE-NEXT:    movb 10(%r15), %r12b
-; CHECK-BASELINE-NEXT:    movb 10(%r14), %al
-; CHECK-BASELINE-NEXT:    andb %r12b, %al
-; CHECK-BASELINE-NEXT:    notb %r12b
-; CHECK-BASELINE-NEXT:    andb 10(%rdx), %r12b
-; CHECK-BASELINE-NEXT:    orb %al, %r12b
+; CHECK-BASELINE-NEXT:    movb 10(%r15), %r14b
+; CHECK-BASELINE-NEXT:    movb 10(%r12), %al
+; CHECK-BASELINE-NEXT:    andb %r14b, %al
+; CHECK-BASELINE-NEXT:    notb %r14b
+; CHECK-BASELINE-NEXT:    andb 10(%rdx), %r14b
+; CHECK-BASELINE-NEXT:    orb %al, %r14b
 ; CHECK-BASELINE-NEXT:    movb 9(%r15), %bpl
-; CHECK-BASELINE-NEXT:    movb 9(%r14), %al
+; CHECK-BASELINE-NEXT:    movb 9(%r12), %al
 ; CHECK-BASELINE-NEXT:    andb %bpl, %al
 ; CHECK-BASELINE-NEXT:    notb %bpl
 ; CHECK-BASELINE-NEXT:    andb 9(%rdx), %bpl
 ; CHECK-BASELINE-NEXT:    orb %al, %bpl
 ; CHECK-BASELINE-NEXT:    movb 8(%r15), %r11b
-; CHECK-BASELINE-NEXT:    movb 8(%r14), %al
+; CHECK-BASELINE-NEXT:    movb 8(%r12), %al
 ; CHECK-BASELINE-NEXT:    andb %r11b, %al
 ; CHECK-BASELINE-NEXT:    notb %r11b
 ; CHECK-BASELINE-NEXT:    andb 8(%rdx), %r11b
 ; CHECK-BASELINE-NEXT:    orb %al, %r11b
 ; CHECK-BASELINE-NEXT:    movb 7(%r15), %r10b
-; CHECK-BASELINE-NEXT:    movb 7(%r14), %al
+; CHECK-BASELINE-NEXT:    movb 7(%r12), %al
 ; CHECK-BASELINE-NEXT:    andb %r10b, %al
 ; CHECK-BASELINE-NEXT:    notb %r10b
 ; CHECK-BASELINE-NEXT:    andb 7(%rdx), %r10b
 ; CHECK-BASELINE-NEXT:    orb %al, %r10b
 ; CHECK-BASELINE-NEXT:    movb 6(%r15), %r9b
-; CHECK-BASELINE-NEXT:    movb 6(%r14), %al
+; CHECK-BASELINE-NEXT:    movb 6(%r12), %al
 ; CHECK-BASELINE-NEXT:    andb %r9b, %al
 ; CHECK-BASELINE-NEXT:    notb %r9b
 ; CHECK-BASELINE-NEXT:    andb 6(%rdx), %r9b
 ; CHECK-BASELINE-NEXT:    orb %al, %r9b
 ; CHECK-BASELINE-NEXT:    movb 5(%r15), %r8b
-; CHECK-BASELINE-NEXT:    movb 5(%r14), %al
+; CHECK-BASELINE-NEXT:    movb 5(%r12), %al
 ; CHECK-BASELINE-NEXT:    andb %r8b, %al
 ; CHECK-BASELINE-NEXT:    notb %r8b
 ; CHECK-BASELINE-NEXT:    andb 5(%rdx), %r8b
 ; CHECK-BASELINE-NEXT:    orb %al, %r8b
 ; CHECK-BASELINE-NEXT:    movb 4(%r15), %dil
-; CHECK-BASELINE-NEXT:    movb 4(%r14), %al
+; CHECK-BASELINE-NEXT:    movb 4(%r12), %al
 ; CHECK-BASELINE-NEXT:    andb %dil, %al
 ; CHECK-BASELINE-NEXT:    notb %dil
 ; CHECK-BASELINE-NEXT:    andb 4(%rdx), %dil
 ; CHECK-BASELINE-NEXT:    orb %al, %dil
 ; CHECK-BASELINE-NEXT:    movb 3(%r15), %sil
-; CHECK-BASELINE-NEXT:    movb 3(%r14), %al
+; CHECK-BASELINE-NEXT:    movb 3(%r12), %al
 ; CHECK-BASELINE-NEXT:    andb %sil, %al
 ; CHECK-BASELINE-NEXT:    notb %sil
 ; CHECK-BASELINE-NEXT:    andb 3(%rdx), %sil
 ; CHECK-BASELINE-NEXT:    orb %al, %sil
 ; CHECK-BASELINE-NEXT:    movb 2(%r15), %dl
-; CHECK-BASELINE-NEXT:    movb 2(%r14), %al
+; CHECK-BASELINE-NEXT:    movb 2(%r12), %al
 ; CHECK-BASELINE-NEXT:    andb %dl, %al
 ; CHECK-BASELINE-NEXT:    notb %dl
 ; CHECK-BASELINE-NEXT:    andb 2(%rbx), %dl
 ; CHECK-BASELINE-NEXT:    orb %al, %dl
 ; CHECK-BASELINE-NEXT:    movb 1(%r15), %al
-; CHECK-BASELINE-NEXT:    movb 1(%r14), %cl
+; CHECK-BASELINE-NEXT:    movb 1(%r12), %cl
 ; CHECK-BASELINE-NEXT:    andb %al, %cl
 ; CHECK-BASELINE-NEXT:    notb %al
 ; CHECK-BASELINE-NEXT:    andb 1(%rbx), %al
 ; CHECK-BASELINE-NEXT:    orb %cl, %al
 ; CHECK-BASELINE-NEXT:    movb (%r15), %r15b
-; CHECK-BASELINE-NEXT:    movb (%r14), %r14b
-; CHECK-BASELINE-NEXT:    andb %r15b, %r14b
+; CHECK-BASELINE-NEXT:    movb (%r12), %cl
+; CHECK-BASELINE-NEXT:    andb %r15b, %cl
 ; CHECK-BASELINE-NEXT:    notb %r15b
 ; CHECK-BASELINE-NEXT:    andb (%rbx), %r15b
-; CHECK-BASELINE-NEXT:    orb %r14b, %r15b
+; CHECK-BASELINE-NEXT:    orb %cl, %r15b
 ; CHECK-BASELINE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
 ; CHECK-BASELINE-NEXT:    movb %bl, 31(%rcx)
@@ -1587,7 +1587,7 @@ define <32 x i8> @out_v32i8(<32 x i8> *%px, <32 x i8> *%py, <32 x i8> *%pmask) n
 ; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
 ; CHECK-BASELINE-NEXT:    movb %bl, 12(%rcx)
 ; CHECK-BASELINE-NEXT:    movb %r13b, 11(%rcx)
-; CHECK-BASELINE-NEXT:    movb %r12b, 10(%rcx)
+; CHECK-BASELINE-NEXT:    movb %r14b, 10(%rcx)
 ; CHECK-BASELINE-NEXT:    movb %bpl, 9(%rcx)
 ; CHECK-BASELINE-NEXT:    movb %r11b, 8(%rcx)
 ; CHECK-BASELINE-NEXT:    movb %r10b, 7(%rcx)
@@ -1616,7 +1616,7 @@ define <32 x i8> @out_v32i8(<32 x i8> *%px, <32 x i8> *%py, <32 x i8> *%pmask) n
 ; CHECK-SSE1-NEXT:    pushq %r12
 ; CHECK-SSE1-NEXT:    pushq %rbx
 ; CHECK-SSE1-NEXT:    movq %rcx, %r15
-; CHECK-SSE1-NEXT:    movq %rsi, %r14
+; CHECK-SSE1-NEXT:    movq %rsi, %r12
 ; CHECK-SSE1-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-SSE1-NEXT:    movb 16(%rcx), %al
 ; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
@@ -1628,7 +1628,7 @@ define <32 x i8> @out_v32i8(<32 x i8> *%px, <32 x i8> *%py, <32 x i8> *%pmask) n
 ; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; CHECK-SSE1-NEXT:    movb 20(%rcx), %al
 ; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 21(%rcx), %r12b
+; CHECK-SSE1-NEXT:    movb 21(%rcx), %r14b
 ; CHECK-SSE1-NEXT:    movb 22(%rcx), %r9b
 ; CHECK-SSE1-NEXT:    movb 23(%rcx), %r10b
 ; CHECK-SSE1-NEXT:    movb 24(%rcx), %r11b
@@ -1639,94 +1639,94 @@ define <32 x i8> @out_v32i8(<32 x i8> *%px, <32 x i8> *%py, <32 x i8> *%pmask) n
 ; CHECK-SSE1-NEXT:    movb 29(%rcx), %sil
 ; CHECK-SSE1-NEXT:    movb 30(%rcx), %bl
 ; CHECK-SSE1-NEXT:    movb 31(%rcx), %al
-; CHECK-SSE1-NEXT:    movb 31(%r14), %cl
+; CHECK-SSE1-NEXT:    movb 31(%r12), %cl
 ; CHECK-SSE1-NEXT:    andb %al, %cl
 ; CHECK-SSE1-NEXT:    notb %al
 ; CHECK-SSE1-NEXT:    andb 31(%rdx), %al
 ; CHECK-SSE1-NEXT:    orb %cl, %al
 ; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 30(%r14), %al
+; CHECK-SSE1-NEXT:    movb 30(%r12), %al
 ; CHECK-SSE1-NEXT:    andb %bl, %al
 ; CHECK-SSE1-NEXT:    notb %bl
 ; CHECK-SSE1-NEXT:    andb 30(%rdx), %bl
 ; CHECK-SSE1-NEXT:    orb %al, %bl
 ; CHECK-SSE1-NEXT:    movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 29(%r14), %al
+; CHECK-SSE1-NEXT:    movb 29(%r12), %al
 ; CHECK-SSE1-NEXT:    andb %sil, %al
 ; CHECK-SSE1-NEXT:    notb %sil
 ; CHECK-SSE1-NEXT:    andb 29(%rdx), %sil
 ; CHECK-SSE1-NEXT:    orb %al, %sil
 ; CHECK-SSE1-NEXT:    movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 28(%r14), %al
+; CHECK-SSE1-NEXT:    movb 28(%r12), %al
 ; CHECK-SSE1-NEXT:    andb %dil, %al
 ; CHECK-SSE1-NEXT:    notb %dil
 ; CHECK-SSE1-NEXT:    andb 28(%rdx), %dil
 ; CHECK-SSE1-NEXT:    orb %al, %dil
 ; CHECK-SSE1-NEXT:    movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 27(%r14), %al
+; CHECK-SSE1-NEXT:    movb 27(%r12), %al
 ; CHECK-SSE1-NEXT:    andb %r8b, %al
 ; CHECK-SSE1-NEXT:    notb %r8b
 ; CHECK-SSE1-NEXT:    andb 27(%rdx), %r8b
 ; CHECK-SSE1-NEXT:    orb %al, %r8b
 ; CHECK-SSE1-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 26(%r14), %al
+; CHECK-SSE1-NEXT:    movb 26(%r12), %al
 ; CHECK-SSE1-NEXT:    andb %r13b, %al
 ; CHECK-SSE1-NEXT:    notb %r13b
 ; CHECK-SSE1-NEXT:    andb 26(%rdx), %r13b
 ; CHECK-SSE1-NEXT:    orb %al, %r13b
 ; CHECK-SSE1-NEXT:    movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 25(%r14), %al
+; CHECK-SSE1-NEXT:    movb 25(%r12), %al
 ; CHECK-SSE1-NEXT:    andb %bpl, %al
 ; CHECK-SSE1-NEXT:    notb %bpl
 ; CHECK-SSE1-NEXT:    andb 25(%rdx), %bpl
 ; CHECK-SSE1-NEXT:    orb %al, %bpl
 ; CHECK-SSE1-NEXT:    movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 24(%r14), %al
+; CHECK-SSE1-NEXT:    movb 24(%r12), %al
 ; CHECK-SSE1-NEXT:    andb %r11b, %al
 ; CHECK-SSE1-NEXT:    notb %r11b
 ; CHECK-SSE1-NEXT:    andb 24(%rdx), %r11b
 ; CHECK-SSE1-NEXT:    orb %al, %r11b
 ; CHECK-SSE1-NEXT:    movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 23(%r14), %al
+; CHECK-SSE1-NEXT:    movb 23(%r12), %al
 ; CHECK-SSE1-NEXT:    andb %r10b, %al
 ; CHECK-SSE1-NEXT:    notb %r10b
 ; CHECK-SSE1-NEXT:    andb 23(%rdx), %r10b
 ; CHECK-SSE1-NEXT:    orb %al, %r10b
 ; CHECK-SSE1-NEXT:    movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 22(%r14), %al
+; CHECK-SSE1-NEXT:    movb 22(%r12), %al
 ; CHECK-SSE1-NEXT:    andb %r9b, %al
 ; CHECK-SSE1-NEXT:    notb %r9b
 ; CHECK-SSE1-NEXT:    andb 22(%rdx), %r9b
 ; CHECK-SSE1-NEXT:    orb %al, %r9b
 ; CHECK-SSE1-NEXT:    movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 21(%r14), %al
-; CHECK-SSE1-NEXT:    andb %r12b, %al
-; CHECK-SSE1-NEXT:    notb %r12b
-; CHECK-SSE1-NEXT:    andb 21(%rdx), %r12b
-; CHECK-SSE1-NEXT:    orb %al, %r12b
-; CHECK-SSE1-NEXT:    movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 20(%r14), %al
+; CHECK-SSE1-NEXT:    movb 21(%r12), %al
+; CHECK-SSE1-NEXT:    andb %r14b, %al
+; CHECK-SSE1-NEXT:    notb %r14b
+; CHECK-SSE1-NEXT:    andb 21(%rdx), %r14b
+; CHECK-SSE1-NEXT:    orb %al, %r14b
+; CHECK-SSE1-NEXT:    movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 20(%r12), %al
 ; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
 ; CHECK-SSE1-NEXT:    andb %cl, %al
 ; CHECK-SSE1-NEXT:    notb %cl
 ; CHECK-SSE1-NEXT:    andb 20(%rdx), %cl
 ; CHECK-SSE1-NEXT:    orb %al, %cl
 ; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 19(%r14), %al
+; CHECK-SSE1-NEXT:    movb 19(%r12), %al
 ; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
 ; CHECK-SSE1-NEXT:    andb %cl, %al
 ; CHECK-SSE1-NEXT:    notb %cl
 ; CHECK-SSE1-NEXT:    andb 19(%rdx), %cl
 ; CHECK-SSE1-NEXT:    orb %al, %cl
 ; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 18(%r14), %al
+; CHECK-SSE1-NEXT:    movb 18(%r12), %al
 ; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
 ; CHECK-SSE1-NEXT:    andb %cl, %al
 ; CHECK-SSE1-NEXT:    notb %cl
 ; CHECK-SSE1-NEXT:    andb 18(%rdx), %cl
 ; CHECK-SSE1-NEXT:    orb %al, %cl
 ; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 17(%r14), %al
+; CHECK-SSE1-NEXT:    movb 17(%r12), %al
 ; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
 ; CHECK-SSE1-NEXT:    andb %cl, %al
 ; CHECK-SSE1-NEXT:    notb %cl
@@ -1734,7 +1734,7 @@ define <32 x i8> @out_v32i8(<32 x i8> *%px, <32 x i8> *%py, <32 x i8> *%pmask) n
 ; CHECK-SSE1-NEXT:    andb 17(%rdx), %cl
 ; CHECK-SSE1-NEXT:    orb %al, %cl
 ; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 16(%r14), %al
+; CHECK-SSE1-NEXT:    movb 16(%r12), %al
 ; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
 ; CHECK-SSE1-NEXT:    andb %cl, %al
 ; CHECK-SSE1-NEXT:    notb %cl
@@ -1742,105 +1742,105 @@ define <32 x i8> @out_v32i8(<32 x i8> *%px, <32 x i8> *%py, <32 x i8> *%pmask) n
 ; CHECK-SSE1-NEXT:    orb %al, %cl
 ; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; CHECK-SSE1-NEXT:    movb 15(%r15), %cl
-; CHECK-SSE1-NEXT:    movb 15(%r14), %al
+; CHECK-SSE1-NEXT:    movb 15(%r12), %al
 ; CHECK-SSE1-NEXT:    andb %cl, %al
 ; CHECK-SSE1-NEXT:    notb %cl
 ; CHECK-SSE1-NEXT:    andb 15(%rdx), %cl
 ; CHECK-SSE1-NEXT:    orb %al, %cl
 ; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; CHECK-SSE1-NEXT:    movb 14(%r15), %cl
-; CHECK-SSE1-NEXT:    movb 14(%r14), %al
+; CHECK-SSE1-NEXT:    movb 14(%r12), %al
 ; CHECK-SSE1-NEXT:    andb %cl, %al
 ; CHECK-SSE1-NEXT:    notb %cl
 ; CHECK-SSE1-NEXT:    andb 14(%rdx), %cl
 ; CHECK-SSE1-NEXT:    orb %al, %cl
 ; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; CHECK-SSE1-NEXT:    movb 13(%r15), %cl
-; CHECK-SSE1-NEXT:    movb 13(%r14), %al
+; CHECK-SSE1-NEXT:    movb 13(%r12), %al
 ; CHECK-SSE1-NEXT:    andb %cl, %al
 ; CHECK-SSE1-NEXT:    notb %cl
 ; CHECK-SSE1-NEXT:    andb 13(%rdx), %cl
 ; CHECK-SSE1-NEXT:    orb %al, %cl
 ; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; CHECK-SSE1-NEXT:    movb 12(%r15), %cl
-; CHECK-SSE1-NEXT:    movb 12(%r14), %al
+; CHECK-SSE1-NEXT:    movb 12(%r12), %al
 ; CHECK-SSE1-NEXT:    andb %cl, %al
 ; CHECK-SSE1-NEXT:    notb %cl
 ; CHECK-SSE1-NEXT:    andb 12(%rdx), %cl
 ; CHECK-SSE1-NEXT:    orb %al, %cl
 ; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; CHECK-SSE1-NEXT:    movb 11(%r15), %r13b
-; CHECK-SSE1-NEXT:    movb 11(%r14), %al
+; CHECK-SSE1-NEXT:    movb 11(%r12), %al
 ; CHECK-SSE1-NEXT:    andb %r13b, %al
 ; CHECK-SSE1-NEXT:    notb %r13b
 ; CHECK-SSE1-NEXT:    andb 11(%rdx), %r13b
 ; CHECK-SSE1-NEXT:    orb %al, %r13b
-; CHECK-SSE1-NEXT:    movb 10(%r15), %r12b
-; CHECK-SSE1-NEXT:    movb 10(%r14), %al
-; CHECK-SSE1-NEXT:    andb %r12b, %al
-; CHECK-SSE1-NEXT:    notb %r12b
-; CHECK-SSE1-NEXT:    andb 10(%rdx), %r12b
-; CHECK-SSE1-NEXT:    orb %al, %r12b
+; CHECK-SSE1-NEXT:    movb 10(%r15), %r14b
+; CHECK-SSE1-NEXT:    movb 10(%r12), %al
+; CHECK-SSE1-NEXT:    andb %r14b, %al
+; CHECK-SSE1-NEXT:    notb %r14b
+; CHECK-SSE1-NEXT:    andb 10(%rdx), %r14b
+; CHECK-SSE1-NEXT:    orb %al, %r14b
 ; CHECK-SSE1-NEXT:    movb 9(%r15), %bpl
-; CHECK-SSE1-NEXT:    movb 9(%r14), %al
+; CHECK-SSE1-NEXT:    movb 9(%r12), %al
 ; CHECK-SSE1-NEXT:    andb %bpl, %al
 ; CHECK-SSE1-NEXT:    notb %bpl
 ; CHECK-SSE1-NEXT:    andb 9(%rdx), %bpl
 ; CHECK-SSE1-NEXT:    orb %al, %bpl
 ; CHECK-SSE1-NEXT:    movb 8(%r15), %r11b
-; CHECK-SSE1-NEXT:    movb 8(%r14), %al
+; CHECK-SSE1-NEXT:    movb 8(%r12), %al
 ; CHECK-SSE1-NEXT:    andb %r11b, %al
 ; CHECK-SSE1-NEXT:    notb %r11b
 ; CHECK-SSE1-NEXT:    andb 8(%rdx), %r11b
 ; CHECK-SSE1-NEXT:    orb %al, %r11b
 ; CHECK-SSE1-NEXT:    movb 7(%r15), %r10b
-; CHECK-SSE1-NEXT:    movb 7(%r14), %al
+; CHECK-SSE1-NEXT:    movb 7(%r12), %al
 ; CHECK-SSE1-NEXT:    andb %r10b, %al
 ; CHECK-SSE1-NEXT:    notb %r10b
 ; CHECK-SSE1-NEXT:    andb 7(%rdx), %r10b
 ; CHECK-SSE1-NEXT:    orb %al, %r10b
 ; CHECK-SSE1-NEXT:    movb 6(%r15), %r9b
-; CHECK-SSE1-NEXT:    movb 6(%r14), %al
+; CHECK-SSE1-NEXT:    movb 6(%r12), %al
 ; CHECK-SSE1-NEXT:    andb %r9b, %al
 ; CHECK-SSE1-NEXT:    notb %r9b
 ; CHECK-SSE1-NEXT:    andb 6(%rdx), %r9b
 ; CHECK-SSE1-NEXT:    orb %al, %r9b
 ; CHECK-SSE1-NEXT:    movb 5(%r15), %r8b
-; CHECK-SSE1-NEXT:    movb 5(%r14), %al
+; CHECK-SSE1-NEXT:    movb 5(%r12), %al
 ; CHECK-SSE1-NEXT:    andb %r8b, %al
 ; CHECK-SSE1-NEXT:    notb %r8b
 ; CHECK-SSE1-NEXT:    andb 5(%rdx), %r8b
 ; CHECK-SSE1-NEXT:    orb %al, %r8b
 ; CHECK-SSE1-NEXT:    movb 4(%r15), %dil
-; CHECK-SSE1-NEXT:    movb 4(%r14), %al
+; CHECK-SSE1-NEXT:    movb 4(%r12), %al
 ; CHECK-SSE1-NEXT:    andb %dil, %al
 ; CHECK-SSE1-NEXT:    notb %dil
 ; CHECK-SSE1-NEXT:    andb 4(%rdx), %dil
 ; CHECK-SSE1-NEXT:    orb %al, %dil
 ; CHECK-SSE1-NEXT:    movb 3(%r15), %sil
-; CHECK-SSE1-NEXT:    movb 3(%r14), %al
+; CHECK-SSE1-NEXT:    movb 3(%r12), %al
 ; CHECK-SSE1-NEXT:    andb %sil, %al
 ; CHECK-SSE1-NEXT:    notb %sil
 ; CHECK-SSE1-NEXT:    andb 3(%rdx), %sil
 ; CHECK-SSE1-NEXT:    orb %al, %sil
 ; CHECK-SSE1-NEXT:    movb 2(%r15), %dl
-; CHECK-SSE1-NEXT:    movb 2(%r14), %al
+; CHECK-SSE1-NEXT:    movb 2(%r12), %al
 ; CHECK-SSE1-NEXT:    andb %dl, %al
 ; CHECK-SSE1-NEXT:    notb %dl
 ; CHECK-SSE1-NEXT:    andb 2(%rbx), %dl
 ; CHECK-SSE1-NEXT:    orb %al, %dl
 ; CHECK-SSE1-NEXT:    movb 1(%r15), %al
-; CHECK-SSE1-NEXT:    movb 1(%r14), %cl
+; CHECK-SSE1-NEXT:    movb 1(%r12), %cl
 ; CHECK-SSE1-NEXT:    andb %al, %cl
 ; CHECK-SSE1-NEXT:    notb %al
 ; CHECK-SSE1-NEXT:    andb 1(%rbx), %al
 ; CHECK-SSE1-NEXT:    orb %cl, %al
 ; CHECK-SSE1-NEXT:    movb (%r15), %r15b
-; CHECK-SSE1-NEXT:    movb (%r14), %r14b
-; CHECK-SSE1-NEXT:    andb %r15b, %r14b
+; CHECK-SSE1-NEXT:    movb (%r12), %cl
+; CHECK-SSE1-NEXT:    andb %r15b, %cl
 ; CHECK-SSE1-NEXT:    notb %r15b
 ; CHECK-SSE1-NEXT:    andb (%rbx), %r15b
-; CHECK-SSE1-NEXT:    orb %r14b, %r15b
+; CHECK-SSE1-NEXT:    orb %cl, %r15b
 ; CHECK-SSE1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
 ; CHECK-SSE1-NEXT:    movb %bl, 31(%rcx)
@@ -1883,7 +1883,7 @@ define <32 x i8> @out_v32i8(<32 x i8> *%px, <32 x i8> *%py, <32 x i8> *%pmask) n
 ; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
 ; CHECK-SSE1-NEXT:    movb %bl, 12(%rcx)
 ; CHECK-SSE1-NEXT:    movb %r13b, 11(%rcx)
-; CHECK-SSE1-NEXT:    movb %r12b, 10(%rcx)
+; CHECK-SSE1-NEXT:    movb %r14b, 10(%rcx)
 ; CHECK-SSE1-NEXT:    movb %bpl, 9(%rcx)
 ; CHECK-SSE1-NEXT:    movb %r11b, 8(%rcx)
 ; CHECK-SSE1-NEXT:    movb %r10b, 7(%rcx)
@@ -2751,46 +2751,45 @@ define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
 ; CHECK-BASELINE-NEXT:    pushq %r13
 ; CHECK-BASELINE-NEXT:    pushq %r12
 ; CHECK-BASELINE-NEXT:    pushq %rbx
-; CHECK-BASELINE-NEXT:    movl %ecx, %r10d
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %bpl
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r14b
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r15b
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r12b
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r13b
-; CHECK-BASELINE-NEXT:    xorb %r13b, %sil
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
+; CHECK-BASELINE-NEXT:    xorb %r11b, %sil
 ; CHECK-BASELINE-NEXT:    xorb %r12b, %dl
-; CHECK-BASELINE-NEXT:    xorb %r15b, %r10b
+; CHECK-BASELINE-NEXT:    xorb %r15b, %cl
 ; CHECK-BASELINE-NEXT:    xorb %r14b, %r8b
 ; CHECK-BASELINE-NEXT:    xorb %bpl, %r9b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r13b
+; CHECK-BASELINE-NEXT:    xorb {{[0-9]+}}(%rsp), %r13b
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %bl
 ; CHECK-BASELINE-NEXT:    xorb {{[0-9]+}}(%rsp), %bl
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; CHECK-BASELINE-NEXT:    xorb {{[0-9]+}}(%rsp), %cl
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; CHECK-BASELINE-NEXT:    xorb %r11b, %al
+; CHECK-BASELINE-NEXT:    xorb %r10b, %al
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r9b
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dl
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %sil
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %al
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %bl
-; CHECK-BASELINE-NEXT:    xorb %r13b, %sil
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r13b
+; CHECK-BASELINE-NEXT:    xorb %r11b, %sil
 ; CHECK-BASELINE-NEXT:    xorb %r12b, %dl
-; CHECK-BASELINE-NEXT:    xorb %r15b, %r10b
+; CHECK-BASELINE-NEXT:    xorb %r15b, %cl
 ; CHECK-BASELINE-NEXT:    xorb %r14b, %r8b
 ; CHECK-BASELINE-NEXT:    xorb %bpl, %r9b
+; CHECK-BASELINE-NEXT:    xorb {{[0-9]+}}(%rsp), %r13b
 ; CHECK-BASELINE-NEXT:    xorb {{[0-9]+}}(%rsp), %bl
-; CHECK-BASELINE-NEXT:    xorb {{[0-9]+}}(%rsp), %cl
-; CHECK-BASELINE-NEXT:    xorb %r11b, %al
+; CHECK-BASELINE-NEXT:    xorb %r10b, %al
 ; CHECK-BASELINE-NEXT:    movb %al, 7(%rdi)
-; CHECK-BASELINE-NEXT:    movb %cl, 6(%rdi)
-; CHECK-BASELINE-NEXT:    movb %bl, 5(%rdi)
+; CHECK-BASELINE-NEXT:    movb %bl, 6(%rdi)
+; CHECK-BASELINE-NEXT:    movb %r13b, 5(%rdi)
 ; CHECK-BASELINE-NEXT:    movb %r9b, 4(%rdi)
 ; CHECK-BASELINE-NEXT:    movb %r8b, 3(%rdi)
-; CHECK-BASELINE-NEXT:    movb %r10b, 2(%rdi)
+; CHECK-BASELINE-NEXT:    movb %cl, 2(%rdi)
 ; CHECK-BASELINE-NEXT:    movb %dl, 1(%rdi)
 ; CHECK-BASELINE-NEXT:    movb %sil, (%rdi)
 ; CHECK-BASELINE-NEXT:    movq %rdi, %rax
@@ -2810,46 +2809,45 @@ define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
 ; CHECK-SSE1-NEXT:    pushq %r13
 ; CHECK-SSE1-NEXT:    pushq %r12
 ; CHECK-SSE1-NEXT:    pushq %rbx
-; CHECK-SSE1-NEXT:    movl %ecx, %r10d
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %bpl
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r14b
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r15b
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r12b
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r13b
-; CHECK-SSE1-NEXT:    xorb %r13b, %sil
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
+; CHECK-SSE1-NEXT:    xorb %r11b, %sil
 ; CHECK-SSE1-NEXT:    xorb %r12b, %dl
-; CHECK-SSE1-NEXT:    xorb %r15b, %r10b
+; CHECK-SSE1-NEXT:    xorb %r15b, %cl
 ; CHECK-SSE1-NEXT:    xorb %r14b, %r8b
 ; CHECK-SSE1-NEXT:    xorb %bpl, %r9b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r13b
+; CHECK-SSE1-NEXT:    xorb {{[0-9]+}}(%rsp), %r13b
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %bl
 ; CHECK-SSE1-NEXT:    xorb {{[0-9]+}}(%rsp), %bl
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; CHECK-SSE1-NEXT:    xorb {{[0-9]+}}(%rsp), %cl
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; CHECK-SSE1-NEXT:    xorb %r11b, %al
+; CHECK-SSE1-NEXT:    xorb %r10b, %al
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r9b
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dl
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %sil
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %al
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %bl
-; CHECK-SSE1-NEXT:    xorb %r13b, %sil
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r13b
+; CHECK-SSE1-NEXT:    xorb %r11b, %sil
 ; CHECK-SSE1-NEXT:    xorb %r12b, %dl
-; CHECK-SSE1-NEXT:    xorb %r15b, %r10b
+; CHECK-SSE1-NEXT:    xorb %r15b, %cl
 ; CHECK-SSE1-NEXT:    xorb %r14b, %r8b
 ; CHECK-SSE1-NEXT:    xorb %bpl, %r9b
+; CHECK-SSE1-NEXT:    xorb {{[0-9]+}}(%rsp), %r13b
 ; CHECK-SSE1-NEXT:    xorb {{[0-9]+}}(%rsp), %bl
-; CHECK-SSE1-NEXT:    xorb {{[0-9]+}}(%rsp), %cl
-; CHECK-SSE1-NEXT:    xorb %r11b, %al
+; CHECK-SSE1-NEXT:    xorb %r10b, %al
 ; CHECK-SSE1-NEXT:    movb %al, 7(%rdi)
-; CHECK-SSE1-NEXT:    movb %cl, 6(%rdi)
-; CHECK-SSE1-NEXT:    movb %bl, 5(%rdi)
+; CHECK-SSE1-NEXT:    movb %bl, 6(%rdi)
+; CHECK-SSE1-NEXT:    movb %r13b, 5(%rdi)
 ; CHECK-SSE1-NEXT:    movb %r9b, 4(%rdi)
 ; CHECK-SSE1-NEXT:    movb %r8b, 3(%rdi)
-; CHECK-SSE1-NEXT:    movb %r10b, 2(%rdi)
+; CHECK-SSE1-NEXT:    movb %cl, 2(%rdi)
 ; CHECK-SSE1-NEXT:    movb %dl, 1(%rdi)
 ; CHECK-SSE1-NEXT:    movb %sil, (%rdi)
 ; CHECK-SSE1-NEXT:    movq %rdi, %rax
@@ -4387,7 +4385,6 @@ define <8 x i32> @in_v8i32(<8 x i32> *%px, <8 x i32> *%py, <8 x i32> *%pmask) no
 ; CHECK-BASELINE-NEXT:    movl 28(%rdx), %r15d
 ; CHECK-BASELINE-NEXT:    movl 24(%rdx), %r14d
 ; CHECK-BASELINE-NEXT:    movl 20(%rdx), %r10d
-; CHECK-BASELINE-NEXT:    movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-BASELINE-NEXT:    movl 16(%rdx), %eax
 ; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-BASELINE-NEXT:    movl 12(%rdx), %ebp
@@ -4425,7 +4422,7 @@ define <8 x i32> @in_v8i32(<8 x i32> *%px, <8 x i32> *%py, <8 x i32> *%pmask) no
 ; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Folded Reload
 ; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 4-byte Folded Reload
-; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload
+; CHECK-BASELINE-NEXT:    xorl %r10d, %edx
 ; CHECK-BASELINE-NEXT:    xorl %r14d, %eax
 ; CHECK-BASELINE-NEXT:    xorl %r15d, %esi
 ; CHECK-BASELINE-NEXT:    movl %esi, 28(%rdi)
@@ -4456,7 +4453,6 @@ define <8 x i32> @in_v8i32(<8 x i32> *%px, <8 x i32> *%py, <8 x i32> *%pmask) no
 ; CHECK-SSE1-NEXT:    movl 28(%rdx), %r15d
 ; CHECK-SSE1-NEXT:    movl 24(%rdx), %r14d
 ; CHECK-SSE1-NEXT:    movl 20(%rdx), %r10d
-; CHECK-SSE1-NEXT:    movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-SSE1-NEXT:    movl 16(%rdx), %eax
 ; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-SSE1-NEXT:    movl 12(%rdx), %ebp
@@ -4494,7 +4490,7 @@ define <8 x i32> @in_v8i32(<8 x i32> *%px, <8 x i32> *%py, <8 x i32> *%pmask) no
 ; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Folded Reload
 ; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 4-byte Folded Reload
-; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload
+; CHECK-SSE1-NEXT:    xorl %r10d, %edx
 ; CHECK-SSE1-NEXT:    xorl %r14d, %eax
 ; CHECK-SSE1-NEXT:    xorl %r15d, %esi
 ; CHECK-SSE1-NEXT:    movl %esi, 28(%rdi)

diff --git a/llvm/test/CodeGen/X86/ushl_sat.ll b/llvm/test/CodeGen/X86/ushl_sat.ll
index f298958c5be39..a3c4dd6e46ad9 100644
--- a/llvm/test/CodeGen/X86/ushl_sat.ll
+++ b/llvm/test/CodeGen/X86/ushl_sat.ll
@@ -202,30 +202,30 @@ define i64 @func5(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    shldl %cl, %edi, %edx
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    shll %cl, %edi
+; X86-NEXT:    shldl %cl, %esi, %edx
 ; X86-NEXT:    xorl %ebx, %ebx
 ; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    cmovnel %esi, %edx
-; X86-NEXT:    cmovnel %ebx, %esi
+; X86-NEXT:    cmovnel %edi, %edx
+; X86-NEXT:    cmovnel %ebx, %edi
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    shrl %cl, %ebp
 ; X86-NEXT:    testb $32, %cl
 ; X86-NEXT:    cmovel %ebp, %ebx
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    shrdl %cl, %edx, %eax
 ; X86-NEXT:    testb $32, %cl
 ; X86-NEXT:    cmovnel %ebp, %eax
-; X86-NEXT:    xorl %edi, %eax
+; X86-NEXT:    xorl %esi, %eax
 ; X86-NEXT:    xorl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    orl %eax, %ebx
 ; X86-NEXT:    movl $-1, %eax
-; X86-NEXT:    cmovnel %eax, %esi
+; X86-NEXT:    cmovnel %eax, %edi
 ; X86-NEXT:    cmovnel %eax, %edx
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx

diff --git a/llvm/test/CodeGen/X86/ushl_sat_vec.ll b/llvm/test/CodeGen/X86/ushl_sat_vec.ll
index 821fc9a9c25e3..5904892e7f240 100644
--- a/llvm/test/CodeGen/X86/ushl_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/ushl_sat_vec.ll
@@ -66,45 +66,46 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ah
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %dl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    shll %cl, %ebp
-; X86-NEXT:    movl %ebp, %esi
-; X86-NEXT:    shrl %cl, %esi
-; X86-NEXT:    cmpl %esi, %edx
-; X86-NEXT:    movl $-1, %edx
-; X86-NEXT:    cmovnel %edx, %ebp
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    movb %ah, %cl
-; X86-NEXT:    shll %cl, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    shrl %cl, %esi
-; X86-NEXT:    cmpl %esi, %edi
-; X86-NEXT:    cmovnel %edx, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movb %ch, %cl
 ; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    movl %esi, %ebp
+; X86-NEXT:    shrl %cl, %ebp
+; X86-NEXT:    cmpl %ebp, %ebx
+; X86-NEXT:    movl $-1, %ebx
+; X86-NEXT:    cmovnel %ebx, %esi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movb %dl, %cl
+; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    shrl %cl, %ebp
+; X86-NEXT:    cmpl %ebp, %edi
+; X86-NEXT:    cmovnel %ebx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %edx
+; X86-NEXT:    movb %ch, %cl
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    shrl %cl, %edi
+; X86-NEXT:    cmpl %edi, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    cmovnel %ebx, %edx
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    shll %cl, %ebp
+; X86-NEXT:    movl %ebp, %edi
 ; X86-NEXT:    shrl %cl, %edi
 ; X86-NEXT:    cmpl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmovnel %edx, %esi
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    shll %cl, %edi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    shrl %cl, %eax
-; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    cmovnel %edx, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
-; X86-NEXT:    movl %ebx, 4(%eax)
-; X86-NEXT:    movl %ebp, (%eax)
+; X86-NEXT:    cmovnel %ebx, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ebp, 12(%ecx)
+; X86-NEXT:    movl %edx, 8(%ecx)
+; X86-NEXT:    movl %eax, 4(%ecx)
+; X86-NEXT:    movl %esi, (%ecx)
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx

diff --git a/llvm/test/CodeGen/X86/usub_sat.ll b/llvm/test/CodeGen/X86/usub_sat.ll
index a597b3b1ac009..8ac20843259a2 100644
--- a/llvm/test/CodeGen/X86/usub_sat.ll
+++ b/llvm/test/CodeGen/X86/usub_sat.ll
@@ -124,23 +124,23 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmovbl %ebx, %edi
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmovbl %ebx, %esi
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    cmovbl %ebx, %edx
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    cmovbl %ebx, %ecx
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl %edx, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edi, (%eax)
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmovbl %ebx, %edx
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    cmovbl %ebx, %esi
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    cmovbl %ebx, %edi
+; X86-NEXT:    movl %edi, 12(%eax)
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx

diff --git a/llvm/test/CodeGen/X86/vec-strict-cmp-128.ll b/llvm/test/CodeGen/X86/vec-strict-cmp-128.ll
index 85b8930f2acaa..3b4777664b666 100644
--- a/llvm/test/CodeGen/X86/vec-strict-cmp-128.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-cmp-128.ll
@@ -118,30 +118,30 @@ define <4 x i32> @test_v4f32_ogt_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 ; SSE-32-NEXT:    movl $-1, %ecx
 ; SSE-32-NEXT:    movl $0, %edx
 ; SSE-32-NEXT:    cmoval %ecx, %edx
-; SSE-32-NEXT:    movd %edx, %xmm4
-; SSE-32-NEXT:    movaps %xmm3, %xmm5
-; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1]
+; SSE-32-NEXT:    movd %edx, %xmm5
+; SSE-32-NEXT:    movaps %xmm3, %xmm4
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
 ; SSE-32-NEXT:    movaps %xmm2, %xmm6
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1]
-; SSE-32-NEXT:    ucomiss %xmm5, %xmm6
+; SSE-32-NEXT:    ucomiss %xmm4, %xmm6
 ; SSE-32-NEXT:    movl $0, %edx
 ; SSE-32-NEXT:    cmoval %ecx, %edx
-; SSE-32-NEXT:    movd %edx, %xmm5
-; SSE-32-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-32-NEXT:    movd %edx, %xmm4
+; SSE-32-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
 ; SSE-32-NEXT:    ucomiss %xmm3, %xmm2
 ; SSE-32-NEXT:    movl $0, %edx
 ; SSE-32-NEXT:    cmoval %ecx, %edx
-; SSE-32-NEXT:    movd %edx, %xmm4
+; SSE-32-NEXT:    movd %edx, %xmm5
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1,1,1]
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
 ; SSE-32-NEXT:    ucomiss %xmm3, %xmm2
 ; SSE-32-NEXT:    cmoval %ecx, %eax
 ; SSE-32-NEXT:    movd %eax, %xmm2
-; SSE-32-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; SSE-32-NEXT:    pand %xmm4, %xmm0
-; SSE-32-NEXT:    pandn %xmm1, %xmm4
-; SSE-32-NEXT:    por %xmm4, %xmm0
+; SSE-32-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
+; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
+; SSE-32-NEXT:    pand %xmm5, %xmm0
+; SSE-32-NEXT:    pandn %xmm1, %xmm5
+; SSE-32-NEXT:    por %xmm5, %xmm0
 ; SSE-32-NEXT:    movl %ebp, %esp
 ; SSE-32-NEXT:    popl %ebp
 ; SSE-32-NEXT:    retl
@@ -272,30 +272,30 @@ define <4 x i32> @test_v4f32_oge_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 ; SSE-32-NEXT:    movl $-1, %ecx
 ; SSE-32-NEXT:    movl $0, %edx
 ; SSE-32-NEXT:    cmovael %ecx, %edx
-; SSE-32-NEXT:    movd %edx, %xmm4
-; SSE-32-NEXT:    movaps %xmm3, %xmm5
-; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1]
+; SSE-32-NEXT:    movd %edx, %xmm5
+; SSE-32-NEXT:    movaps %xmm3, %xmm4
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
 ; SSE-32-NEXT:    movaps %xmm2, %xmm6
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1]
-; SSE-32-NEXT:    ucomiss %xmm5, %xmm6
+; SSE-32-NEXT:    ucomiss %xmm4, %xmm6
 ; SSE-32-NEXT:    movl $0, %edx
 ; SSE-32-NEXT:    cmovael %ecx, %edx
-; SSE-32-NEXT:    movd %edx, %xmm5
-; SSE-32-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-32-NEXT:    movd %edx, %xmm4
+; SSE-32-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
 ; SSE-32-NEXT:    ucomiss %xmm3, %xmm2
 ; SSE-32-NEXT:    movl $0, %edx
 ; SSE-32-NEXT:    cmovael %ecx, %edx
-; SSE-32-NEXT:    movd %edx, %xmm4
+; SSE-32-NEXT:    movd %edx, %xmm5
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1,1,1]
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
 ; SSE-32-NEXT:    ucomiss %xmm3, %xmm2
 ; SSE-32-NEXT:    cmovael %ecx, %eax
 ; SSE-32-NEXT:    movd %eax, %xmm2
-; SSE-32-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; SSE-32-NEXT:    pand %xmm4, %xmm0
-; SSE-32-NEXT:    pandn %xmm1, %xmm4
-; SSE-32-NEXT:    por %xmm4, %xmm0
+; SSE-32-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
+; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
+; SSE-32-NEXT:    pand %xmm5, %xmm0
+; SSE-32-NEXT:    pandn %xmm1, %xmm5
+; SSE-32-NEXT:    por %xmm5, %xmm0
 ; SSE-32-NEXT:    movl %ebp, %esp
 ; SSE-32-NEXT:    popl %ebp
 ; SSE-32-NEXT:    retl
@@ -426,30 +426,30 @@ define <4 x i32> @test_v4f32_olt_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 ; SSE-32-NEXT:    movl $-1, %ecx
 ; SSE-32-NEXT:    movl $0, %edx
 ; SSE-32-NEXT:    cmoval %ecx, %edx
-; SSE-32-NEXT:    movd %edx, %xmm4
-; SSE-32-NEXT:    movaps %xmm2, %xmm5
-; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1]
+; SSE-32-NEXT:    movd %edx, %xmm5
+; SSE-32-NEXT:    movaps %xmm2, %xmm4
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
 ; SSE-32-NEXT:    movaps %xmm3, %xmm6
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1]
-; SSE-32-NEXT:    ucomiss %xmm5, %xmm6
+; SSE-32-NEXT:    ucomiss %xmm4, %xmm6
 ; SSE-32-NEXT:    movl $0, %edx
 ; SSE-32-NEXT:    cmoval %ecx, %edx
-; SSE-32-NEXT:    movd %edx, %xmm5
-; SSE-32-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-32-NEXT:    movd %edx, %xmm4
+; SSE-32-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
 ; SSE-32-NEXT:    ucomiss %xmm2, %xmm3
 ; SSE-32-NEXT:    movl $0, %edx
 ; SSE-32-NEXT:    cmoval %ecx, %edx
-; SSE-32-NEXT:    movd %edx, %xmm4
+; SSE-32-NEXT:    movd %edx, %xmm5
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1,1,1]
 ; SSE-32-NEXT:    ucomiss %xmm2, %xmm3
 ; SSE-32-NEXT:    cmoval %ecx, %eax
 ; SSE-32-NEXT:    movd %eax, %xmm2
-; SSE-32-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; SSE-32-NEXT:    pand %xmm4, %xmm0
-; SSE-32-NEXT:    pandn %xmm1, %xmm4
-; SSE-32-NEXT:    por %xmm4, %xmm0
+; SSE-32-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
+; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
+; SSE-32-NEXT:    pand %xmm5, %xmm0
+; SSE-32-NEXT:    pandn %xmm1, %xmm5
+; SSE-32-NEXT:    por %xmm5, %xmm0
 ; SSE-32-NEXT:    movl %ebp, %esp
 ; SSE-32-NEXT:    popl %ebp
 ; SSE-32-NEXT:    retl
@@ -578,30 +578,30 @@ define <4 x i32> @test_v4f32_ole_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 ; SSE-32-NEXT:    movl $-1, %ecx
 ; SSE-32-NEXT:    movl $0, %edx
 ; SSE-32-NEXT:    cmovael %ecx, %edx
-; SSE-32-NEXT:    movd %edx, %xmm4
-; SSE-32-NEXT:    movaps %xmm2, %xmm5
-; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1]
+; SSE-32-NEXT:    movd %edx, %xmm5
+; SSE-32-NEXT:    movaps %xmm2, %xmm4
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
 ; SSE-32-NEXT:    movaps %xmm3, %xmm6
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1]
-; SSE-32-NEXT:    ucomiss %xmm5, %xmm6
+; SSE-32-NEXT:    ucomiss %xmm4, %xmm6
 ; SSE-32-NEXT:    movl $0, %edx
 ; SSE-32-NEXT:    cmovael %ecx, %edx
-; SSE-32-NEXT:    movd %edx, %xmm5
-; SSE-32-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-32-NEXT:    movd %edx, %xmm4
+; SSE-32-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
 ; SSE-32-NEXT:    ucomiss %xmm2, %xmm3
 ; SSE-32-NEXT:    movl $0, %edx
 ; SSE-32-NEXT:    cmovael %ecx, %edx
-; SSE-32-NEXT:    movd %edx, %xmm4
+; SSE-32-NEXT:    movd %edx, %xmm5
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1,1,1]
 ; SSE-32-NEXT:    ucomiss %xmm2, %xmm3
 ; SSE-32-NEXT:    cmovael %ecx, %eax
 ; SSE-32-NEXT:    movd %eax, %xmm2
-; SSE-32-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; SSE-32-NEXT:    pand %xmm4, %xmm0
-; SSE-32-NEXT:    pandn %xmm1, %xmm4
-; SSE-32-NEXT:    por %xmm4, %xmm0
+; SSE-32-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
+; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
+; SSE-32-NEXT:    pand %xmm5, %xmm0
+; SSE-32-NEXT:    pandn %xmm1, %xmm5
+; SSE-32-NEXT:    por %xmm5, %xmm0
 ; SSE-32-NEXT:    movl %ebp, %esp
 ; SSE-32-NEXT:    popl %ebp
 ; SSE-32-NEXT:    retl
@@ -1023,30 +1023,30 @@ define <4 x i32> @test_v4f32_ugt_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 ; SSE-32-NEXT:    movl $-1, %ecx
 ; SSE-32-NEXT:    movl $0, %edx
 ; SSE-32-NEXT:    cmovbl %ecx, %edx
-; SSE-32-NEXT:    movd %edx, %xmm4
-; SSE-32-NEXT:    movaps %xmm2, %xmm5
-; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1]
+; SSE-32-NEXT:    movd %edx, %xmm5
+; SSE-32-NEXT:    movaps %xmm2, %xmm4
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
 ; SSE-32-NEXT:    movaps %xmm3, %xmm6
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1]
-; SSE-32-NEXT:    ucomiss %xmm5, %xmm6
+; SSE-32-NEXT:    ucomiss %xmm4, %xmm6
 ; SSE-32-NEXT:    movl $0, %edx
 ; SSE-32-NEXT:    cmovbl %ecx, %edx
-; SSE-32-NEXT:    movd %edx, %xmm5
-; SSE-32-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-32-NEXT:    movd %edx, %xmm4
+; SSE-32-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
 ; SSE-32-NEXT:    ucomiss %xmm2, %xmm3
 ; SSE-32-NEXT:    movl $0, %edx
 ; SSE-32-NEXT:    cmovbl %ecx, %edx
-; SSE-32-NEXT:    movd %edx, %xmm4
+; SSE-32-NEXT:    movd %edx, %xmm5
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1,1,1]
 ; SSE-32-NEXT:    ucomiss %xmm2, %xmm3
 ; SSE-32-NEXT:    cmovbl %ecx, %eax
 ; SSE-32-NEXT:    movd %eax, %xmm2
-; SSE-32-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; SSE-32-NEXT:    pand %xmm4, %xmm0
-; SSE-32-NEXT:    pandn %xmm1, %xmm4
-; SSE-32-NEXT:    por %xmm4, %xmm0
+; SSE-32-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
+; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
+; SSE-32-NEXT:    pand %xmm5, %xmm0
+; SSE-32-NEXT:    pandn %xmm1, %xmm5
+; SSE-32-NEXT:    por %xmm5, %xmm0
 ; SSE-32-NEXT:    movl %ebp, %esp
 ; SSE-32-NEXT:    popl %ebp
 ; SSE-32-NEXT:    retl
@@ -1175,30 +1175,30 @@ define <4 x i32> @test_v4f32_uge_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 ; SSE-32-NEXT:    movl $-1, %ecx
 ; SSE-32-NEXT:    movl $0, %edx
 ; SSE-32-NEXT:    cmovbel %ecx, %edx
-; SSE-32-NEXT:    movd %edx, %xmm4
-; SSE-32-NEXT:    movaps %xmm2, %xmm5
-; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1]
+; SSE-32-NEXT:    movd %edx, %xmm5
+; SSE-32-NEXT:    movaps %xmm2, %xmm4
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
 ; SSE-32-NEXT:    movaps %xmm3, %xmm6
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1]
-; SSE-32-NEXT:    ucomiss %xmm5, %xmm6
+; SSE-32-NEXT:    ucomiss %xmm4, %xmm6
 ; SSE-32-NEXT:    movl $0, %edx
 ; SSE-32-NEXT:    cmovbel %ecx, %edx
-; SSE-32-NEXT:    movd %edx, %xmm5
-; SSE-32-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-32-NEXT:    movd %edx, %xmm4
+; SSE-32-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
 ; SSE-32-NEXT:    ucomiss %xmm2, %xmm3
 ; SSE-32-NEXT:    movl $0, %edx
 ; SSE-32-NEXT:    cmovbel %ecx, %edx
-; SSE-32-NEXT:    movd %edx, %xmm4
+; SSE-32-NEXT:    movd %edx, %xmm5
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1,1,1]
 ; SSE-32-NEXT:    ucomiss %xmm2, %xmm3
 ; SSE-32-NEXT:    cmovbel %ecx, %eax
 ; SSE-32-NEXT:    movd %eax, %xmm2
-; SSE-32-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; SSE-32-NEXT:    pand %xmm4, %xmm0
-; SSE-32-NEXT:    pandn %xmm1, %xmm4
-; SSE-32-NEXT:    por %xmm4, %xmm0
+; SSE-32-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
+; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
+; SSE-32-NEXT:    pand %xmm5, %xmm0
+; SSE-32-NEXT:    pandn %xmm1, %xmm5
+; SSE-32-NEXT:    por %xmm5, %xmm0
 ; SSE-32-NEXT:    movl %ebp, %esp
 ; SSE-32-NEXT:    popl %ebp
 ; SSE-32-NEXT:    retl
@@ -1327,30 +1327,30 @@ define <4 x i32> @test_v4f32_ult_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 ; SSE-32-NEXT:    movl $-1, %ecx
 ; SSE-32-NEXT:    movl $0, %edx
 ; SSE-32-NEXT:    cmovbl %ecx, %edx
-; SSE-32-NEXT:    movd %edx, %xmm4
-; SSE-32-NEXT:    movaps %xmm3, %xmm5
-; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1]
+; SSE-32-NEXT:    movd %edx, %xmm5
+; SSE-32-NEXT:    movaps %xmm3, %xmm4
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
 ; SSE-32-NEXT:    movaps %xmm2, %xmm6
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1]
-; SSE-32-NEXT:    ucomiss %xmm5, %xmm6
+; SSE-32-NEXT:    ucomiss %xmm4, %xmm6
 ; SSE-32-NEXT:    movl $0, %edx
 ; SSE-32-NEXT:    cmovbl %ecx, %edx
-; SSE-32-NEXT:    movd %edx, %xmm5
-; SSE-32-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-32-NEXT:    movd %edx, %xmm4
+; SSE-32-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
 ; SSE-32-NEXT:    ucomiss %xmm3, %xmm2
 ; SSE-32-NEXT:    movl $0, %edx
 ; SSE-32-NEXT:    cmovbl %ecx, %edx
-; SSE-32-NEXT:    movd %edx, %xmm4
+; SSE-32-NEXT:    movd %edx, %xmm5
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1,1,1]
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
 ; SSE-32-NEXT:    ucomiss %xmm3, %xmm2
 ; SSE-32-NEXT:    cmovbl %ecx, %eax
 ; SSE-32-NEXT:    movd %eax, %xmm2
-; SSE-32-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; SSE-32-NEXT:    pand %xmm4, %xmm0
-; SSE-32-NEXT:    pandn %xmm1, %xmm4
-; SSE-32-NEXT:    por %xmm4, %xmm0
+; SSE-32-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
+; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
+; SSE-32-NEXT:    pand %xmm5, %xmm0
+; SSE-32-NEXT:    pandn %xmm1, %xmm5
+; SSE-32-NEXT:    por %xmm5, %xmm0
 ; SSE-32-NEXT:    movl %ebp, %esp
 ; SSE-32-NEXT:    popl %ebp
 ; SSE-32-NEXT:    retl
@@ -1481,30 +1481,30 @@ define <4 x i32> @test_v4f32_ule_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 ; SSE-32-NEXT:    movl $-1, %ecx
 ; SSE-32-NEXT:    movl $0, %edx
 ; SSE-32-NEXT:    cmovbel %ecx, %edx
-; SSE-32-NEXT:    movd %edx, %xmm4
-; SSE-32-NEXT:    movaps %xmm3, %xmm5
-; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1]
+; SSE-32-NEXT:    movd %edx, %xmm5
+; SSE-32-NEXT:    movaps %xmm3, %xmm4
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
 ; SSE-32-NEXT:    movaps %xmm2, %xmm6
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1]
-; SSE-32-NEXT:    ucomiss %xmm5, %xmm6
+; SSE-32-NEXT:    ucomiss %xmm4, %xmm6
 ; SSE-32-NEXT:    movl $0, %edx
 ; SSE-32-NEXT:    cmovbel %ecx, %edx
-; SSE-32-NEXT:    movd %edx, %xmm5
-; SSE-32-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-32-NEXT:    movd %edx, %xmm4
+; SSE-32-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
 ; SSE-32-NEXT:    ucomiss %xmm3, %xmm2
 ; SSE-32-NEXT:    movl $0, %edx
 ; SSE-32-NEXT:    cmovbel %ecx, %edx
-; SSE-32-NEXT:    movd %edx, %xmm4
+; SSE-32-NEXT:    movd %edx, %xmm5
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1,1,1]
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
 ; SSE-32-NEXT:    ucomiss %xmm3, %xmm2
 ; SSE-32-NEXT:    cmovbel %ecx, %eax
 ; SSE-32-NEXT:    movd %eax, %xmm2
-; SSE-32-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; SSE-32-NEXT:    pand %xmm4, %xmm0
-; SSE-32-NEXT:    pandn %xmm1, %xmm4
-; SSE-32-NEXT:    por %xmm4, %xmm0
+; SSE-32-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
+; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
+; SSE-32-NEXT:    pand %xmm5, %xmm0
+; SSE-32-NEXT:    pandn %xmm1, %xmm5
+; SSE-32-NEXT:    por %xmm5, %xmm0
 ; SSE-32-NEXT:    movl %ebp, %esp
 ; SSE-32-NEXT:    popl %ebp
 ; SSE-32-NEXT:    retl
@@ -1904,24 +1904,24 @@ define <2 x i64> @test_v2f64_ogt_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 ; SSE-32-NEXT:    movl %esp, %ebp
 ; SSE-32-NEXT:    andl $-16, %esp
 ; SSE-32-NEXT:    subl $16, %esp
-; SSE-32-NEXT:    movapd 8(%ebp), %xmm3
+; SSE-32-NEXT:    movapd 8(%ebp), %xmm4
 ; SSE-32-NEXT:    xorl %eax, %eax
-; SSE-32-NEXT:    ucomisd %xmm3, %xmm2
+; SSE-32-NEXT:    ucomisd %xmm4, %xmm2
 ; SSE-32-NEXT:    movl $-1, %ecx
 ; SSE-32-NEXT:    movl $0, %edx
 ; SSE-32-NEXT:    cmoval %ecx, %edx
-; SSE-32-NEXT:    movd %edx, %xmm4
-; SSE-32-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1]
-; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
+; SSE-32-NEXT:    movd %edx, %xmm3
+; SSE-32-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1]
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; SSE-32-NEXT:    ucomisd %xmm3, %xmm2
+; SSE-32-NEXT:    ucomisd %xmm4, %xmm2
 ; SSE-32-NEXT:    cmoval %ecx, %eax
 ; SSE-32-NEXT:    movd %eax, %xmm2
 ; SSE-32-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
-; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
-; SSE-32-NEXT:    pand %xmm4, %xmm0
-; SSE-32-NEXT:    pandn %xmm1, %xmm4
-; SSE-32-NEXT:    por %xmm4, %xmm0
+; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; SSE-32-NEXT:    pand %xmm3, %xmm0
+; SSE-32-NEXT:    pandn %xmm1, %xmm3
+; SSE-32-NEXT:    por %xmm3, %xmm0
 ; SSE-32-NEXT:    movl %ebp, %esp
 ; SSE-32-NEXT:    popl %ebp
 ; SSE-32-NEXT:    retl
@@ -2024,24 +2024,24 @@ define <2 x i64> @test_v2f64_oge_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 ; SSE-32-NEXT:    movl %esp, %ebp
 ; SSE-32-NEXT:    andl $-16, %esp
 ; SSE-32-NEXT:    subl $16, %esp
-; SSE-32-NEXT:    movapd 8(%ebp), %xmm3
+; SSE-32-NEXT:    movapd 8(%ebp), %xmm4
 ; SSE-32-NEXT:    xorl %eax, %eax
-; SSE-32-NEXT:    ucomisd %xmm3, %xmm2
+; SSE-32-NEXT:    ucomisd %xmm4, %xmm2
 ; SSE-32-NEXT:    movl $-1, %ecx
 ; SSE-32-NEXT:    movl $0, %edx
 ; SSE-32-NEXT:    cmovael %ecx, %edx
-; SSE-32-NEXT:    movd %edx, %xmm4
-; SSE-32-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1]
-; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
+; SSE-32-NEXT:    movd %edx, %xmm3
+; SSE-32-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1]
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; SSE-32-NEXT:    ucomisd %xmm3, %xmm2
+; SSE-32-NEXT:    ucomisd %xmm4, %xmm2
 ; SSE-32-NEXT:    cmovael %ecx, %eax
 ; SSE-32-NEXT:    movd %eax, %xmm2
 ; SSE-32-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
-; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
-; SSE-32-NEXT:    pand %xmm4, %xmm0
-; SSE-32-NEXT:    pandn %xmm1, %xmm4
-; SSE-32-NEXT:    por %xmm4, %xmm0
+; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; SSE-32-NEXT:    pand %xmm3, %xmm0
+; SSE-32-NEXT:    pandn %xmm1, %xmm3
+; SSE-32-NEXT:    por %xmm3, %xmm0
 ; SSE-32-NEXT:    movl %ebp, %esp
 ; SSE-32-NEXT:    popl %ebp
 ; SSE-32-NEXT:    retl
@@ -2144,24 +2144,24 @@ define <2 x i64> @test_v2f64_olt_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 ; SSE-32-NEXT:    movl %esp, %ebp
 ; SSE-32-NEXT:    andl $-16, %esp
 ; SSE-32-NEXT:    subl $16, %esp
-; SSE-32-NEXT:    movapd 8(%ebp), %xmm3
+; SSE-32-NEXT:    movapd 8(%ebp), %xmm4
 ; SSE-32-NEXT:    xorl %eax, %eax
-; SSE-32-NEXT:    ucomisd %xmm2, %xmm3
+; SSE-32-NEXT:    ucomisd %xmm2, %xmm4
 ; SSE-32-NEXT:    movl $-1, %ecx
 ; SSE-32-NEXT:    movl $0, %edx
 ; SSE-32-NEXT:    cmoval %ecx, %edx
-; SSE-32-NEXT:    movd %edx, %xmm4
-; SSE-32-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1]
+; SSE-32-NEXT:    movd %edx, %xmm3
+; SSE-32-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1]
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
-; SSE-32-NEXT:    ucomisd %xmm2, %xmm3
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
+; SSE-32-NEXT:    ucomisd %xmm2, %xmm4
 ; SSE-32-NEXT:    cmoval %ecx, %eax
 ; SSE-32-NEXT:    movd %eax, %xmm2
 ; SSE-32-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
-; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
-; SSE-32-NEXT:    pand %xmm4, %xmm0
-; SSE-32-NEXT:    pandn %xmm1, %xmm4
-; SSE-32-NEXT:    por %xmm4, %xmm0
+; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; SSE-32-NEXT:    pand %xmm3, %xmm0
+; SSE-32-NEXT:    pandn %xmm1, %xmm3
+; SSE-32-NEXT:    por %xmm3, %xmm0
 ; SSE-32-NEXT:    movl %ebp, %esp
 ; SSE-32-NEXT:    popl %ebp
 ; SSE-32-NEXT:    retl
@@ -2262,24 +2262,24 @@ define <2 x i64> @test_v2f64_ole_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 ; SSE-32-NEXT:    movl %esp, %ebp
 ; SSE-32-NEXT:    andl $-16, %esp
 ; SSE-32-NEXT:    subl $16, %esp
-; SSE-32-NEXT:    movapd 8(%ebp), %xmm3
+; SSE-32-NEXT:    movapd 8(%ebp), %xmm4
 ; SSE-32-NEXT:    xorl %eax, %eax
-; SSE-32-NEXT:    ucomisd %xmm2, %xmm3
+; SSE-32-NEXT:    ucomisd %xmm2, %xmm4
 ; SSE-32-NEXT:    movl $-1, %ecx
 ; SSE-32-NEXT:    movl $0, %edx
 ; SSE-32-NEXT:    cmovael %ecx, %edx
-; SSE-32-NEXT:    movd %edx, %xmm4
-; SSE-32-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1]
+; SSE-32-NEXT:    movd %edx, %xmm3
+; SSE-32-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1]
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
-; SSE-32-NEXT:    ucomisd %xmm2, %xmm3
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
+; SSE-32-NEXT:    ucomisd %xmm2, %xmm4
 ; SSE-32-NEXT:    cmovael %ecx, %eax
 ; SSE-32-NEXT:    movd %eax, %xmm2
 ; SSE-32-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
-; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
-; SSE-32-NEXT:    pand %xmm4, %xmm0
-; SSE-32-NEXT:    pandn %xmm1, %xmm4
-; SSE-32-NEXT:    por %xmm4, %xmm0
+; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; SSE-32-NEXT:    pand %xmm3, %xmm0
+; SSE-32-NEXT:    pandn %xmm1, %xmm3
+; SSE-32-NEXT:    por %xmm3, %xmm0
 ; SSE-32-NEXT:    movl %ebp, %esp
 ; SSE-32-NEXT:    popl %ebp
 ; SSE-32-NEXT:    retl
@@ -2673,24 +2673,24 @@ define <2 x i64> @test_v2f64_ugt_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 ; SSE-32-NEXT:    movl %esp, %ebp
 ; SSE-32-NEXT:    andl $-16, %esp
 ; SSE-32-NEXT:    subl $16, %esp
-; SSE-32-NEXT:    movapd 8(%ebp), %xmm3
+; SSE-32-NEXT:    movapd 8(%ebp), %xmm4
 ; SSE-32-NEXT:    xorl %eax, %eax
-; SSE-32-NEXT:    ucomisd %xmm2, %xmm3
+; SSE-32-NEXT:    ucomisd %xmm2, %xmm4
 ; SSE-32-NEXT:    movl $-1, %ecx
 ; SSE-32-NEXT:    movl $0, %edx
 ; SSE-32-NEXT:    cmovbl %ecx, %edx
-; SSE-32-NEXT:    movd %edx, %xmm4
-; SSE-32-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1]
+; SSE-32-NEXT:    movd %edx, %xmm3
+; SSE-32-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1]
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
-; SSE-32-NEXT:    ucomisd %xmm2, %xmm3
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
+; SSE-32-NEXT:    ucomisd %xmm2, %xmm4
 ; SSE-32-NEXT:    cmovbl %ecx, %eax
 ; SSE-32-NEXT:    movd %eax, %xmm2
 ; SSE-32-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
-; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
-; SSE-32-NEXT:    pand %xmm4, %xmm0
-; SSE-32-NEXT:    pandn %xmm1, %xmm4
-; SSE-32-NEXT:    por %xmm4, %xmm0
+; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; SSE-32-NEXT:    pand %xmm3, %xmm0
+; SSE-32-NEXT:    pandn %xmm1, %xmm3
+; SSE-32-NEXT:    por %xmm3, %xmm0
 ; SSE-32-NEXT:    movl %ebp, %esp
 ; SSE-32-NEXT:    popl %ebp
 ; SSE-32-NEXT:    retl
@@ -2791,24 +2791,24 @@ define <2 x i64> @test_v2f64_uge_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 ; SSE-32-NEXT:    movl %esp, %ebp
 ; SSE-32-NEXT:    andl $-16, %esp
 ; SSE-32-NEXT:    subl $16, %esp
-; SSE-32-NEXT:    movapd 8(%ebp), %xmm3
+; SSE-32-NEXT:    movapd 8(%ebp), %xmm4
 ; SSE-32-NEXT:    xorl %eax, %eax
-; SSE-32-NEXT:    ucomisd %xmm2, %xmm3
+; SSE-32-NEXT:    ucomisd %xmm2, %xmm4
 ; SSE-32-NEXT:    movl $-1, %ecx
 ; SSE-32-NEXT:    movl $0, %edx
 ; SSE-32-NEXT:    cmovbel %ecx, %edx
-; SSE-32-NEXT:    movd %edx, %xmm4
-; SSE-32-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1]
+; SSE-32-NEXT:    movd %edx, %xmm3
+; SSE-32-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1]
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
-; SSE-32-NEXT:    ucomisd %xmm2, %xmm3
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
+; SSE-32-NEXT:    ucomisd %xmm2, %xmm4
 ; SSE-32-NEXT:    cmovbel %ecx, %eax
 ; SSE-32-NEXT:    movd %eax, %xmm2
 ; SSE-32-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
-; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
-; SSE-32-NEXT:    pand %xmm4, %xmm0
-; SSE-32-NEXT:    pandn %xmm1, %xmm4
-; SSE-32-NEXT:    por %xmm4, %xmm0
+; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; SSE-32-NEXT:    pand %xmm3, %xmm0
+; SSE-32-NEXT:    pandn %xmm1, %xmm3
+; SSE-32-NEXT:    por %xmm3, %xmm0
 ; SSE-32-NEXT:    movl %ebp, %esp
 ; SSE-32-NEXT:    popl %ebp
 ; SSE-32-NEXT:    retl
@@ -2909,24 +2909,24 @@ define <2 x i64> @test_v2f64_ult_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 ; SSE-32-NEXT:    movl %esp, %ebp
 ; SSE-32-NEXT:    andl $-16, %esp
 ; SSE-32-NEXT:    subl $16, %esp
-; SSE-32-NEXT:    movapd 8(%ebp), %xmm3
+; SSE-32-NEXT:    movapd 8(%ebp), %xmm4
 ; SSE-32-NEXT:    xorl %eax, %eax
-; SSE-32-NEXT:    ucomisd %xmm3, %xmm2
+; SSE-32-NEXT:    ucomisd %xmm4, %xmm2
 ; SSE-32-NEXT:    movl $-1, %ecx
 ; SSE-32-NEXT:    movl $0, %edx
 ; SSE-32-NEXT:    cmovbl %ecx, %edx
-; SSE-32-NEXT:    movd %edx, %xmm4
-; SSE-32-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1]
-; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
+; SSE-32-NEXT:    movd %edx, %xmm3
+; SSE-32-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1]
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; SSE-32-NEXT:    ucomisd %xmm3, %xmm2
+; SSE-32-NEXT:    ucomisd %xmm4, %xmm2
 ; SSE-32-NEXT:    cmovbl %ecx, %eax
 ; SSE-32-NEXT:    movd %eax, %xmm2
 ; SSE-32-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
-; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
-; SSE-32-NEXT:    pand %xmm4, %xmm0
-; SSE-32-NEXT:    pandn %xmm1, %xmm4
-; SSE-32-NEXT:    por %xmm4, %xmm0
+; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; SSE-32-NEXT:    pand %xmm3, %xmm0
+; SSE-32-NEXT:    pandn %xmm1, %xmm3
+; SSE-32-NEXT:    por %xmm3, %xmm0
 ; SSE-32-NEXT:    movl %ebp, %esp
 ; SSE-32-NEXT:    popl %ebp
 ; SSE-32-NEXT:    retl
@@ -3029,24 +3029,24 @@ define <2 x i64> @test_v2f64_ule_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 ; SSE-32-NEXT:    movl %esp, %ebp
 ; SSE-32-NEXT:    andl $-16, %esp
 ; SSE-32-NEXT:    subl $16, %esp
-; SSE-32-NEXT:    movapd 8(%ebp), %xmm3
+; SSE-32-NEXT:    movapd 8(%ebp), %xmm4
 ; SSE-32-NEXT:    xorl %eax, %eax
-; SSE-32-NEXT:    ucomisd %xmm3, %xmm2
+; SSE-32-NEXT:    ucomisd %xmm4, %xmm2
 ; SSE-32-NEXT:    movl $-1, %ecx
 ; SSE-32-NEXT:    movl $0, %edx
 ; SSE-32-NEXT:    cmovbel %ecx, %edx
-; SSE-32-NEXT:    movd %edx, %xmm4
-; SSE-32-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1]
-; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
+; SSE-32-NEXT:    movd %edx, %xmm3
+; SSE-32-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1]
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; SSE-32-NEXT:    ucomisd %xmm3, %xmm2
+; SSE-32-NEXT:    ucomisd %xmm4, %xmm2
 ; SSE-32-NEXT:    cmovbel %ecx, %eax
 ; SSE-32-NEXT:    movd %eax, %xmm2
 ; SSE-32-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
-; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
-; SSE-32-NEXT:    pand %xmm4, %xmm0
-; SSE-32-NEXT:    pandn %xmm1, %xmm4
-; SSE-32-NEXT:    por %xmm4, %xmm0
+; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; SSE-32-NEXT:    pand %xmm3, %xmm0
+; SSE-32-NEXT:    pandn %xmm1, %xmm3
+; SSE-32-NEXT:    por %xmm3, %xmm0
 ; SSE-32-NEXT:    movl %ebp, %esp
 ; SSE-32-NEXT:    popl %ebp
 ; SSE-32-NEXT:    retl

diff --git a/llvm/test/CodeGen/X86/vec-strict-cmp-sub128.ll b/llvm/test/CodeGen/X86/vec-strict-cmp-sub128.ll
index 83606a4bdff9b..1992c9dd2195b 100644
--- a/llvm/test/CodeGen/X86/vec-strict-cmp-sub128.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-cmp-sub128.ll
@@ -15,22 +15,22 @@ define <2 x i32> @test_v2f32_ogt_s(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1,
 ; SSE-32-NEXT:    movl %esp, %ebp
 ; SSE-32-NEXT:    andl $-16, %esp
 ; SSE-32-NEXT:    subl $16, %esp
-; SSE-32-NEXT:    movaps 8(%ebp), %xmm3
+; SSE-32-NEXT:    movaps 8(%ebp), %xmm4
 ; SSE-32-NEXT:    xorl %eax, %eax
-; SSE-32-NEXT:    comiss %xmm3, %xmm2
+; SSE-32-NEXT:    comiss %xmm4, %xmm2
 ; SSE-32-NEXT:    movl $-1, %ecx
 ; SSE-32-NEXT:    movl $0, %edx
 ; SSE-32-NEXT:    cmoval %ecx, %edx
-; SSE-32-NEXT:    movd %edx, %xmm4
-; SSE-32-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1,1,1]
+; SSE-32-NEXT:    movd %edx, %xmm3
+; SSE-32-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1,1,1]
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
-; SSE-32-NEXT:    comiss %xmm3, %xmm2
+; SSE-32-NEXT:    comiss %xmm4, %xmm2
 ; SSE-32-NEXT:    cmoval %ecx, %eax
 ; SSE-32-NEXT:    movd %eax, %xmm2
-; SSE-32-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE-32-NEXT:    pand %xmm4, %xmm0
-; SSE-32-NEXT:    pandn %xmm1, %xmm4
-; SSE-32-NEXT:    por %xmm4, %xmm0
+; SSE-32-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE-32-NEXT:    pand %xmm3, %xmm0
+; SSE-32-NEXT:    pandn %xmm1, %xmm3
+; SSE-32-NEXT:    por %xmm3, %xmm0
 ; SSE-32-NEXT:    movl %ebp, %esp
 ; SSE-32-NEXT:    popl %ebp
 ; SSE-32-NEXT:    retl
@@ -198,24 +198,24 @@ define <2 x i32> @test_v2f32_oeq_q(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1,
 ; SSE-32-NEXT:    movl %esp, %ebp
 ; SSE-32-NEXT:    andl $-16, %esp
 ; SSE-32-NEXT:    subl $16, %esp
-; SSE-32-NEXT:    movaps 8(%ebp), %xmm3
+; SSE-32-NEXT:    movaps 8(%ebp), %xmm4
 ; SSE-32-NEXT:    xorl %eax, %eax
-; SSE-32-NEXT:    ucomiss %xmm3, %xmm2
+; SSE-32-NEXT:    ucomiss %xmm4, %xmm2
 ; SSE-32-NEXT:    movl $-1, %ecx
 ; SSE-32-NEXT:    movl $-1, %edx
 ; SSE-32-NEXT:    cmovnel %eax, %edx
 ; SSE-32-NEXT:    cmovpl %eax, %edx
-; SSE-32-NEXT:    movd %edx, %xmm4
-; SSE-32-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1,1,1]
+; SSE-32-NEXT:    movd %edx, %xmm3
+; SSE-32-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1,1,1]
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
-; SSE-32-NEXT:    ucomiss %xmm3, %xmm2
+; SSE-32-NEXT:    ucomiss %xmm4, %xmm2
 ; SSE-32-NEXT:    cmovnel %eax, %ecx
 ; SSE-32-NEXT:    cmovpl %eax, %ecx
 ; SSE-32-NEXT:    movd %ecx, %xmm2
-; SSE-32-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE-32-NEXT:    pand %xmm4, %xmm0
-; SSE-32-NEXT:    pandn %xmm1, %xmm4
-; SSE-32-NEXT:    por %xmm4, %xmm0
+; SSE-32-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE-32-NEXT:    pand %xmm3, %xmm0
+; SSE-32-NEXT:    pandn %xmm1, %xmm3
+; SSE-32-NEXT:    por %xmm3, %xmm0
 ; SSE-32-NEXT:    movl %ebp, %esp
 ; SSE-32-NEXT:    popl %ebp
 ; SSE-32-NEXT:    retl

diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
index bcc14ec38e271..c7185597754ae 100644
--- a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
@@ -377,36 +377,36 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 {
 ; AVX512F-32-NEXT:    andl $-8, %esp
 ; AVX512F-32-NEXT:    subl $40, %esp
 ; AVX512F-32-NEXT:    .cfi_offset %ebx, -12
-; AVX512F-32-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX512F-32-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX512F-32-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
+; AVX512F-32-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX512F-32-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
+; AVX512F-32-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
 ; AVX512F-32-NEXT:    xorl %eax, %eax
-; AVX512F-32-NEXT:    vcomisd %xmm3, %xmm2
+; AVX512F-32-NEXT:    vcomisd %xmm1, %xmm3
 ; AVX512F-32-NEXT:    setae %al
 ; AVX512F-32-NEXT:    kmovw %eax, %k1
-; AVX512F-32-NEXT:    vmovsd %xmm3, %xmm3, %xmm4 {%k1} {z}
-; AVX512F-32-NEXT:    vsubsd %xmm4, %xmm2, %xmm2
-; AVX512F-32-NEXT:    vmovsd %xmm2, (%esp)
+; AVX512F-32-NEXT:    vmovsd %xmm1, %xmm1, %xmm4 {%k1} {z}
+; AVX512F-32-NEXT:    vsubsd %xmm4, %xmm3, %xmm3
+; AVX512F-32-NEXT:    vmovsd %xmm3, (%esp)
 ; AVX512F-32-NEXT:    xorl %edx, %edx
-; AVX512F-32-NEXT:    vcomisd %xmm3, %xmm1
+; AVX512F-32-NEXT:    vcomisd %xmm1, %xmm2
 ; AVX512F-32-NEXT:    setae %dl
 ; AVX512F-32-NEXT:    kmovw %edx, %k1
-; AVX512F-32-NEXT:    vmovsd %xmm3, %xmm3, %xmm2 {%k1} {z}
-; AVX512F-32-NEXT:    vsubsd %xmm2, %xmm1, %xmm1
-; AVX512F-32-NEXT:    vmovsd %xmm1, {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512F-32-NEXT:    vmovsd %xmm1, %xmm1, %xmm3 {%k1} {z}
+; AVX512F-32-NEXT:    vsubsd %xmm3, %xmm2, %xmm2
+; AVX512F-32-NEXT:    vmovsd %xmm2, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
 ; AVX512F-32-NEXT:    xorl %ecx, %ecx
-; AVX512F-32-NEXT:    vcomisd %xmm3, %xmm1
+; AVX512F-32-NEXT:    vcomisd %xmm1, %xmm2
 ; AVX512F-32-NEXT:    setae %cl
 ; AVX512F-32-NEXT:    kmovw %ecx, %k1
-; AVX512F-32-NEXT:    vmovsd %xmm3, %xmm3, %xmm2 {%k1} {z}
-; AVX512F-32-NEXT:    vsubsd %xmm2, %xmm1, %xmm1
-; AVX512F-32-NEXT:    vmovsd %xmm1, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    vmovsd %xmm1, %xmm1, %xmm3 {%k1} {z}
+; AVX512F-32-NEXT:    vsubsd %xmm3, %xmm2, %xmm2
+; AVX512F-32-NEXT:    vmovsd %xmm2, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    xorl %ebx, %ebx
-; AVX512F-32-NEXT:    vcomisd %xmm3, %xmm0
+; AVX512F-32-NEXT:    vcomisd %xmm1, %xmm0
 ; AVX512F-32-NEXT:    setae %bl
 ; AVX512F-32-NEXT:    kmovw %ebx, %k1
-; AVX512F-32-NEXT:    vmovsd %xmm3, %xmm3, %xmm1 {%k1} {z}
+; AVX512F-32-NEXT:    vmovsd %xmm1, %xmm1, %xmm1 {%k1} {z}
 ; AVX512F-32-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
 ; AVX512F-32-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    fldl (%esp)
@@ -470,36 +470,36 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 {
 ; AVX512VL-32-NEXT:    andl $-8, %esp
 ; AVX512VL-32-NEXT:    subl $40, %esp
 ; AVX512VL-32-NEXT:    .cfi_offset %ebx, -12
-; AVX512VL-32-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX512VL-32-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX512VL-32-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
+; AVX512VL-32-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX512VL-32-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
+; AVX512VL-32-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
 ; AVX512VL-32-NEXT:    xorl %eax, %eax
-; AVX512VL-32-NEXT:    vcomisd %xmm3, %xmm2
+; AVX512VL-32-NEXT:    vcomisd %xmm1, %xmm3
 ; AVX512VL-32-NEXT:    setae %al
 ; AVX512VL-32-NEXT:    kmovw %eax, %k1
-; AVX512VL-32-NEXT:    vmovsd %xmm3, %xmm3, %xmm4 {%k1} {z}
-; AVX512VL-32-NEXT:    vsubsd %xmm4, %xmm2, %xmm2
-; AVX512VL-32-NEXT:    vmovsd %xmm2, (%esp)
+; AVX512VL-32-NEXT:    vmovsd %xmm1, %xmm1, %xmm4 {%k1} {z}
+; AVX512VL-32-NEXT:    vsubsd %xmm4, %xmm3, %xmm3
+; AVX512VL-32-NEXT:    vmovsd %xmm3, (%esp)
 ; AVX512VL-32-NEXT:    xorl %edx, %edx
-; AVX512VL-32-NEXT:    vcomisd %xmm3, %xmm1
+; AVX512VL-32-NEXT:    vcomisd %xmm1, %xmm2
 ; AVX512VL-32-NEXT:    setae %dl
 ; AVX512VL-32-NEXT:    kmovw %edx, %k1
-; AVX512VL-32-NEXT:    vmovsd %xmm3, %xmm3, %xmm2 {%k1} {z}
-; AVX512VL-32-NEXT:    vsubsd %xmm2, %xmm1, %xmm1
-; AVX512VL-32-NEXT:    vmovsd %xmm1, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512VL-32-NEXT:    vmovsd %xmm1, %xmm1, %xmm3 {%k1} {z}
+; AVX512VL-32-NEXT:    vsubsd %xmm3, %xmm2, %xmm2
+; AVX512VL-32-NEXT:    vmovsd %xmm2, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
 ; AVX512VL-32-NEXT:    xorl %ecx, %ecx
-; AVX512VL-32-NEXT:    vcomisd %xmm3, %xmm1
+; AVX512VL-32-NEXT:    vcomisd %xmm1, %xmm2
 ; AVX512VL-32-NEXT:    setae %cl
 ; AVX512VL-32-NEXT:    kmovw %ecx, %k1
-; AVX512VL-32-NEXT:    vmovsd %xmm3, %xmm3, %xmm2 {%k1} {z}
-; AVX512VL-32-NEXT:    vsubsd %xmm2, %xmm1, %xmm1
-; AVX512VL-32-NEXT:    vmovsd %xmm1, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    vmovsd %xmm1, %xmm1, %xmm3 {%k1} {z}
+; AVX512VL-32-NEXT:    vsubsd %xmm3, %xmm2, %xmm2
+; AVX512VL-32-NEXT:    vmovsd %xmm2, {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    xorl %ebx, %ebx
-; AVX512VL-32-NEXT:    vcomisd %xmm3, %xmm0
+; AVX512VL-32-NEXT:    vcomisd %xmm1, %xmm0
 ; AVX512VL-32-NEXT:    setae %bl
 ; AVX512VL-32-NEXT:    kmovw %ebx, %k1
-; AVX512VL-32-NEXT:    vmovsd %xmm3, %xmm3, %xmm1 {%k1} {z}
+; AVX512VL-32-NEXT:    vmovsd %xmm1, %xmm1, %xmm1 {%k1} {z}
 ; AVX512VL-32-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
 ; AVX512VL-32-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fldl (%esp)
@@ -908,36 +908,36 @@ define <4 x i64> @strict_vector_fptoui_v4f32_to_v4i64(<4 x float> %a) #0 {
 ; AVX512F-32-NEXT:    andl $-8, %esp
 ; AVX512F-32-NEXT:    subl $40, %esp
 ; AVX512F-32-NEXT:    .cfi_offset %ebx, -12
-; AVX512F-32-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; AVX512F-32-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX512F-32-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX512F-32-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; AVX512F-32-NEXT:    xorl %eax, %eax
-; AVX512F-32-NEXT:    vcomiss %xmm2, %xmm1
+; AVX512F-32-NEXT:    vcomiss %xmm1, %xmm2
 ; AVX512F-32-NEXT:    setae %al
 ; AVX512F-32-NEXT:    kmovw %eax, %k1
-; AVX512F-32-NEXT:    vmovss %xmm2, %xmm2, %xmm3 {%k1} {z}
-; AVX512F-32-NEXT:    vsubss %xmm3, %xmm1, %xmm1
-; AVX512F-32-NEXT:    vmovss %xmm1, (%esp)
-; AVX512F-32-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512F-32-NEXT:    vmovss %xmm1, %xmm1, %xmm3 {%k1} {z}
+; AVX512F-32-NEXT:    vsubss %xmm3, %xmm2, %xmm2
+; AVX512F-32-NEXT:    vmovss %xmm2, (%esp)
+; AVX512F-32-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
 ; AVX512F-32-NEXT:    xorl %edx, %edx
-; AVX512F-32-NEXT:    vcomiss %xmm2, %xmm1
+; AVX512F-32-NEXT:    vcomiss %xmm1, %xmm2
 ; AVX512F-32-NEXT:    setae %dl
 ; AVX512F-32-NEXT:    kmovw %edx, %k1
-; AVX512F-32-NEXT:    vmovss %xmm2, %xmm2, %xmm3 {%k1} {z}
-; AVX512F-32-NEXT:    vsubss %xmm3, %xmm1, %xmm1
-; AVX512F-32-NEXT:    vmovss %xmm1, {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512F-32-NEXT:    vmovss %xmm1, %xmm1, %xmm3 {%k1} {z}
+; AVX512F-32-NEXT:    vsubss %xmm3, %xmm2, %xmm2
+; AVX512F-32-NEXT:    vmovss %xmm2, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; AVX512F-32-NEXT:    xorl %ecx, %ecx
-; AVX512F-32-NEXT:    vcomiss %xmm2, %xmm1
+; AVX512F-32-NEXT:    vcomiss %xmm1, %xmm2
 ; AVX512F-32-NEXT:    setae %cl
 ; AVX512F-32-NEXT:    kmovw %ecx, %k1
-; AVX512F-32-NEXT:    vmovss %xmm2, %xmm2, %xmm3 {%k1} {z}
-; AVX512F-32-NEXT:    vsubss %xmm3, %xmm1, %xmm1
-; AVX512F-32-NEXT:    vmovss %xmm1, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    vmovss %xmm1, %xmm1, %xmm3 {%k1} {z}
+; AVX512F-32-NEXT:    vsubss %xmm3, %xmm2, %xmm2
+; AVX512F-32-NEXT:    vmovss %xmm2, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    xorl %ebx, %ebx
-; AVX512F-32-NEXT:    vcomiss %xmm2, %xmm0
+; AVX512F-32-NEXT:    vcomiss %xmm1, %xmm0
 ; AVX512F-32-NEXT:    setae %bl
 ; AVX512F-32-NEXT:    kmovw %ebx, %k1
-; AVX512F-32-NEXT:    vmovss %xmm2, %xmm2, %xmm1 {%k1} {z}
+; AVX512F-32-NEXT:    vmovss %xmm1, %xmm1, %xmm1 {%k1} {z}
 ; AVX512F-32-NEXT:    vsubss %xmm1, %xmm0, %xmm0
 ; AVX512F-32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    flds (%esp)
@@ -1001,36 +1001,36 @@ define <4 x i64> @strict_vector_fptoui_v4f32_to_v4i64(<4 x float> %a) #0 {
 ; AVX512VL-32-NEXT:    andl $-8, %esp
 ; AVX512VL-32-NEXT:    subl $40, %esp
 ; AVX512VL-32-NEXT:    .cfi_offset %ebx, -12
-; AVX512VL-32-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; AVX512VL-32-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX512VL-32-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX512VL-32-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; AVX512VL-32-NEXT:    xorl %eax, %eax
-; AVX512VL-32-NEXT:    vcomiss %xmm2, %xmm1
+; AVX512VL-32-NEXT:    vcomiss %xmm1, %xmm2
 ; AVX512VL-32-NEXT:    setae %al
 ; AVX512VL-32-NEXT:    kmovw %eax, %k1
-; AVX512VL-32-NEXT:    vmovss %xmm2, %xmm2, %xmm3 {%k1} {z}
-; AVX512VL-32-NEXT:    vsubss %xmm3, %xmm1, %xmm1
-; AVX512VL-32-NEXT:    vmovss %xmm1, (%esp)
-; AVX512VL-32-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512VL-32-NEXT:    vmovss %xmm1, %xmm1, %xmm3 {%k1} {z}
+; AVX512VL-32-NEXT:    vsubss %xmm3, %xmm2, %xmm2
+; AVX512VL-32-NEXT:    vmovss %xmm2, (%esp)
+; AVX512VL-32-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
 ; AVX512VL-32-NEXT:    xorl %edx, %edx
-; AVX512VL-32-NEXT:    vcomiss %xmm2, %xmm1
+; AVX512VL-32-NEXT:    vcomiss %xmm1, %xmm2
 ; AVX512VL-32-NEXT:    setae %dl
 ; AVX512VL-32-NEXT:    kmovw %edx, %k1
-; AVX512VL-32-NEXT:    vmovss %xmm2, %xmm2, %xmm3 {%k1} {z}
-; AVX512VL-32-NEXT:    vsubss %xmm3, %xmm1, %xmm1
-; AVX512VL-32-NEXT:    vmovss %xmm1, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512VL-32-NEXT:    vmovss %xmm1, %xmm1, %xmm3 {%k1} {z}
+; AVX512VL-32-NEXT:    vsubss %xmm3, %xmm2, %xmm2
+; AVX512VL-32-NEXT:    vmovss %xmm2, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; AVX512VL-32-NEXT:    xorl %ecx, %ecx
-; AVX512VL-32-NEXT:    vcomiss %xmm2, %xmm1
+; AVX512VL-32-NEXT:    vcomiss %xmm1, %xmm2
 ; AVX512VL-32-NEXT:    setae %cl
 ; AVX512VL-32-NEXT:    kmovw %ecx, %k1
-; AVX512VL-32-NEXT:    vmovss %xmm2, %xmm2, %xmm3 {%k1} {z}
-; AVX512VL-32-NEXT:    vsubss %xmm3, %xmm1, %xmm1
-; AVX512VL-32-NEXT:    vmovss %xmm1, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    vmovss %xmm1, %xmm1, %xmm3 {%k1} {z}
+; AVX512VL-32-NEXT:    vsubss %xmm3, %xmm2, %xmm2
+; AVX512VL-32-NEXT:    vmovss %xmm2, {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    xorl %ebx, %ebx
-; AVX512VL-32-NEXT:    vcomiss %xmm2, %xmm0
+; AVX512VL-32-NEXT:    vcomiss %xmm1, %xmm0
 ; AVX512VL-32-NEXT:    setae %bl
 ; AVX512VL-32-NEXT:    kmovw %ebx, %k1
-; AVX512VL-32-NEXT:    vmovss %xmm2, %xmm2, %xmm1 {%k1} {z}
+; AVX512VL-32-NEXT:    vmovss %xmm1, %xmm1, %xmm1 {%k1} {z}
 ; AVX512VL-32-NEXT:    vsubss %xmm1, %xmm0, %xmm0
 ; AVX512VL-32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    flds (%esp)

diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll
index f0d861f2973b6..89331a228ccba 100644
--- a/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll
@@ -369,14 +369,14 @@ define <8 x double> @uitofp_v8i64_v8f64(<8 x i64> %x) #0 {
 ; NODQ-32-NEXT:    .cfi_def_cfa_register %ebp
 ; NODQ-32-NEXT:    andl $-8, %esp
 ; NODQ-32-NEXT:    subl $128, %esp
-; NODQ-32-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
-; NODQ-32-NEXT:    vmovlps %xmm2, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT:    vpermilps {{.*#+}} xmm1 = xmm2[2,3,2,3]
-; NODQ-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT:    vextractf32x4 $3, %zmm0, %xmm3
+; NODQ-32-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
 ; NODQ-32-NEXT:    vmovlps %xmm3, {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    vpermilps {{.*#+}} xmm1 = xmm3[2,3,2,3]
 ; NODQ-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    vextractf32x4 $3, %zmm0, %xmm2
+; NODQ-32-NEXT:    vmovlps %xmm2, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    vpermilps {{.*#+}} xmm1 = xmm2[2,3,2,3]
+; NODQ-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; NODQ-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
@@ -384,25 +384,25 @@ define <8 x double> @uitofp_v8i64_v8f64(<8 x i64> %x) #0 {
 ; NODQ-32-NEXT:    vmovlps %xmm1, {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    vpermilps {{.*#+}} xmm4 = xmm1[2,3,2,3]
 ; NODQ-32-NEXT:    vmovlps %xmm4, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT:    vextractps $1, %xmm2, %eax
+; NODQ-32-NEXT:    vextractps $1, %xmm3, %eax
 ; NODQ-32-NEXT:    shrl $31, %eax
 ; NODQ-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
 ; NODQ-32-NEXT:    fstpl {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    wait
-; NODQ-32-NEXT:    vextractps $3, %xmm2, %eax
+; NODQ-32-NEXT:    vextractps $3, %xmm3, %eax
 ; NODQ-32-NEXT:    shrl $31, %eax
 ; NODQ-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
 ; NODQ-32-NEXT:    fstpl {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    wait
-; NODQ-32-NEXT:    vextractps $1, %xmm3, %eax
+; NODQ-32-NEXT:    vextractps $1, %xmm2, %eax
 ; NODQ-32-NEXT:    shrl $31, %eax
 ; NODQ-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
 ; NODQ-32-NEXT:    fstpl {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    wait
-; NODQ-32-NEXT:    vextractps $3, %xmm3, %eax
+; NODQ-32-NEXT:    vextractps $3, %xmm2, %eax
 ; NODQ-32-NEXT:    shrl $31, %eax
 ; NODQ-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)

diff --git a/llvm/test/CodeGen/X86/vec_shift4.ll b/llvm/test/CodeGen/X86/vec_shift4.ll
index bcef1d4d51b2b..25a8055ae0ddc 100644
--- a/llvm/test/CodeGen/X86/vec_shift4.ll
+++ b/llvm/test/CodeGen/X86/vec_shift4.ll
@@ -27,25 +27,26 @@ entry:
 define <2 x i64> @shl2(<16 x i8> %r, <16 x i8> %a) nounwind readnone ssp {
 ; X86-LABEL: shl2:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movdqa %xmm0, %xmm2
-; X86-NEXT:    psllw $5, %xmm1
+; X86-NEXT:    movdqa %xmm1, %xmm2
+; X86-NEXT:    movdqa %xmm0, %xmm1
+; X86-NEXT:    psllw $5, %xmm2
 ; X86-NEXT:    movdqa %xmm0, %xmm3
 ; X86-NEXT:    psllw $4, %xmm3
 ; X86-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
-; X86-NEXT:    movdqa %xmm1, %xmm0
-; X86-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
-; X86-NEXT:    movdqa %xmm2, %xmm3
+; X86-NEXT:    movdqa %xmm2, %xmm0
+; X86-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
+; X86-NEXT:    movdqa %xmm1, %xmm3
 ; X86-NEXT:    psllw $2, %xmm3
 ; X86-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
-; X86-NEXT:    paddb %xmm1, %xmm1
-; X86-NEXT:    movdqa %xmm1, %xmm0
-; X86-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
-; X86-NEXT:    movdqa %xmm2, %xmm3
-; X86-NEXT:    paddb %xmm2, %xmm3
-; X86-NEXT:    paddb %xmm1, %xmm1
-; X86-NEXT:    movdqa %xmm1, %xmm0
-; X86-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; X86-NEXT:    paddb %xmm2, %xmm2
+; X86-NEXT:    movdqa %xmm2, %xmm0
+; X86-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
+; X86-NEXT:    movdqa %xmm1, %xmm3
+; X86-NEXT:    paddb %xmm1, %xmm3
+; X86-NEXT:    paddb %xmm2, %xmm2
 ; X86-NEXT:    movdqa %xmm2, %xmm0
+; X86-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
+; X86-NEXT:    movdqa %xmm1, %xmm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: shl2:

diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll
index 5e9497f78fc2d..91ed7ef486824 100644
--- a/llvm/test/CodeGen/X86/vec_umulo.ll
+++ b/llvm/test/CodeGen/X86/vec_umulo.ll
@@ -2971,7 +2971,7 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2)
 ; SSE2-NEXT:    addq %rsi, %rcx
 ; SSE2-NEXT:    movq %rdi, %rax
 ; SSE2-NEXT:    mulq %r8
-; SSE2-NEXT:    movq %rax, %rdi
+; SSE2-NEXT:    movq %rax, %r8
 ; SSE2-NEXT:    movq %rdx, %rsi
 ; SSE2-NEXT:    addq %rcx, %rsi
 ; SSE2-NEXT:    setb %cl
@@ -2980,33 +2980,33 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2)
 ; SSE2-NEXT:    testq %r9, %r9
 ; SSE2-NEXT:    setne %al
 ; SSE2-NEXT:    testq %r12, %r12
-; SSE2-NEXT:    setne %r8b
-; SSE2-NEXT:    andb %al, %r8b
+; SSE2-NEXT:    setne %r10b
+; SSE2-NEXT:    andb %al, %r10b
 ; SSE2-NEXT:    movq %r12, %rax
 ; SSE2-NEXT:    mulq %r15
-; SSE2-NEXT:    movq %rax, %rbp
-; SSE2-NEXT:    seto %r10b
+; SSE2-NEXT:    movq %rax, %rdi
+; SSE2-NEXT:    seto %bpl
 ; SSE2-NEXT:    movq %r9, %rax
 ; SSE2-NEXT:    mulq %r11
 ; SSE2-NEXT:    movq %rax, %rbx
 ; SSE2-NEXT:    seto %r9b
-; SSE2-NEXT:    orb %r10b, %r9b
-; SSE2-NEXT:    addq %rbp, %rbx
+; SSE2-NEXT:    orb %bpl, %r9b
+; SSE2-NEXT:    addq %rdi, %rbx
 ; SSE2-NEXT:    movq %r11, %rax
 ; SSE2-NEXT:    mulq %r15
 ; SSE2-NEXT:    addq %rbx, %rdx
 ; SSE2-NEXT:    setb %bl
 ; SSE2-NEXT:    orb %r9b, %bl
-; SSE2-NEXT:    orb %r8b, %bl
-; SSE2-NEXT:    movzbl %bl, %ebp
-; SSE2-NEXT:    negl %ebp
-; SSE2-NEXT:    movd %ebp, %xmm1
+; SSE2-NEXT:    orb %r10b, %bl
+; SSE2-NEXT:    movzbl %bl, %edi
+; SSE2-NEXT:    negl %edi
+; SSE2-NEXT:    movd %edi, %xmm1
 ; SSE2-NEXT:    movzbl %cl, %ecx
 ; SSE2-NEXT:    negl %ecx
 ; SSE2-NEXT:    movd %ecx, %xmm0
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT:    movq %rax, 16(%r14)
-; SSE2-NEXT:    movq %rdi, (%r14)
+; SSE2-NEXT:    movq %r8, (%r14)
 ; SSE2-NEXT:    movq %rdx, 24(%r14)
 ; SSE2-NEXT:    movq %rsi, 8(%r14)
 ; SSE2-NEXT:    popq %rbx
@@ -3048,7 +3048,7 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2)
 ; SSSE3-NEXT:    addq %rsi, %rcx
 ; SSSE3-NEXT:    movq %rdi, %rax
 ; SSSE3-NEXT:    mulq %r8
-; SSSE3-NEXT:    movq %rax, %rdi
+; SSSE3-NEXT:    movq %rax, %r8
 ; SSSE3-NEXT:    movq %rdx, %rsi
 ; SSSE3-NEXT:    addq %rcx, %rsi
 ; SSSE3-NEXT:    setb %cl
@@ -3057,33 +3057,33 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2)
 ; SSSE3-NEXT:    testq %r9, %r9
 ; SSSE3-NEXT:    setne %al
 ; SSSE3-NEXT:    testq %r12, %r12
-; SSSE3-NEXT:    setne %r8b
-; SSSE3-NEXT:    andb %al, %r8b
+; SSSE3-NEXT:    setne %r10b
+; SSSE3-NEXT:    andb %al, %r10b
 ; SSSE3-NEXT:    movq %r12, %rax
 ; SSSE3-NEXT:    mulq %r15
-; SSSE3-NEXT:    movq %rax, %rbp
-; SSSE3-NEXT:    seto %r10b
+; SSSE3-NEXT:    movq %rax, %rdi
+; SSSE3-NEXT:    seto %bpl
 ; SSSE3-NEXT:    movq %r9, %rax
 ; SSSE3-NEXT:    mulq %r11
 ; SSSE3-NEXT:    movq %rax, %rbx
 ; SSSE3-NEXT:    seto %r9b
-; SSSE3-NEXT:    orb %r10b, %r9b
-; SSSE3-NEXT:    addq %rbp, %rbx
+; SSSE3-NEXT:    orb %bpl, %r9b
+; SSSE3-NEXT:    addq %rdi, %rbx
 ; SSSE3-NEXT:    movq %r11, %rax
 ; SSSE3-NEXT:    mulq %r15
 ; SSSE3-NEXT:    addq %rbx, %rdx
 ; SSSE3-NEXT:    setb %bl
 ; SSSE3-NEXT:    orb %r9b, %bl
-; SSSE3-NEXT:    orb %r8b, %bl
-; SSSE3-NEXT:    movzbl %bl, %ebp
-; SSSE3-NEXT:    negl %ebp
-; SSSE3-NEXT:    movd %ebp, %xmm1
+; SSSE3-NEXT:    orb %r10b, %bl
+; SSSE3-NEXT:    movzbl %bl, %edi
+; SSSE3-NEXT:    negl %edi
+; SSSE3-NEXT:    movd %edi, %xmm1
 ; SSSE3-NEXT:    movzbl %cl, %ecx
 ; SSSE3-NEXT:    negl %ecx
 ; SSSE3-NEXT:    movd %ecx, %xmm0
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSSE3-NEXT:    movq %rax, 16(%r14)
-; SSSE3-NEXT:    movq %rdi, (%r14)
+; SSSE3-NEXT:    movq %r8, (%r14)
 ; SSSE3-NEXT:    movq %rdx, 24(%r14)
 ; SSSE3-NEXT:    movq %rsi, 8(%r14)
 ; SSSE3-NEXT:    popq %rbx
@@ -3125,7 +3125,7 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2)
 ; SSE41-NEXT:    addq %rsi, %rcx
 ; SSE41-NEXT:    movq %rdi, %rax
 ; SSE41-NEXT:    mulq %r8
-; SSE41-NEXT:    movq %rax, %rdi
+; SSE41-NEXT:    movq %rax, %r8
 ; SSE41-NEXT:    movq %rdx, %rsi
 ; SSE41-NEXT:    addq %rcx, %rsi
 ; SSE41-NEXT:    setb %cl
@@ -3134,32 +3134,32 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2)
 ; SSE41-NEXT:    testq %r9, %r9
 ; SSE41-NEXT:    setne %al
 ; SSE41-NEXT:    testq %r12, %r12
-; SSE41-NEXT:    setne %r8b
-; SSE41-NEXT:    andb %al, %r8b
+; SSE41-NEXT:    setne %r10b
+; SSE41-NEXT:    andb %al, %r10b
 ; SSE41-NEXT:    movq %r12, %rax
 ; SSE41-NEXT:    mulq %r15
-; SSE41-NEXT:    movq %rax, %rbp
-; SSE41-NEXT:    seto %r10b
+; SSE41-NEXT:    movq %rax, %rdi
+; SSE41-NEXT:    seto %bpl
 ; SSE41-NEXT:    movq %r9, %rax
 ; SSE41-NEXT:    mulq %r11
 ; SSE41-NEXT:    movq %rax, %rbx
 ; SSE41-NEXT:    seto %r9b
-; SSE41-NEXT:    orb %r10b, %r9b
-; SSE41-NEXT:    addq %rbp, %rbx
+; SSE41-NEXT:    orb %bpl, %r9b
+; SSE41-NEXT:    addq %rdi, %rbx
 ; SSE41-NEXT:    movq %r11, %rax
 ; SSE41-NEXT:    mulq %r15
 ; SSE41-NEXT:    addq %rbx, %rdx
 ; SSE41-NEXT:    setb %bl
 ; SSE41-NEXT:    orb %r9b, %bl
-; SSE41-NEXT:    orb %r8b, %bl
-; SSE41-NEXT:    movzbl %bl, %ebp
-; SSE41-NEXT:    negl %ebp
+; SSE41-NEXT:    orb %r10b, %bl
+; SSE41-NEXT:    movzbl %bl, %edi
+; SSE41-NEXT:    negl %edi
 ; SSE41-NEXT:    movzbl %cl, %ecx
 ; SSE41-NEXT:    negl %ecx
 ; SSE41-NEXT:    movd %ecx, %xmm0
-; SSE41-NEXT:    pinsrd $1, %ebp, %xmm0
+; SSE41-NEXT:    pinsrd $1, %edi, %xmm0
 ; SSE41-NEXT:    movq %rax, 16(%r14)
-; SSE41-NEXT:    movq %rdi, (%r14)
+; SSE41-NEXT:    movq %r8, (%r14)
 ; SSE41-NEXT:    movq %rdx, 24(%r14)
 ; SSE41-NEXT:    movq %rsi, 8(%r14)
 ; SSE41-NEXT:    popq %rbx
@@ -3201,7 +3201,7 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2)
 ; AVX-NEXT:    addq %rsi, %rcx
 ; AVX-NEXT:    movq %rdi, %rax
 ; AVX-NEXT:    mulq %r8
-; AVX-NEXT:    movq %rax, %rdi
+; AVX-NEXT:    movq %rax, %r8
 ; AVX-NEXT:    movq %rdx, %rsi
 ; AVX-NEXT:    addq %rcx, %rsi
 ; AVX-NEXT:    setb %cl
@@ -3210,32 +3210,32 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2)
 ; AVX-NEXT:    testq %r9, %r9
 ; AVX-NEXT:    setne %al
 ; AVX-NEXT:    testq %r12, %r12
-; AVX-NEXT:    setne %r8b
-; AVX-NEXT:    andb %al, %r8b
+; AVX-NEXT:    setne %r10b
+; AVX-NEXT:    andb %al, %r10b
 ; AVX-NEXT:    movq %r12, %rax
 ; AVX-NEXT:    mulq %r15
-; AVX-NEXT:    movq %rax, %rbp
-; AVX-NEXT:    seto %r10b
+; AVX-NEXT:    movq %rax, %rdi
+; AVX-NEXT:    seto %bpl
 ; AVX-NEXT:    movq %r9, %rax
 ; AVX-NEXT:    mulq %r11
 ; AVX-NEXT:    movq %rax, %rbx
 ; AVX-NEXT:    seto %r9b
-; AVX-NEXT:    orb %r10b, %r9b
-; AVX-NEXT:    addq %rbp, %rbx
+; AVX-NEXT:    orb %bpl, %r9b
+; AVX-NEXT:    addq %rdi, %rbx
 ; AVX-NEXT:    movq %r11, %rax
 ; AVX-NEXT:    mulq %r15
 ; AVX-NEXT:    addq %rbx, %rdx
 ; AVX-NEXT:    setb %bl
 ; AVX-NEXT:    orb %r9b, %bl
-; AVX-NEXT:    orb %r8b, %bl
-; AVX-NEXT:    movzbl %bl, %ebp
-; AVX-NEXT:    negl %ebp
+; AVX-NEXT:    orb %r10b, %bl
+; AVX-NEXT:    movzbl %bl, %edi
+; AVX-NEXT:    negl %edi
 ; AVX-NEXT:    movzbl %cl, %ecx
 ; AVX-NEXT:    negl %ecx
 ; AVX-NEXT:    vmovd %ecx, %xmm0
-; AVX-NEXT:    vpinsrd $1, %ebp, %xmm0, %xmm0
+; AVX-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0
 ; AVX-NEXT:    movq %rax, 16(%r14)
-; AVX-NEXT:    movq %rdi, (%r14)
+; AVX-NEXT:    movq %r8, (%r14)
 ; AVX-NEXT:    movq %rdx, 24(%r14)
 ; AVX-NEXT:    movq %rsi, 8(%r14)
 ; AVX-NEXT:    popq %rbx

diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll
index d0f6325272768..2e15d7e32cec8 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll
@@ -1749,29 +1749,29 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
 ;
 ; X86-SSE2-LABEL: splatvar_funnnel_v16i8:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm4
-; X86-SSE2-NEXT:    pand %xmm3, %xmm4
-; X86-SSE2-NEXT:    pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
-; X86-SSE2-NEXT:    psrldq {{.*#+}} xmm4 = xmm4[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-SSE2-NEXT:    psllw %xmm4, %xmm0
-; X86-SSE2-NEXT:    pcmpeqd %xmm5, %xmm5
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT:    pand %xmm4, %xmm5
+; X86-SSE2-NEXT:    pslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0]
+; X86-SSE2-NEXT:    psrldq {{.*#+}} xmm5 = xmm5[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X86-SSE2-NEXT:    psllw %xmm5, %xmm0
+; X86-SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
 ; X86-SSE2-NEXT:    pcmpeqd %xmm6, %xmm6
-; X86-SSE2-NEXT:    psllw %xmm4, %xmm6
+; X86-SSE2-NEXT:    psllw %xmm5, %xmm6
 ; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm6[0,0,0,0,4,5,6,7]
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
-; X86-SSE2-NEXT:    pand %xmm4, %xmm0
-; X86-SSE2-NEXT:    pandn %xmm3, %xmm2
+; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm5 = xmm6[0,0,0,0,4,5,6,7]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0]
+; X86-SSE2-NEXT:    pand %xmm5, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm4, %xmm2
 ; X86-SSE2-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
 ; X86-SSE2-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; X86-SSE2-NEXT:    psrlw $1, %xmm1
 ; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
 ; X86-SSE2-NEXT:    psrlw %xmm2, %xmm1
-; X86-SSE2-NEXT:    psrlw %xmm2, %xmm5
-; X86-SSE2-NEXT:    psrlw $8, %xmm5
-; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm5[0,0,0,0,4,5,6,7]
+; X86-SSE2-NEXT:    psrlw %xmm2, %xmm3
+; X86-SSE2-NEXT:    psrlw $8, %xmm3
+; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm3[0,0,0,0,4,5,6,7]
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; X86-SSE2-NEXT:    pand %xmm1, %xmm2
 ; X86-SSE2-NEXT:    por %xmm2, %xmm0

diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
index 1e41aa019f7c1..333a8b439a04b 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
@@ -1845,26 +1845,26 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x) nounwind {
 ;
 ; X86-SSE2-LABEL: constant_funnnel_v16i8:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; X86-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE2-NEXT:    psrlw $8, %xmm2
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm2
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
-; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
 ; X86-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
 ; X86-SSE2-NEXT:    psrlw $8, %xmm3
-; X86-SSE2-NEXT:    packuswb %xmm2, %xmm3
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; X86-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; X86-SSE2-NEXT:    pand %xmm2, %xmm1
+; X86-SSE2-NEXT:    psrlw $8, %xmm1
+; X86-SSE2-NEXT:    packuswb %xmm3, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; X86-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; X86-SSE2-NEXT:    pand %xmm3, %xmm2
 ; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X86-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    packuswb %xmm1, %xmm0
-; X86-SSE2-NEXT:    por %xmm3, %xmm0
+; X86-SSE2-NEXT:    pand %xmm3, %xmm0
+; X86-SSE2-NEXT:    packuswb %xmm2, %xmm0
+; X86-SSE2-NEXT:    por %xmm1, %xmm0
 ; X86-SSE2-NEXT:    retl
   %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %x, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
   ret <16 x i8> %res

diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index 8c74560cbdd92..ee2d60604cf00 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -1727,28 +1727,28 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
 ;
 ; X86-SSE2-LABEL: splatvar_funnnel_v16i8:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm4
-; X86-SSE2-NEXT:    pandn %xmm3, %xmm4
-; X86-SSE2-NEXT:    pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
-; X86-SSE2-NEXT:    psrldq {{.*#+}} xmm4 = xmm4[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT:    pandn %xmm4, %xmm5
+; X86-SSE2-NEXT:    pslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0]
+; X86-SSE2-NEXT:    psrldq {{.*#+}} xmm5 = xmm5[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; X86-SSE2-NEXT:    paddb %xmm0, %xmm0
-; X86-SSE2-NEXT:    psllw %xmm4, %xmm0
-; X86-SSE2-NEXT:    pcmpeqd %xmm5, %xmm5
+; X86-SSE2-NEXT:    psllw %xmm5, %xmm0
+; X86-SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
 ; X86-SSE2-NEXT:    pcmpeqd %xmm6, %xmm6
-; X86-SSE2-NEXT:    psllw %xmm4, %xmm6
+; X86-SSE2-NEXT:    psllw %xmm5, %xmm6
 ; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm6[0,0,0,0,4,5,6,7]
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
-; X86-SSE2-NEXT:    pand %xmm4, %xmm0
-; X86-SSE2-NEXT:    pand %xmm3, %xmm2
+; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm5 = xmm6[0,0,0,0,4,5,6,7]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0]
+; X86-SSE2-NEXT:    pand %xmm5, %xmm0
+; X86-SSE2-NEXT:    pand %xmm4, %xmm2
 ; X86-SSE2-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
 ; X86-SSE2-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; X86-SSE2-NEXT:    psrlw %xmm2, %xmm1
-; X86-SSE2-NEXT:    psrlw %xmm2, %xmm5
-; X86-SSE2-NEXT:    psrlw $8, %xmm5
-; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm5[0,0,0,0,4,5,6,7]
+; X86-SSE2-NEXT:    psrlw %xmm2, %xmm3
+; X86-SSE2-NEXT:    psrlw $8, %xmm3
+; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm3[0,0,0,0,4,5,6,7]
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; X86-SSE2-NEXT:    pand %xmm1, %xmm2
 ; X86-SSE2-NEXT:    por %xmm2, %xmm0

diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
index 7bd47f0c21f25..d9a772faf79f2 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
@@ -1931,26 +1931,26 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x) nounwind {
 ;
 ; X86-SSE2-LABEL: constant_funnnel_v16i8:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; X86-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE2-NEXT:    psrlw $8, %xmm2
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm2
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
-; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
 ; X86-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
 ; X86-SSE2-NEXT:    psrlw $8, %xmm3
-; X86-SSE2-NEXT:    packuswb %xmm2, %xmm3
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; X86-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; X86-SSE2-NEXT:    pand %xmm2, %xmm1
+; X86-SSE2-NEXT:    psrlw $8, %xmm1
+; X86-SSE2-NEXT:    packuswb %xmm3, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; X86-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; X86-SSE2-NEXT:    pand %xmm3, %xmm2
 ; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X86-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    packuswb %xmm1, %xmm0
-; X86-SSE2-NEXT:    por %xmm3, %xmm0
+; X86-SSE2-NEXT:    pand %xmm3, %xmm0
+; X86-SSE2-NEXT:    packuswb %xmm2, %xmm0
+; X86-SSE2-NEXT:    por %xmm1, %xmm0
 ; X86-SSE2-NEXT:    retl
   %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %x, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
   ret <16 x i8> %res

diff --git a/llvm/test/CodeGen/X86/vector-gep.ll b/llvm/test/CodeGen/X86/vector-gep.ll
index 8925fc90c686c..6180403184350 100644
--- a/llvm/test/CodeGen/X86/vector-gep.ll
+++ b/llvm/test/CodeGen/X86/vector-gep.ll
@@ -152,39 +152,39 @@ define <64 x i16*> @AGEP9(i16* %param, <64 x i32> %off) nounwind {
 ; CHECK-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vpaddd %xmm0, %xmm5, %xmm0
 ; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovdqa 72(%ebp), %xmm3
-; CHECK-NEXT:    vpaddd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpaddd %xmm3, %xmm5, %xmm0
+; CHECK-NEXT:    vmovdqa 72(%ebp), %xmm0
+; CHECK-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpaddd %xmm0, %xmm5, %xmm0
 ; CHECK-NEXT:    vmovdqa %xmm0, (%esp) # 16-byte Spill
-; CHECK-NEXT:    vmovdqa 88(%ebp), %xmm4
-; CHECK-NEXT:    vpaddd %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpaddd %xmm4, %xmm5, %xmm4
-; CHECK-NEXT:    vmovdqa 104(%ebp), %xmm1
-; CHECK-NEXT:    vpaddd %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpaddd %xmm1, %xmm5, %xmm1
-; CHECK-NEXT:    vmovdqa 120(%ebp), %xmm6
+; CHECK-NEXT:    vmovdqa 88(%ebp), %xmm0
+; CHECK-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpaddd %xmm0, %xmm5, %xmm2
+; CHECK-NEXT:    vmovdqa 104(%ebp), %xmm0
+; CHECK-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpaddd %xmm0, %xmm5, %xmm1
+; CHECK-NEXT:    vmovdqa 120(%ebp), %xmm0
+; CHECK-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpaddd %xmm0, %xmm5, %xmm0
+; CHECK-NEXT:    vmovdqa 136(%ebp), %xmm6
 ; CHECK-NEXT:    vpaddd %xmm6, %xmm6, %xmm6
 ; CHECK-NEXT:    vpaddd %xmm6, %xmm5, %xmm6
-; CHECK-NEXT:    vmovdqa 136(%ebp), %xmm2
-; CHECK-NEXT:    vpaddd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpaddd %xmm2, %xmm5, %xmm2
 ; CHECK-NEXT:    vmovdqa 152(%ebp), %xmm7
 ; CHECK-NEXT:    vpaddd %xmm7, %xmm7, %xmm7
 ; CHECK-NEXT:    vpaddd %xmm7, %xmm5, %xmm7
-; CHECK-NEXT:    vmovdqa 168(%ebp), %xmm0
-; CHECK-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpaddd %xmm0, %xmm5, %xmm0
+; CHECK-NEXT:    vmovdqa 168(%ebp), %xmm4
+; CHECK-NEXT:    vpaddd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpaddd %xmm4, %xmm5, %xmm4
 ; CHECK-NEXT:    vmovdqa 184(%ebp), %xmm3
 ; CHECK-NEXT:    vpaddd %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT:    vpaddd %xmm3, %xmm5, %xmm3
 ; CHECK-NEXT:    movl 8(%ebp), %eax
 ; CHECK-NEXT:    vmovdqa %xmm3, 240(%eax)
-; CHECK-NEXT:    vmovdqa %xmm0, 224(%eax)
+; CHECK-NEXT:    vmovdqa %xmm4, 224(%eax)
 ; CHECK-NEXT:    vmovdqa %xmm7, 208(%eax)
-; CHECK-NEXT:    vmovdqa %xmm2, 192(%eax)
-; CHECK-NEXT:    vmovdqa %xmm6, 176(%eax)
+; CHECK-NEXT:    vmovdqa %xmm6, 192(%eax)
+; CHECK-NEXT:    vmovdqa %xmm0, 176(%eax)
 ; CHECK-NEXT:    vmovdqa %xmm1, 160(%eax)
-; CHECK-NEXT:    vmovdqa %xmm4, 144(%eax)
+; CHECK-NEXT:    vmovdqa %xmm2, 144(%eax)
 ; CHECK-NEXT:    vmovaps (%esp), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    vmovaps %xmm0, 128(%eax)
 ; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload

diff --git a/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll b/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll
index 07dc0fe0b44f8..a54101e78db00 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll
@@ -131,27 +131,27 @@ define void @test_sdiv7_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
-; X86-NEXT:    movdqa %xmm0, %xmm2
-; X86-NEXT:    pmuludq %xmm1, %xmm2
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X86-NEXT:    movdqa %xmm0, %xmm3
+; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X86-NEXT:    movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
+; X86-NEXT:    movdqa %xmm1, %xmm0
+; X86-NEXT:    pmuludq %xmm2, %xmm0
+; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X86-NEXT:    movdqa %xmm1, %xmm3
 ; X86-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1,1,1]
-; X86-NEXT:    pmuludq %xmm1, %xmm3
+; X86-NEXT:    pmuludq %xmm2, %xmm3
 ; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; X86-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; X86-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
 ; X86-NEXT:    pxor %xmm3, %xmm3
-; X86-NEXT:    pcmpgtd %xmm0, %xmm3
-; X86-NEXT:    pand %xmm1, %xmm3
-; X86-NEXT:    paddd %xmm0, %xmm3
-; X86-NEXT:    psubd %xmm3, %xmm2
-; X86-NEXT:    paddd %xmm0, %xmm2
-; X86-NEXT:    movdqa %xmm2, %xmm0
-; X86-NEXT:    psrld $31, %xmm0
-; X86-NEXT:    psrad $2, %xmm2
-; X86-NEXT:    paddd %xmm0, %xmm2
-; X86-NEXT:    movq %xmm2, (%eax)
+; X86-NEXT:    pcmpgtd %xmm1, %xmm3
+; X86-NEXT:    pand %xmm2, %xmm3
+; X86-NEXT:    paddd %xmm1, %xmm3
+; X86-NEXT:    psubd %xmm3, %xmm0
+; X86-NEXT:    paddd %xmm1, %xmm0
+; X86-NEXT:    movdqa %xmm0, %xmm1
+; X86-NEXT:    psrld $31, %xmm1
+; X86-NEXT:    psrad $2, %xmm0
+; X86-NEXT:    paddd %xmm1, %xmm0
+; X86-NEXT:    movq %xmm0, (%eax)
 ; X86-NEXT:    retl
   %a = load <2 x i32>, <2 x i32>* %x
   %b = sdiv <2 x i32> %a, <i32 7, i32 7>
@@ -193,30 +193,30 @@ define void @test_srem7_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
-; X86-NEXT:    movdqa %xmm0, %xmm2
-; X86-NEXT:    pmuludq %xmm1, %xmm2
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; X86-NEXT:    movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
+; X86-NEXT:    movdqa %xmm0, %xmm1
+; X86-NEXT:    pmuludq %xmm2, %xmm1
+; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
 ; X86-NEXT:    movdqa %xmm0, %xmm3
 ; X86-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1,1,1]
-; X86-NEXT:    pmuludq %xmm1, %xmm3
+; X86-NEXT:    pmuludq %xmm2, %xmm3
 ; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; X86-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; X86-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
 ; X86-NEXT:    pxor %xmm3, %xmm3
 ; X86-NEXT:    pcmpgtd %xmm0, %xmm3
-; X86-NEXT:    pand %xmm1, %xmm3
+; X86-NEXT:    pand %xmm2, %xmm3
 ; X86-NEXT:    paddd %xmm0, %xmm3
-; X86-NEXT:    psubd %xmm3, %xmm2
-; X86-NEXT:    paddd %xmm0, %xmm2
-; X86-NEXT:    movdqa %xmm2, %xmm1
-; X86-NEXT:    psrld $31, %xmm1
-; X86-NEXT:    psrad $2, %xmm2
-; X86-NEXT:    paddd %xmm1, %xmm2
-; X86-NEXT:    movdqa %xmm2, %xmm1
-; X86-NEXT:    pslld $3, %xmm1
-; X86-NEXT:    psubd %xmm1, %xmm2
-; X86-NEXT:    paddd %xmm0, %xmm2
-; X86-NEXT:    movq %xmm2, (%eax)
+; X86-NEXT:    psubd %xmm3, %xmm1
+; X86-NEXT:    paddd %xmm0, %xmm1
+; X86-NEXT:    movdqa %xmm1, %xmm2
+; X86-NEXT:    psrld $31, %xmm2
+; X86-NEXT:    psrad $2, %xmm1
+; X86-NEXT:    paddd %xmm2, %xmm1
+; X86-NEXT:    movdqa %xmm1, %xmm2
+; X86-NEXT:    pslld $3, %xmm2
+; X86-NEXT:    psubd %xmm2, %xmm1
+; X86-NEXT:    paddd %xmm0, %xmm1
+; X86-NEXT:    movq %xmm1, (%eax)
 ; X86-NEXT:    retl
   %a = load <2 x i32>, <2 x i32>* %x
   %b = srem <2 x i32> %a, <i32 7, i32 7>

diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-128.ll b/llvm/test/CodeGen/X86/vector-lzcnt-128.ll
index 6a9a4f6ded5f3..d0d35c986a307 100644
--- a/llvm/test/CodeGen/X86/vector-lzcnt-128.ll
+++ b/llvm/test/CodeGen/X86/vector-lzcnt-128.ll
@@ -231,30 +231,30 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
 ;
 ; X32-SSE-LABEL: testv2i64:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
-; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
+; X32-SSE-NEXT:    pshufb %xmm0, %xmm4
 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT:    psrlw $4, %xmm1
 ; X32-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X32-SSE-NEXT:    pxor %xmm4, %xmm4
-; X32-SSE-NEXT:    pshufb %xmm1, %xmm2
-; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm1
-; X32-SSE-NEXT:    pand %xmm3, %xmm1
-; X32-SSE-NEXT:    paddb %xmm2, %xmm1
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
-; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm2
-; X32-SSE-NEXT:    psrlw $8, %xmm2
-; X32-SSE-NEXT:    pand %xmm1, %xmm2
+; X32-SSE-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE-NEXT:    pshufb %xmm1, %xmm3
+; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm1
+; X32-SSE-NEXT:    pand %xmm4, %xmm1
+; X32-SSE-NEXT:    paddb %xmm3, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm3
+; X32-SSE-NEXT:    psrlw $8, %xmm3
+; X32-SSE-NEXT:    pand %xmm1, %xmm3
 ; X32-SSE-NEXT:    psrlw $8, %xmm1
-; X32-SSE-NEXT:    paddw %xmm2, %xmm1
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
-; X32-SSE-NEXT:    pcmpeqw %xmm4, %xmm2
-; X32-SSE-NEXT:    psrld $16, %xmm2
-; X32-SSE-NEXT:    pand %xmm1, %xmm2
+; X32-SSE-NEXT:    paddw %xmm3, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    pcmpeqw %xmm2, %xmm3
+; X32-SSE-NEXT:    psrld $16, %xmm3
+; X32-SSE-NEXT:    pand %xmm1, %xmm3
 ; X32-SSE-NEXT:    psrld $16, %xmm1
-; X32-SSE-NEXT:    paddd %xmm2, %xmm1
-; X32-SSE-NEXT:    pcmpeqd %xmm4, %xmm0
+; X32-SSE-NEXT:    paddd %xmm3, %xmm1
+; X32-SSE-NEXT:    pcmpeqd %xmm2, %xmm0
 ; X32-SSE-NEXT:    psrlq $32, %xmm0
 ; X32-SSE-NEXT:    pand %xmm1, %xmm0
 ; X32-SSE-NEXT:    psrlq $32, %xmm1
@@ -484,30 +484,30 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
 ;
 ; X32-SSE-LABEL: testv2i64u:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
-; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
+; X32-SSE-NEXT:    pshufb %xmm0, %xmm4
 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT:    psrlw $4, %xmm1
 ; X32-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X32-SSE-NEXT:    pxor %xmm4, %xmm4
-; X32-SSE-NEXT:    pshufb %xmm1, %xmm2
-; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm1
-; X32-SSE-NEXT:    pand %xmm3, %xmm1
-; X32-SSE-NEXT:    paddb %xmm2, %xmm1
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
-; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm2
-; X32-SSE-NEXT:    psrlw $8, %xmm2
-; X32-SSE-NEXT:    pand %xmm1, %xmm2
+; X32-SSE-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE-NEXT:    pshufb %xmm1, %xmm3
+; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm1
+; X32-SSE-NEXT:    pand %xmm4, %xmm1
+; X32-SSE-NEXT:    paddb %xmm3, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm3
+; X32-SSE-NEXT:    psrlw $8, %xmm3
+; X32-SSE-NEXT:    pand %xmm1, %xmm3
 ; X32-SSE-NEXT:    psrlw $8, %xmm1
-; X32-SSE-NEXT:    paddw %xmm2, %xmm1
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
-; X32-SSE-NEXT:    pcmpeqw %xmm4, %xmm2
-; X32-SSE-NEXT:    psrld $16, %xmm2
-; X32-SSE-NEXT:    pand %xmm1, %xmm2
+; X32-SSE-NEXT:    paddw %xmm3, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    pcmpeqw %xmm2, %xmm3
+; X32-SSE-NEXT:    psrld $16, %xmm3
+; X32-SSE-NEXT:    pand %xmm1, %xmm3
 ; X32-SSE-NEXT:    psrld $16, %xmm1
-; X32-SSE-NEXT:    paddd %xmm2, %xmm1
-; X32-SSE-NEXT:    pcmpeqd %xmm4, %xmm0
+; X32-SSE-NEXT:    paddd %xmm3, %xmm1
+; X32-SSE-NEXT:    pcmpeqd %xmm2, %xmm0
 ; X32-SSE-NEXT:    psrlq $32, %xmm0
 ; X32-SSE-NEXT:    pand %xmm1, %xmm0
 ; X32-SSE-NEXT:    psrlq $32, %xmm1
@@ -719,24 +719,24 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ;
 ; X32-SSE-LABEL: testv4i32:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
-; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
+; X32-SSE-NEXT:    pshufb %xmm0, %xmm4
 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT:    psrlw $4, %xmm1
 ; X32-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X32-SSE-NEXT:    pxor %xmm4, %xmm4
-; X32-SSE-NEXT:    pshufb %xmm1, %xmm2
-; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm1
-; X32-SSE-NEXT:    pand %xmm3, %xmm1
-; X32-SSE-NEXT:    paddb %xmm2, %xmm1
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
-; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm2
-; X32-SSE-NEXT:    psrlw $8, %xmm2
-; X32-SSE-NEXT:    pand %xmm1, %xmm2
+; X32-SSE-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE-NEXT:    pshufb %xmm1, %xmm3
+; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm1
+; X32-SSE-NEXT:    pand %xmm4, %xmm1
+; X32-SSE-NEXT:    paddb %xmm3, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm3
+; X32-SSE-NEXT:    psrlw $8, %xmm3
+; X32-SSE-NEXT:    pand %xmm1, %xmm3
 ; X32-SSE-NEXT:    psrlw $8, %xmm1
-; X32-SSE-NEXT:    paddw %xmm2, %xmm1
-; X32-SSE-NEXT:    pcmpeqw %xmm4, %xmm0
+; X32-SSE-NEXT:    paddw %xmm3, %xmm1
+; X32-SSE-NEXT:    pcmpeqw %xmm2, %xmm0
 ; X32-SSE-NEXT:    psrld $16, %xmm0
 ; X32-SSE-NEXT:    pand %xmm1, %xmm0
 ; X32-SSE-NEXT:    psrld $16, %xmm1
@@ -948,24 +948,24 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
 ;
 ; X32-SSE-LABEL: testv4i32u:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
-; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
+; X32-SSE-NEXT:    pshufb %xmm0, %xmm4
 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT:    psrlw $4, %xmm1
 ; X32-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X32-SSE-NEXT:    pxor %xmm4, %xmm4
-; X32-SSE-NEXT:    pshufb %xmm1, %xmm2
-; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm1
-; X32-SSE-NEXT:    pand %xmm3, %xmm1
-; X32-SSE-NEXT:    paddb %xmm2, %xmm1
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
-; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm2
-; X32-SSE-NEXT:    psrlw $8, %xmm2
-; X32-SSE-NEXT:    pand %xmm1, %xmm2
+; X32-SSE-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE-NEXT:    pshufb %xmm1, %xmm3
+; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm1
+; X32-SSE-NEXT:    pand %xmm4, %xmm1
+; X32-SSE-NEXT:    paddb %xmm3, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm3
+; X32-SSE-NEXT:    psrlw $8, %xmm3
+; X32-SSE-NEXT:    pand %xmm1, %xmm3
 ; X32-SSE-NEXT:    psrlw $8, %xmm1
-; X32-SSE-NEXT:    paddw %xmm2, %xmm1
-; X32-SSE-NEXT:    pcmpeqw %xmm4, %xmm0
+; X32-SSE-NEXT:    paddw %xmm3, %xmm1
+; X32-SSE-NEXT:    pcmpeqw %xmm2, %xmm0
 ; X32-SSE-NEXT:    psrld $16, %xmm0
 ; X32-SSE-NEXT:    pand %xmm1, %xmm0
 ; X32-SSE-NEXT:    psrld $16, %xmm1

diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll
index d43d945487218..7447c506fd582 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-128.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll
@@ -1798,26 +1798,26 @@ define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind {
 ;
 ; X86-SSE2-LABEL: constant_rotate_v16i8:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; X86-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE2-NEXT:    psrlw $8, %xmm2
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm2
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
-; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
 ; X86-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
 ; X86-SSE2-NEXT:    psrlw $8, %xmm3
-; X86-SSE2-NEXT:    packuswb %xmm2, %xmm3
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; X86-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; X86-SSE2-NEXT:    pand %xmm2, %xmm1
+; X86-SSE2-NEXT:    psrlw $8, %xmm1
+; X86-SSE2-NEXT:    packuswb %xmm3, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; X86-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; X86-SSE2-NEXT:    pand %xmm3, %xmm2
 ; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X86-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    packuswb %xmm1, %xmm0
-; X86-SSE2-NEXT:    por %xmm3, %xmm0
+; X86-SSE2-NEXT:    pand %xmm3, %xmm0
+; X86-SSE2-NEXT:    packuswb %xmm2, %xmm0
+; X86-SSE2-NEXT:    por %xmm1, %xmm0
 ; X86-SSE2-NEXT:    retl
   %shl = shl <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>
   %lshr = lshr <16 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>

diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll
index 97ba377a1ab15..0a821dca9b828 100644
--- a/llvm/test/CodeGen/X86/vector-sext.ll
+++ b/llvm/test/CodeGen/X86/vector-sext.ll
@@ -3617,26 +3617,26 @@ define <4 x i32> @sext_4i17_to_4i32(<4 x i17>* %ptr) {
 ;
 ; X86-SSE2-LABEL: sext_4i17_to_4i32:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl (%eax), %ecx
-; X86-SSE2-NEXT:    movl 4(%eax), %edx
-; X86-SSE2-NEXT:    movl 8(%eax), %eax
-; X86-SSE2-NEXT:    shldl $13, %edx, %eax
-; X86-SSE2-NEXT:    shll $15, %eax
-; X86-SSE2-NEXT:    sarl $15, %eax
-; X86-SSE2-NEXT:    movd %eax, %xmm0
-; X86-SSE2-NEXT:    movl %edx, %eax
-; X86-SSE2-NEXT:    shll $13, %eax
-; X86-SSE2-NEXT:    sarl $15, %eax
-; X86-SSE2-NEXT:    movd %eax, %xmm1
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE2-NEXT:    movl (%edx), %ecx
+; X86-SSE2-NEXT:    movl 4(%edx), %eax
+; X86-SSE2-NEXT:    movl 8(%edx), %edx
+; X86-SSE2-NEXT:    shldl $13, %eax, %edx
+; X86-SSE2-NEXT:    shll $15, %edx
+; X86-SSE2-NEXT:    sarl $15, %edx
+; X86-SSE2-NEXT:    movd %edx, %xmm0
+; X86-SSE2-NEXT:    movl %eax, %edx
+; X86-SSE2-NEXT:    shll $13, %edx
+; X86-SSE2-NEXT:    sarl $15, %edx
+; X86-SSE2-NEXT:    movd %edx, %xmm1
 ; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-SSE2-NEXT:    shldl $15, %ecx, %edx
+; X86-SSE2-NEXT:    shldl $15, %ecx, %eax
 ; X86-SSE2-NEXT:    shll $15, %ecx
 ; X86-SSE2-NEXT:    sarl $15, %ecx
 ; X86-SSE2-NEXT:    movd %ecx, %xmm0
-; X86-SSE2-NEXT:    shll $15, %edx
-; X86-SSE2-NEXT:    sarl $15, %edx
-; X86-SSE2-NEXT:    movd %edx, %xmm2
+; X86-SSE2-NEXT:    shll $15, %eax
+; X86-SSE2-NEXT:    sarl $15, %eax
+; X86-SSE2-NEXT:    movd %eax, %xmm2
 ; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; X86-SSE2-NEXT:    retl
@@ -3646,25 +3646,25 @@ define <4 x i32> @sext_4i17_to_4i32(<4 x i17>* %ptr) {
 ; X86-SSE41-NEXT:    pushl %esi
 ; X86-SSE41-NEXT:    .cfi_def_cfa_offset 8
 ; X86-SSE41-NEXT:    .cfi_offset %esi, -8
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl (%eax), %ecx
-; X86-SSE41-NEXT:    movl 4(%eax), %edx
-; X86-SSE41-NEXT:    movl %edx, %esi
-; X86-SSE41-NEXT:    movl 8(%eax), %eax
-; X86-SSE41-NEXT:    shldl $13, %edx, %eax
-; X86-SSE41-NEXT:    shldl $15, %ecx, %edx
-; X86-SSE41-NEXT:    shll $15, %edx
-; X86-SSE41-NEXT:    sarl $15, %edx
+; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE41-NEXT:    movl (%edx), %eax
+; X86-SSE41-NEXT:    movl 4(%edx), %ecx
+; X86-SSE41-NEXT:    movl %ecx, %esi
+; X86-SSE41-NEXT:    movl 8(%edx), %edx
+; X86-SSE41-NEXT:    shldl $13, %ecx, %edx
+; X86-SSE41-NEXT:    shldl $15, %eax, %ecx
 ; X86-SSE41-NEXT:    shll $15, %ecx
 ; X86-SSE41-NEXT:    sarl $15, %ecx
-; X86-SSE41-NEXT:    movd %ecx, %xmm0
-; X86-SSE41-NEXT:    pinsrd $1, %edx, %xmm0
+; X86-SSE41-NEXT:    shll $15, %eax
+; X86-SSE41-NEXT:    sarl $15, %eax
+; X86-SSE41-NEXT:    movd %eax, %xmm0
+; X86-SSE41-NEXT:    pinsrd $1, %ecx, %xmm0
 ; X86-SSE41-NEXT:    shll $13, %esi
 ; X86-SSE41-NEXT:    sarl $15, %esi
 ; X86-SSE41-NEXT:    pinsrd $2, %esi, %xmm0
-; X86-SSE41-NEXT:    shll $15, %eax
-; X86-SSE41-NEXT:    sarl $15, %eax
-; X86-SSE41-NEXT:    pinsrd $3, %eax, %xmm0
+; X86-SSE41-NEXT:    shll $15, %edx
+; X86-SSE41-NEXT:    sarl $15, %edx
+; X86-SSE41-NEXT:    pinsrd $3, %edx, %xmm0
 ; X86-SSE41-NEXT:    popl %esi
 ; X86-SSE41-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE41-NEXT:    retl

diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
index 7999cc4861c83..163486ad41352 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
@@ -468,36 +468,36 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ;
 ; X86-AVX1-LABEL: var_shift_v32i8:
 ; X86-AVX1:       # %bb.0:
-; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; X86-AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm3
-; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X86-AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT:    vpsrlw $4, %xmm3, %xmm4
+; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X86-AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm4
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
 ; X86-AVX1-NEXT:    vpsllw $5, %xmm5, %xmm5
-; X86-AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
-; X86-AVX1-NEXT:    vpsrlw $2, %xmm2, %xmm3
+; X86-AVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm3, %xmm3
+; X86-AVX1-NEXT:    vpsrlw $2, %xmm3, %xmm4
 ; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; X86-AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
+; X86-AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
 ; X86-AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
-; X86-AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
-; X86-AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm3
+; X86-AVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm3, %xmm3
+; X86-AVX1-NEXT:    vpsrlw $1, %xmm3, %xmm4
 ; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; X86-AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
+; X86-AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm4
 ; X86-AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
-; X86-AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
-; X86-AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm3
-; X86-AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
+; X86-AVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm3, %xmm3
+; X86-AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm4
+; X86-AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm2
 ; X86-AVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
-; X86-AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm3
-; X86-AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
+; X86-AVX1-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm2
+; X86-AVX1-NEXT:    vpand %xmm6, %xmm2, %xmm2
 ; X86-AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
-; X86-AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm3
-; X86-AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
+; X86-AVX1-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm2
+; X86-AVX1-NEXT:    vpand %xmm7, %xmm2, %xmm2
 ; X86-AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
-; X86-AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
 ; X86-AVX1-NEXT:    retl
 ;
 ; X86-AVX2-LABEL: var_shift_v32i8:

diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
index 29a27cc74ad3d..bbcd4875cd1b6 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
@@ -246,30 +246,30 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
 ;
 ; X86-AVX1-LABEL: var_shift_v16i16:
 ; X86-AVX1:       # %bb.0:
-; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; X86-AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm2[4,4,5,5,6,6,7,7]
-; X86-AVX1-NEXT:    vpslld $23, %xmm3, %xmm3
-; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
-; X86-AVX1-NEXT:    vpaddd %xmm4, %xmm3, %xmm3
-; X86-AVX1-NEXT:    vcvttps2dq %xmm3, %xmm3
-; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; X86-AVX1-NEXT:    vpslld $23, %xmm2, %xmm2
-; X86-AVX1-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
-; X86-AVX1-NEXT:    vcvttps2dq %xmm2, %xmm2
-; X86-AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
-; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; X86-AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
-; X86-AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm1[4,4,5,5,6,6,7,7]
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; X86-AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4,4,5,5,6,6,7,7]
+; X86-AVX1-NEXT:    vpslld $23, %xmm2, %xmm4
+; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216]
+; X86-AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm4
+; X86-AVX1-NEXT:    vcvttps2dq %xmm4, %xmm4
+; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
 ; X86-AVX1-NEXT:    vpslld $23, %xmm3, %xmm3
-; X86-AVX1-NEXT:    vpaddd %xmm4, %xmm3, %xmm3
+; X86-AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm3
 ; X86-AVX1-NEXT:    vcvttps2dq %xmm3, %xmm3
+; X86-AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; X86-AVX1-NEXT:    vpmullw %xmm3, %xmm4, %xmm3
+; X86-AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm1[4,4,5,5,6,6,7,7]
+; X86-AVX1-NEXT:    vpslld $23, %xmm4, %xmm4
+; X86-AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm4
+; X86-AVX1-NEXT:    vcvttps2dq %xmm4, %xmm4
 ; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
 ; X86-AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
-; X86-AVX1-NEXT:    vpaddd %xmm4, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
 ; X86-AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
-; X86-AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vpackusdw %xmm4, %xmm1, %xmm1
 ; X86-AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
 ; X86-AVX1-NEXT:    retl
 ;
 ; X86-AVX2-LABEL: var_shift_v16i16:

diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
index 924cfc169086a..e8a0ac2808ad4 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
@@ -6186,10 +6186,10 @@ define void @trunc_ssat_v16i32_v16i24(<16 x i32> %x, <16 x i24>* %y) nounwind {
 ; AVX512-NEXT:    shrl $16, %r14d
 ; AVX512-NEXT:    movb %r14b, 26(%rdi)
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vpextrd $3, %xmm0, %edx
-; AVX512-NEXT:    movw %dx, 21(%rdi)
-; AVX512-NEXT:    vpextrd $2, %xmm0, %esi
-; AVX512-NEXT:    movw %si, 18(%rdi)
+; AVX512-NEXT:    vpextrd $3, %xmm0, %esi
+; AVX512-NEXT:    movw %si, 21(%rdi)
+; AVX512-NEXT:    vpextrd $2, %xmm0, %edx
+; AVX512-NEXT:    movw %dx, 18(%rdi)
 ; AVX512-NEXT:    vpextrd $1, %xmm0, %ecx
 ; AVX512-NEXT:    movw %cx, 15(%rdi)
 ; AVX512-NEXT:    vmovd %xmm0, %eax
@@ -6202,10 +6202,10 @@ define void @trunc_ssat_v16i32_v16i24(<16 x i32> %x, <16 x i24>* %y) nounwind {
 ; AVX512-NEXT:    movb %r9b, 5(%rdi)
 ; AVX512-NEXT:    shrl $16, %r8d
 ; AVX512-NEXT:    movb %r8b, 2(%rdi)
-; AVX512-NEXT:    shrl $16, %edx
-; AVX512-NEXT:    movb %dl, 23(%rdi)
 ; AVX512-NEXT:    shrl $16, %esi
-; AVX512-NEXT:    movb %sil, 20(%rdi)
+; AVX512-NEXT:    movb %sil, 23(%rdi)
+; AVX512-NEXT:    shrl $16, %edx
+; AVX512-NEXT:    movb %dl, 20(%rdi)
 ; AVX512-NEXT:    shrl $16, %ecx
 ; AVX512-NEXT:    movb %cl, 17(%rdi)
 ; AVX512-NEXT:    shrl $16, %eax
@@ -6268,10 +6268,10 @@ define void @trunc_ssat_v16i32_v16i24(<16 x i32> %x, <16 x i24>* %y) nounwind {
 ; SKX-NEXT:    shrl $16, %r14d
 ; SKX-NEXT:    movb %r14b, 26(%rdi)
 ; SKX-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; SKX-NEXT:    vpextrd $3, %xmm0, %edx
-; SKX-NEXT:    movw %dx, 21(%rdi)
-; SKX-NEXT:    vpextrd $2, %xmm0, %esi
-; SKX-NEXT:    movw %si, 18(%rdi)
+; SKX-NEXT:    vpextrd $3, %xmm0, %esi
+; SKX-NEXT:    movw %si, 21(%rdi)
+; SKX-NEXT:    vpextrd $2, %xmm0, %edx
+; SKX-NEXT:    movw %dx, 18(%rdi)
 ; SKX-NEXT:    vpextrd $1, %xmm0, %ecx
 ; SKX-NEXT:    movw %cx, 15(%rdi)
 ; SKX-NEXT:    vmovd %xmm0, %eax
@@ -6284,10 +6284,10 @@ define void @trunc_ssat_v16i32_v16i24(<16 x i32> %x, <16 x i24>* %y) nounwind {
 ; SKX-NEXT:    movb %r9b, 5(%rdi)
 ; SKX-NEXT:    shrl $16, %r8d
 ; SKX-NEXT:    movb %r8b, 2(%rdi)
-; SKX-NEXT:    shrl $16, %edx
-; SKX-NEXT:    movb %dl, 23(%rdi)
 ; SKX-NEXT:    shrl $16, %esi
-; SKX-NEXT:    movb %sil, 20(%rdi)
+; SKX-NEXT:    movb %sil, 23(%rdi)
+; SKX-NEXT:    shrl $16, %edx
+; SKX-NEXT:    movb %dl, 20(%rdi)
 ; SKX-NEXT:    shrl $16, %ecx
 ; SKX-NEXT:    movb %cl, 17(%rdi)
 ; SKX-NEXT:    shrl $16, %eax

diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll
index 04065cd8d6a57..7128233d83fe2 100644
--- a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll
+++ b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll
@@ -643,22 +643,22 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ; X32-SSE-NEXT:    pcmpeqd %xmm1, %xmm1
 ; X32-SSE-NEXT:    paddd %xmm0, %xmm1
 ; X32-SSE-NEXT:    pandn %xmm1, %xmm0
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
-; X32-SSE-NEXT:    pand %xmm1, %xmm2
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
-; X32-SSE-NEXT:    pshufb %xmm2, %xmm4
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    pand %xmm2, %xmm3
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
+; X32-SSE-NEXT:    pshufb %xmm3, %xmm4
 ; X32-SSE-NEXT:    psrlw $4, %xmm0
-; X32-SSE-NEXT:    pand %xmm1, %xmm0
-; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
-; X32-SSE-NEXT:    paddb %xmm4, %xmm3
-; X32-SSE-NEXT:    pxor %xmm1, %xmm1
-; X32-SSE-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero
-; X32-SSE-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; X32-SSE-NEXT:    psadbw %xmm1, %xmm3
-; X32-SSE-NEXT:    psadbw %xmm1, %xmm0
-; X32-SSE-NEXT:    packuswb %xmm3, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    pshufb %xmm0, %xmm1
+; X32-SSE-NEXT:    paddb %xmm4, %xmm1
+; X32-SSE-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
+; X32-SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X32-SSE-NEXT:    psadbw %xmm2, %xmm1
+; X32-SSE-NEXT:    psadbw %xmm2, %xmm0
+; X32-SSE-NEXT:    packuswb %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
   %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 0)
   ret <4 x i32> %out
@@ -886,22 +886,22 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
 ; X32-SSE-NEXT:    pcmpeqd %xmm1, %xmm1
 ; X32-SSE-NEXT:    paddd %xmm0, %xmm1
 ; X32-SSE-NEXT:    pandn %xmm1, %xmm0
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
-; X32-SSE-NEXT:    pand %xmm1, %xmm2
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
-; X32-SSE-NEXT:    pshufb %xmm2, %xmm4
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    pand %xmm2, %xmm3
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
+; X32-SSE-NEXT:    pshufb %xmm3, %xmm4
 ; X32-SSE-NEXT:    psrlw $4, %xmm0
-; X32-SSE-NEXT:    pand %xmm1, %xmm0
-; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
-; X32-SSE-NEXT:    paddb %xmm4, %xmm3
-; X32-SSE-NEXT:    pxor %xmm1, %xmm1
-; X32-SSE-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero
-; X32-SSE-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; X32-SSE-NEXT:    psadbw %xmm1, %xmm3
-; X32-SSE-NEXT:    psadbw %xmm1, %xmm0
-; X32-SSE-NEXT:    packuswb %xmm3, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    pshufb %xmm0, %xmm1
+; X32-SSE-NEXT:    paddb %xmm4, %xmm1
+; X32-SSE-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
+; X32-SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X32-SSE-NEXT:    psadbw %xmm2, %xmm1
+; X32-SSE-NEXT:    psadbw %xmm2, %xmm0
+; X32-SSE-NEXT:    packuswb %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
   %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 -1)
   ret <4 x i32> %out

diff --git a/llvm/test/CodeGen/X86/vshift-6.ll b/llvm/test/CodeGen/X86/vshift-6.ll
index 4c76f2b4acd54..e66d913401641 100644
--- a/llvm/test/CodeGen/X86/vshift-6.ll
+++ b/llvm/test/CodeGen/X86/vshift-6.ll
@@ -36,29 +36,29 @@ define <16 x i8> @do_not_crash(i8*, i32*, i64*, i32, i64, i8) {
 ; X86-NEXT:    movdqa %xmm2, %xmm1
 ; X86-NEXT:    pandn %xmm0, %xmm1
 ; X86-NEXT:    por %xmm2, %xmm1
-; X86-NEXT:    pcmpeqd %xmm2, %xmm2
+; X86-NEXT:    pcmpeqd %xmm3, %xmm3
 ; X86-NEXT:    psllw $5, %xmm1
-; X86-NEXT:    pxor %xmm3, %xmm3
+; X86-NEXT:    pxor %xmm2, %xmm2
 ; X86-NEXT:    pxor %xmm0, %xmm0
 ; X86-NEXT:    pcmpgtb %xmm1, %xmm0
-; X86-NEXT:    pxor %xmm0, %xmm2
+; X86-NEXT:    pxor %xmm0, %xmm3
 ; X86-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-NEXT:    por %xmm2, %xmm0
+; X86-NEXT:    por %xmm3, %xmm0
 ; X86-NEXT:    paddb %xmm1, %xmm1
-; X86-NEXT:    pxor %xmm2, %xmm2
-; X86-NEXT:    pcmpgtb %xmm1, %xmm2
-; X86-NEXT:    movdqa %xmm2, %xmm4
+; X86-NEXT:    pxor %xmm3, %xmm3
+; X86-NEXT:    pcmpgtb %xmm1, %xmm3
+; X86-NEXT:    movdqa %xmm3, %xmm4
 ; X86-NEXT:    pandn %xmm0, %xmm4
 ; X86-NEXT:    psllw $2, %xmm0
-; X86-NEXT:    pand %xmm2, %xmm0
+; X86-NEXT:    pand %xmm3, %xmm0
 ; X86-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-NEXT:    por %xmm4, %xmm0
 ; X86-NEXT:    paddb %xmm1, %xmm1
-; X86-NEXT:    pcmpgtb %xmm1, %xmm3
-; X86-NEXT:    movdqa %xmm3, %xmm1
+; X86-NEXT:    pcmpgtb %xmm1, %xmm2
+; X86-NEXT:    movdqa %xmm2, %xmm1
 ; X86-NEXT:    pandn %xmm0, %xmm1
 ; X86-NEXT:    paddb %xmm0, %xmm0
-; X86-NEXT:    pand %xmm3, %xmm0
+; X86-NEXT:    pand %xmm2, %xmm0
 ; X86-NEXT:    por %xmm1, %xmm0
 ; X86-NEXT:    retl
 ;

diff --git a/llvm/test/CodeGen/X86/widen_cast-4.ll b/llvm/test/CodeGen/X86/widen_cast-4.ll
index f317d4b5913c3..2d676bf24bd50 100644
--- a/llvm/test/CodeGen/X86/widen_cast-4.ll
+++ b/llvm/test/CodeGen/X86/widen_cast-4.ll
@@ -20,19 +20,19 @@ define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind {
 ; WIDE-NEXT:  # %bb.2: # %forbody
 ; WIDE-NEXT:    # in Loop: Header=BB0_1 Depth=1
 ; WIDE-NEXT:    movl (%esp), %eax
-; WIDE-NEXT:    leal (,%eax,8), %ecx
-; WIDE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIDE-NEXT:    addl %ecx, %edx
-; WIDE-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; WIDE-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; WIDE-NEXT:    leal (,%eax,8), %edx
+; WIDE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIDE-NEXT:    addl %edx, %ecx
 ; WIDE-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIDE-NEXT:    addl {{[0-9]+}}(%esp), %edx
+; WIDE-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIDE-NEXT:    movq {{.*#+}} xmm3 = mem[0],zero
 ; WIDE-NEXT:    psubb %xmm0, %xmm3
 ; WIDE-NEXT:    psrlw $2, %xmm3
 ; WIDE-NEXT:    pand %xmm1, %xmm3
 ; WIDE-NEXT:    pxor %xmm2, %xmm3
 ; WIDE-NEXT:    psubb %xmm2, %xmm3
-; WIDE-NEXT:    movq %xmm3, (%edx,%eax,8)
+; WIDE-NEXT:    movq %xmm3, (%ecx,%eax,8)
 ; WIDE-NEXT:    incl (%esp)
 ; WIDE-NEXT:    jmp .LBB0_1
 ; WIDE-NEXT:  .LBB0_3: # %afterfor

diff --git a/llvm/test/CodeGen/X86/xmulo.ll b/llvm/test/CodeGen/X86/xmulo.ll
index 2521dfda97ba6..d416b1a547815 100644
--- a/llvm/test/CodeGen/X86/xmulo.ll
+++ b/llvm/test/CodeGen/X86/xmulo.ll
@@ -214,64 +214,63 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, i64* %res) {
 ; WIN32-NEXT:    pushl %esi
 ; WIN32-NEXT:    subl $8, %esp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT:    movl %edx, %ecx
-; WIN32-NEXT:    movl %edx, %ebx
-; WIN32-NEXT:    sarl $31, %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    movl %ecx, %edi
+; WIN32-NEXT:    sarl $31, %edi
 ; WIN32-NEXT:    movl %eax, %esi
-; WIN32-NEXT:    imull %ecx, %esi
-; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    movl %eax, %ebp
+; WIN32-NEXT:    imull %edi, %esi
+; WIN32-NEXT:    mull %edi
+; WIN32-NEXT:    movl %eax, %ebx
 ; WIN32-NEXT:    addl %esi, %edx
-; WIN32-NEXT:    movl %edi, %esi
-; WIN32-NEXT:    imull %edi, %ecx
-; WIN32-NEXT:    addl %edx, %ecx
+; WIN32-NEXT:    movl %ebp, %esi
+; WIN32-NEXT:    imull %ebp, %edi
+; WIN32-NEXT:    addl %edx, %edi
 ; WIN32-NEXT:    sarl $31, %esi
-; WIN32-NEXT:    movl %esi, %edi
-; WIN32-NEXT:    imull %ebx, %edi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    movl %esi, %ebp
+; WIN32-NEXT:    imull %ecx, %ebp
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    mull %ebx
-; WIN32-NEXT:    addl %edi, %edx
-; WIN32-NEXT:    imull %ebx, %esi
+; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    addl %ebp, %edx
+; WIN32-NEXT:    imull %ecx, %esi
 ; WIN32-NEXT:    addl %edx, %esi
-; WIN32-NEXT:    addl %ebp, %eax
+; WIN32-NEXT:    addl %ebx, %eax
 ; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT:    adcl %ecx, %esi
-; WIN32-NEXT:    movl %ebx, %eax
-; WIN32-NEXT:    movl %ebx, %edi
+; WIN32-NEXT:    adcl %edi, %esi
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    movl %ecx, %edi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    movl %edx, %ebx
+; WIN32-NEXT:    movl %edx, %ebp
 ; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    movl %edx, %ebp
+; WIN32-NEXT:    movl %edx, %ebx
 ; WIN32-NEXT:    movl %eax, %ecx
-; WIN32-NEXT:    addl %ebx, %ecx
-; WIN32-NEXT:    adcl $0, %ebp
+; WIN32-NEXT:    addl %ebp, %ecx
+; WIN32-NEXT:    adcl $0, %ebx
 ; WIN32-NEXT:    movl %edi, %eax
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    movl %edx, %ebx
-; WIN32-NEXT:    movl %eax, %edi
-; WIN32-NEXT:    addl %ecx, %edi
-; WIN32-NEXT:    adcl %ebp, %ebx
+; WIN32-NEXT:    movl %edx, %edi
+; WIN32-NEXT:    movl %eax, %ebp
+; WIN32-NEXT:    addl %ecx, %ebp
+; WIN32-NEXT:    adcl %ebx, %edi
 ; WIN32-NEXT:    setb %cl
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    addl %ebx, %eax
+; WIN32-NEXT:    addl %edi, %eax
 ; WIN32-NEXT:    movzbl %cl, %ecx
 ; WIN32-NEXT:    adcl %ecx, %edx
 ; WIN32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
 ; WIN32-NEXT:    adcl %esi, %edx
-; WIN32-NEXT:    movl %edi, %ecx
+; WIN32-NEXT:    movl %ebp, %ecx
 ; WIN32-NEXT:    sarl $31, %ecx
 ; WIN32-NEXT:    xorl %ecx, %edx
 ; WIN32-NEXT:    xorl %eax, %ecx
 ; WIN32-NEXT:    orl %edx, %ecx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl %edi, 4(%eax)
+; WIN32-NEXT:    movl %ebp, 4(%eax)
 ; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; WIN32-NEXT:    movl %ecx, (%eax)
 ; WIN32-NEXT:    setne %al
@@ -473,25 +472,27 @@ define zeroext i1 @umuloi64(i64 %v1, i64 %v2, i64* %res) {
 ; WIN32-NEXT:    pushl %ebx
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; WIN32-NEXT:    testl %esi, %esi
 ; WIN32-NEXT:    setne %dl
 ; WIN32-NEXT:    testl %eax, %eax
 ; WIN32-NEXT:    setne %bl
 ; WIN32-NEXT:    andb %dl, %bl
-; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
+; WIN32-NEXT:    mull %ebp
 ; WIN32-NEXT:    movl %eax, %edi
-; WIN32-NEXT:    seto %cl
+; WIN32-NEXT:    seto %bh
 ; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    movl %ecx, %edx
 ; WIN32-NEXT:    movl %eax, %esi
 ; WIN32-NEXT:    seto %ch
-; WIN32-NEXT:    orb %cl, %ch
+; WIN32-NEXT:    orb %bh, %ch
 ; WIN32-NEXT:    addl %edi, %esi
-; WIN32-NEXT:    movl %ebp, %eax
-; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
+; WIN32-NEXT:    movl %edx, %eax
+; WIN32-NEXT:    mull %ebp
 ; WIN32-NEXT:    addl %esi, %edx
 ; WIN32-NEXT:    setb %cl
 ; WIN32-NEXT:    orb %ch, %cl
@@ -605,30 +606,30 @@ define i64 @smuloselecti64(i64 %v1, i64 %v2) {
 ; WIN32-NEXT:    movl %edx, %esi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    mull %ebp
-; WIN32-NEXT:    movl %edx, %ebp
-; WIN32-NEXT:    movl %eax, %ecx
-; WIN32-NEXT:    addl %esi, %ecx
-; WIN32-NEXT:    adcl $0, %ebp
+; WIN32-NEXT:    movl %edx, %ecx
+; WIN32-NEXT:    movl %eax, %ebp
+; WIN32-NEXT:    addl %esi, %ebp
+; WIN32-NEXT:    adcl $0, %ecx
 ; WIN32-NEXT:    movl %edi, %eax
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    movl %edx, %esi
-; WIN32-NEXT:    movl %eax, %edi
-; WIN32-NEXT:    addl %ecx, %edi
-; WIN32-NEXT:    adcl %ebp, %esi
+; WIN32-NEXT:    movl %edx, %edi
+; WIN32-NEXT:    movl %eax, %esi
+; WIN32-NEXT:    addl %ebp, %esi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT:    adcl %ecx, %edi
 ; WIN32-NEXT:    setb %cl
 ; WIN32-NEXT:    movl %ebp, %eax
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    addl %esi, %eax
+; WIN32-NEXT:    addl %edi, %eax
 ; WIN32-NEXT:    movzbl %cl, %ecx
 ; WIN32-NEXT:    adcl %ecx, %edx
 ; WIN32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
 ; WIN32-NEXT:    adcl %ebx, %edx
-; WIN32-NEXT:    sarl $31, %edi
-; WIN32-NEXT:    xorl %edi, %edx
-; WIN32-NEXT:    xorl %eax, %edi
+; WIN32-NEXT:    sarl $31, %esi
+; WIN32-NEXT:    xorl %esi, %edx
+; WIN32-NEXT:    xorl %eax, %esi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    orl %edx, %edi
+; WIN32-NEXT:    orl %edx, %esi
 ; WIN32-NEXT:    jne LBB12_2
 ; WIN32-NEXT:  # %bb.1:
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -996,60 +997,59 @@ define zeroext i1 @smulobri64(i64 %v1, i64 %v2) {
 ; WIN32-NEXT:    pushl %esi
 ; WIN32-NEXT:    pushl %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT:    movl %edx, %ecx
-; WIN32-NEXT:    movl %edx, %ebx
-; WIN32-NEXT:    sarl $31, %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    movl %ecx, %edi
+; WIN32-NEXT:    sarl $31, %edi
 ; WIN32-NEXT:    movl %eax, %esi
-; WIN32-NEXT:    imull %ecx, %esi
-; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    imull %edi, %esi
+; WIN32-NEXT:    mull %edi
 ; WIN32-NEXT:    movl %eax, %ebp
 ; WIN32-NEXT:    addl %esi, %edx
-; WIN32-NEXT:    movl %edi, %esi
-; WIN32-NEXT:    imull %edi, %ecx
-; WIN32-NEXT:    addl %edx, %ecx
-; WIN32-NEXT:    sarl $31, %esi
-; WIN32-NEXT:    movl %esi, %edi
+; WIN32-NEXT:    movl %ebx, %esi
 ; WIN32-NEXT:    imull %ebx, %edi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    addl %edx, %edi
+; WIN32-NEXT:    sarl $31, %esi
+; WIN32-NEXT:    movl %esi, %ebx
+; WIN32-NEXT:    imull %ecx, %ebx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    mull %ebx
-; WIN32-NEXT:    addl %edi, %edx
-; WIN32-NEXT:    imull %ebx, %esi
+; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    addl %ebx, %edx
+; WIN32-NEXT:    imull %ecx, %esi
 ; WIN32-NEXT:    addl %edx, %esi
 ; WIN32-NEXT:    addl %ebp, %eax
 ; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT:    adcl %ecx, %esi
-; WIN32-NEXT:    movl %ebx, %eax
-; WIN32-NEXT:    movl %ebx, %edi
+; WIN32-NEXT:    adcl %edi, %esi
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    movl %ecx, %edi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    movl %edx, %ebp
+; WIN32-NEXT:    movl %edx, %ebx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    movl %edx, %ebx
+; WIN32-NEXT:    movl %edx, %ebp
 ; WIN32-NEXT:    movl %eax, %ecx
-; WIN32-NEXT:    addl %ebp, %ecx
-; WIN32-NEXT:    adcl $0, %ebx
+; WIN32-NEXT:    addl %ebx, %ecx
+; WIN32-NEXT:    adcl $0, %ebp
 ; WIN32-NEXT:    movl %edi, %eax
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    movl %edx, %ebp
-; WIN32-NEXT:    movl %eax, %edi
-; WIN32-NEXT:    addl %ecx, %edi
-; WIN32-NEXT:    adcl %ebx, %ebp
+; WIN32-NEXT:    movl %edx, %edi
+; WIN32-NEXT:    movl %eax, %ebx
+; WIN32-NEXT:    addl %ecx, %ebx
+; WIN32-NEXT:    adcl %ebp, %edi
 ; WIN32-NEXT:    setb %cl
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    addl %ebp, %eax
+; WIN32-NEXT:    addl %edi, %eax
 ; WIN32-NEXT:    movzbl %cl, %ecx
 ; WIN32-NEXT:    adcl %ecx, %edx
 ; WIN32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
 ; WIN32-NEXT:    adcl %esi, %edx
-; WIN32-NEXT:    sarl $31, %edi
-; WIN32-NEXT:    xorl %edi, %edx
-; WIN32-NEXT:    xorl %eax, %edi
-; WIN32-NEXT:    orl %edx, %edi
+; WIN32-NEXT:    sarl $31, %ebx
+; WIN32-NEXT:    xorl %ebx, %edx
+; WIN32-NEXT:    xorl %eax, %ebx
+; WIN32-NEXT:    orl %edx, %ebx
 ; WIN32-NEXT:    jne LBB18_1
 ; WIN32-NEXT:  # %bb.3: # %continue
 ; WIN32-NEXT:    movb $1, %al
@@ -1322,29 +1322,31 @@ define zeroext i1 @umulobri64(i64 %v1, i64 %v2) {
 ; WIN32-NEXT:    pushl %ebx
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; WIN32-NEXT:    testl %esi, %esi
 ; WIN32-NEXT:    setne %dl
 ; WIN32-NEXT:    testl %eax, %eax
-; WIN32-NEXT:    setne %cl
-; WIN32-NEXT:    andb %dl, %cl
-; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
+; WIN32-NEXT:    setne %bl
+; WIN32-NEXT:    andb %dl, %bl
+; WIN32-NEXT:    mull %ebp
 ; WIN32-NEXT:    movl %eax, %edi
-; WIN32-NEXT:    seto %bl
+; WIN32-NEXT:    seto %bh
 ; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    movl %ecx, %edx
 ; WIN32-NEXT:    movl %eax, %esi
-; WIN32-NEXT:    seto %ch
-; WIN32-NEXT:    orb %bl, %ch
+; WIN32-NEXT:    seto %cl
+; WIN32-NEXT:    orb %bh, %cl
 ; WIN32-NEXT:    addl %edi, %esi
-; WIN32-NEXT:    movl %ebp, %eax
-; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
+; WIN32-NEXT:    movl %edx, %eax
+; WIN32-NEXT:    mull %ebp
 ; WIN32-NEXT:    addl %esi, %edx
 ; WIN32-NEXT:    setb %al
-; WIN32-NEXT:    orb %ch, %al
 ; WIN32-NEXT:    orb %cl, %al
+; WIN32-NEXT:    orb %bl, %al
 ; WIN32-NEXT:    subb $1, %al
 ; WIN32-NEXT:    je LBB22_1
 ; WIN32-NEXT:  # %bb.3: # %continue
@@ -1724,8 +1726,7 @@ define zeroext i1 @smuloi64_load(i64* %ptr1, i64 %v2, i64* %res) {
 ; WIN32-NEXT:    imull %ecx, %edi
 ; WIN32-NEXT:    mull %ecx
 ; WIN32-NEXT:    addl %edi, %edx
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; WIN32-NEXT:    imull %edi, %ecx
+; WIN32-NEXT:    imull {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    addl %edx, %ecx
 ; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -1742,27 +1743,26 @@ define zeroext i1 @smuloi64_load(i64* %ptr1, i64 %v2, i64* %res) {
 ; WIN32-NEXT:    addl %ebx, %edi
 ; WIN32-NEXT:    adcl $0, %ebp
 ; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT:    mull %edx
-; WIN32-NEXT:    movl %edx, %esi
-; WIN32-NEXT:    movl %eax, %ebx
-; WIN32-NEXT:    addl %edi, %ebx
-; WIN32-NEXT:    adcl %ebp, %esi
+; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
+; WIN32-NEXT:    movl %edx, %ebx
+; WIN32-NEXT:    movl %eax, %esi
+; WIN32-NEXT:    addl %edi, %esi
+; WIN32-NEXT:    adcl %ebp, %ebx
 ; WIN32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    addl %esi, %eax
-; WIN32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; WIN32-NEXT:    adcl %esi, %edx
+; WIN32-NEXT:    addl %ebx, %eax
+; WIN32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
+; WIN32-NEXT:    adcl %edi, %edx
 ; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; WIN32-NEXT:    adcl %ecx, %edx
-; WIN32-NEXT:    movl %ebx, %ecx
+; WIN32-NEXT:    movl %esi, %ecx
 ; WIN32-NEXT:    sarl $31, %ecx
 ; WIN32-NEXT:    xorl %ecx, %edx
 ; WIN32-NEXT:    xorl %eax, %ecx
 ; WIN32-NEXT:    orl %edx, %ecx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl %ebx, 4(%eax)
+; WIN32-NEXT:    movl %esi, 4(%eax)
 ; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; WIN32-NEXT:    movl %ecx, (%eax)
 ; WIN32-NEXT:    setne %al
@@ -1811,67 +1811,65 @@ define zeroext i1 @smuloi64_load2(i64 %v1, i64* %ptr2, i64* %res) {
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
 ; WIN32-NEXT:    subl $12, %esp
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl (%eax), %ebp
-; WIN32-NEXT:    movl 4(%eax), %edi
-; WIN32-NEXT:    movl %edx, %ecx
-; WIN32-NEXT:    movl %edx, %ebx
-; WIN32-NEXT:    sarl $31, %ecx
+; WIN32-NEXT:    movl 4(%eax), %ebx
+; WIN32-NEXT:    movl %ecx, %edi
+; WIN32-NEXT:    sarl $31, %edi
 ; WIN32-NEXT:    movl %ebp, %esi
-; WIN32-NEXT:    imull %ecx, %esi
+; WIN32-NEXT:    imull %edi, %esi
 ; WIN32-NEXT:    movl %ebp, %eax
-; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    mull %edi
 ; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; WIN32-NEXT:    addl %esi, %edx
-; WIN32-NEXT:    movl %edi, %esi
-; WIN32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    imull %edi, %ecx
-; WIN32-NEXT:    addl %edx, %ecx
-; WIN32-NEXT:    sarl $31, %esi
-; WIN32-NEXT:    movl %esi, %edi
+; WIN32-NEXT:    movl %ebx, %esi
+; WIN32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; WIN32-NEXT:    imull %ebx, %edi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    addl %edx, %edi
+; WIN32-NEXT:    sarl $31, %esi
+; WIN32-NEXT:    movl %esi, %ebx
+; WIN32-NEXT:    imull %ecx, %ebx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    mull %ebx
-; WIN32-NEXT:    addl %edi, %edx
-; WIN32-NEXT:    imull %ebx, %esi
+; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    addl %ebx, %edx
+; WIN32-NEXT:    imull %ecx, %esi
 ; WIN32-NEXT:    addl %edx, %esi
 ; WIN32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
 ; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    adcl %ecx, %esi
-; WIN32-NEXT:    movl %ebx, %eax
-; WIN32-NEXT:    movl %ebx, %edi
+; WIN32-NEXT:    adcl %edi, %esi
+; WIN32-NEXT:    movl %ecx, %eax
 ; WIN32-NEXT:    mull %ebp
-; WIN32-NEXT:    movl %edx, %ebx
+; WIN32-NEXT:    movl %edx, %edi
 ; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    mull %ebp
-; WIN32-NEXT:    movl %edx, %ebp
+; WIN32-NEXT:    movl %edx, %ebx
 ; WIN32-NEXT:    movl %eax, %ecx
-; WIN32-NEXT:    addl %ebx, %ecx
-; WIN32-NEXT:    adcl $0, %ebp
-; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    addl %edi, %ecx
+; WIN32-NEXT:    adcl $0, %ebx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; WIN32-NEXT:    movl %edx, %ebx
-; WIN32-NEXT:    movl %eax, %edi
-; WIN32-NEXT:    addl %ecx, %edi
-; WIN32-NEXT:    adcl %ebp, %ebx
+; WIN32-NEXT:    movl %edx, %edi
+; WIN32-NEXT:    movl %eax, %ebp
+; WIN32-NEXT:    addl %ecx, %ebp
+; WIN32-NEXT:    adcl %ebx, %edi
 ; WIN32-NEXT:    setb %cl
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; WIN32-NEXT:    addl %ebx, %eax
+; WIN32-NEXT:    addl %edi, %eax
 ; WIN32-NEXT:    movzbl %cl, %ecx
 ; WIN32-NEXT:    adcl %ecx, %edx
 ; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; WIN32-NEXT:    adcl %esi, %edx
-; WIN32-NEXT:    movl %edi, %ecx
+; WIN32-NEXT:    movl %ebp, %ecx
 ; WIN32-NEXT:    sarl $31, %ecx
 ; WIN32-NEXT:    xorl %ecx, %edx
 ; WIN32-NEXT:    xorl %eax, %ecx
 ; WIN32-NEXT:    orl %edx, %ecx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl %edi, 4(%eax)
+; WIN32-NEXT:    movl %ebp, 4(%eax)
 ; WIN32-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; WIN32-NEXT:    movl %ecx, (%eax)
 ; WIN32-NEXT:    setne %al
@@ -2232,11 +2230,11 @@ define zeroext i1 @umuloi64_load(i64* %ptr1, i64 %v2, i64* %res) {
 ; WIN32-NEXT:    pushl %esi
 ; WIN32-NEXT:    pushl %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl (%eax), %esi
+; WIN32-NEXT:    movl (%eax), %ecx
 ; WIN32-NEXT:    movl 4(%eax), %eax
-; WIN32-NEXT:    testl %ecx, %ecx
+; WIN32-NEXT:    testl %esi, %esi
 ; WIN32-NEXT:    setne %dl
 ; WIN32-NEXT:    testl %eax, %eax
 ; WIN32-NEXT:    setne %bl
@@ -2244,15 +2242,15 @@ define zeroext i1 @umuloi64_load(i64* %ptr1, i64 %v2, i64* %res) {
 ; WIN32-NEXT:    mull %ebp
 ; WIN32-NEXT:    movl %eax, %edi
 ; WIN32-NEXT:    seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; WIN32-NEXT:    movl %ecx, %eax
-; WIN32-NEXT:    mull %esi
-; WIN32-NEXT:    movl %eax, %ecx
+; WIN32-NEXT:    movl %esi, %eax
+; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    movl %eax, %esi
 ; WIN32-NEXT:    seto %bh
 ; WIN32-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload
-; WIN32-NEXT:    addl %edi, %ecx
-; WIN32-NEXT:    movl %esi, %eax
+; WIN32-NEXT:    addl %edi, %esi
+; WIN32-NEXT:    movl %ecx, %eax
 ; WIN32-NEXT:    mull %ebp
-; WIN32-NEXT:    addl %ecx, %edx
+; WIN32-NEXT:    addl %esi, %edx
 ; WIN32-NEXT:    setb %cl
 ; WIN32-NEXT:    orb %bh, %cl
 ; WIN32-NEXT:    orb %bl, %cl
@@ -2311,34 +2309,37 @@ define zeroext i1 @umuloi64_load2(i64 %v1, i64* %ptr2, i64* %res) {
 ; WIN32-NEXT:    pushl %ebx
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
+; WIN32-NEXT:    pushl %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT:    movl (%edx), %ebp
+; WIN32-NEXT:    movl (%edx), %ecx
 ; WIN32-NEXT:    movl 4(%edx), %esi
 ; WIN32-NEXT:    testl %eax, %eax
 ; WIN32-NEXT:    setne %dl
 ; WIN32-NEXT:    testl %esi, %esi
 ; WIN32-NEXT:    setne %bl
 ; WIN32-NEXT:    andb %dl, %bl
-; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    mull %ecx
 ; WIN32-NEXT:    movl %eax, %edi
-; WIN32-NEXT:    seto %cl
+; WIN32-NEXT:    seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
+; WIN32-NEXT:    mull %ebp
 ; WIN32-NEXT:    movl %eax, %esi
-; WIN32-NEXT:    seto %ch
-; WIN32-NEXT:    orb %cl, %ch
+; WIN32-NEXT:    seto %bh
+; WIN32-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload
 ; WIN32-NEXT:    addl %edi, %esi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    movl %ebp, %eax
+; WIN32-NEXT:    mull %ecx
 ; WIN32-NEXT:    addl %esi, %edx
 ; WIN32-NEXT:    setb %cl
-; WIN32-NEXT:    orb %ch, %cl
+; WIN32-NEXT:    orb %bh, %cl
 ; WIN32-NEXT:    orb %bl, %cl
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; WIN32-NEXT:    movl %eax, (%esi)
 ; WIN32-NEXT:    movl %edx, 4(%esi)
 ; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    addl $4, %esp
 ; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    popl %edi
 ; WIN32-NEXT:    popl %ebx
