[llvm] 1bb4306 - PeepholeOpt: Allow introducing subregister uses on reg_sequence (#127052)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 21 18:16:19 PST 2025
Author: Matt Arsenault
Date: 2025-02-22T09:16:14+07:00
New Revision: 1bb43068f187141f20066a83675531161ed45051
URL: https://github.com/llvm/llvm-project/commit/1bb43068f187141f20066a83675531161ed45051
DIFF: https://github.com/llvm/llvm-project/commit/1bb43068f187141f20066a83675531161ed45051.diff
LOG: PeepholeOpt: Allow introducing subregister uses on reg_sequence (#127052)
This reverts d246cc618adc52fdbd69d44a2a375c8af97b6106. We now handle
composing subregister extracts through reg_sequence.
Added:
Modified:
llvm/lib/CodeGen/PeepholeOptimizer.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
llvm/test/CodeGen/AMDGPU/call-argument-types.ll
llvm/test/CodeGen/AMDGPU/calling-conventions.ll
llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
llvm/test/CodeGen/AMDGPU/ctpop64.ll
llvm/test/CodeGen/AMDGPU/div_v2i128.ll
llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
llvm/test/CodeGen/AMDGPU/fptrunc.ll
llvm/test/CodeGen/AMDGPU/function-args.ll
llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
llvm/test/CodeGen/AMDGPU/idot4s.ll
llvm/test/CodeGen/AMDGPU/idot4u.ll
llvm/test/CodeGen/AMDGPU/idot8u.ll
llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
llvm/test/CodeGen/AMDGPU/kernel-args.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
llvm/test/CodeGen/AMDGPU/llvm.exp.ll
llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
llvm/test/CodeGen/AMDGPU/load-global-i16.ll
llvm/test/CodeGen/AMDGPU/load-global-i32.ll
llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll
llvm/test/CodeGen/AMDGPU/mul.ll
llvm/test/CodeGen/AMDGPU/mul_int24.ll
llvm/test/CodeGen/AMDGPU/select.f16.ll
llvm/test/CodeGen/AMDGPU/shl.ll
llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll
llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
llvm/test/CodeGen/AMDGPU/spill-vgpr.ll
llvm/test/CodeGen/AMDGPU/sra.ll
llvm/test/CodeGen/AMDGPU/srl.ll
llvm/test/CodeGen/AMDGPU/udiv.ll
llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll
llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll
llvm/test/CodeGen/Thumb2/mve-shuffle.ll
llvm/test/CodeGen/Thumb2/mve-vabdus.ll
llvm/test/CodeGen/Thumb2/mve-vld2.ll
llvm/test/CodeGen/Thumb2/mve-vld3.ll
llvm/test/CodeGen/Thumb2/mve-vld4.ll
llvm/test/CodeGen/Thumb2/mve-vldst4.ll
llvm/test/CodeGen/Thumb2/mve-vst2.ll
llvm/test/CodeGen/Thumb2/mve-vst3.ll
llvm/test/CodeGen/Thumb2/mve-vst4-post.ll
llvm/test/CodeGen/Thumb2/mve-vst4.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
index 24bd9938bc45c..5416cdd39aaf3 100644
--- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
@@ -421,12 +421,6 @@ class RegSequenceRewriter : public Rewriter {
}
bool RewriteCurrentSource(Register NewReg, unsigned NewSubReg) override {
- // Do not introduce new subregister uses in a reg_sequence. Until composing
- // subregister indices is supported while folding, we're just blocking
- // folding of subregister copies later in the function.
- if (NewSubReg)
- return false;
-
MachineOperand &MO = CopyLike.getOperand(CurrentSrcIdx);
MO.setReg(NewReg);
MO.setSubReg(NewSubReg);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index d41601cc0d76e..40f29c56c8f12 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -1635,7 +1635,6 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0
-; GFX9-NEXT: v_mov_b32_e32 v7, s11
; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2]
; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0
; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2]
@@ -1683,32 +1682,33 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v5, 0
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT: v_add3_u32 v6, v3, v2, v6
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v6, v[1:2]
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s10, v0
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s9, v5, v[1:2]
+; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2]
+; GFX9-NEXT: v_mov_b32_e32 v6, s11
+; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, s10, v0
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v5, v[1:2]
; GFX9-NEXT: v_mov_b32_e32 v4, s9
; GFX9-NEXT: s_ashr_i32 s10, s3, 31
-; GFX9-NEXT: v_subb_co_u32_e64 v1, s[0:1], v7, v2, vcc
-; GFX9-NEXT: v_sub_u32_e32 v2, s11, v2
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v0
-; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v1
-; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s8, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v7, v3, v7, s[0:1]
-; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v2, vcc
-; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v5
-; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v6, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v9
-; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8
+; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v1, vcc
+; GFX9-NEXT: v_sub_u32_e32 v0, s11, v1
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7
+; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6
+; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s8, v7
+; GFX9-NEXT: v_cndmask_b32_e64 v8, v1, v2, s[0:1]
+; GFX9-NEXT: v_subbrev_co_u32_e64 v10, s[0:1], 0, v0, vcc
+; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], 1, v5
+; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1]
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v10
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v9
; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v9
-; GFX9-NEXT: v_cndmask_b32_e64 v12, v3, v12, s[0:1]
-; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v10
+; GFX9-NEXT: v_cndmask_b32_e64 v12, v1, v12, s[0:1]
+; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v2
; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1]
; GFX9-NEXT: s_add_u32 s0, s18, s6
; GFX9-NEXT: s_addc_u32 s1, s19, s6
@@ -1716,116 +1716,116 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: s_mov_b32 s11, s10
; GFX9-NEXT: s_addc_u32 s3, s3, s10
; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11]
-; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s3
+; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3
; GFX9-NEXT: v_cvt_f32_u32_e32 v15, s2
-; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc
-; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f800000, v3
-; GFX9-NEXT: v_add_f32_e32 v3, v3, v15
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; GFX9-NEXT: v_subrev_co_u32_e32 v15, vcc, s8, v8
-; GFX9-NEXT: v_subbrev_co_u32_e32 v16, vcc, 0, v2, vcc
-; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v3
-; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
-; GFX9-NEXT: v_trunc_f32_e32 v4, v3
-; GFX9-NEXT: v_mul_f32_e32 v3, 0xcf800000, v4
-; GFX9-NEXT: v_add_f32_e32 v2, v3, v2
-; GFX9-NEXT: v_cvt_u32_f32_e32 v17, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc
+; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1
+; GFX9-NEXT: v_add_f32_e32 v1, v1, v15
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
+; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s8, v9
+; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc
+; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1
+; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
+; GFX9-NEXT: v_trunc_f32_e32 v16, v1
+; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v16
+; GFX9-NEXT: v_add_f32_e32 v0, v1, v0
+; GFX9-NEXT: v_cvt_u32_f32_e32 v17, v0
; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7]
; GFX9-NEXT: s_sub_u32 s5, 0, s2
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v17, 0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v4
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v17, 0
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v2, v13, vcc
+; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v16
; GFX9-NEXT: s_subb_u32 s20, 0, s3
; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v12, v[3:4]
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7
-; GFX9-NEXT: v_mul_lo_u32 v7, v12, v2
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[18:19], s20, v17, v[3:4]
-; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v11, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v15, vcc
-; GFX9-NEXT: v_mul_lo_u32 v8, v17, v3
-; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[0:1]
-; GFX9-NEXT: v_mul_hi_u32 v10, v17, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v16, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8
-; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v10
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v10, v12, v3
-; GFX9-NEXT: v_mul_hi_u32 v2, v12, v2
-; GFX9-NEXT: v_add_u32_e32 v7, v8, v7
-; GFX9-NEXT: v_mul_hi_u32 v8, v17, v3
-; GFX9-NEXT: v_mul_hi_u32 v3, v12, v3
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v13, v[1:2]
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8
+; GFX9-NEXT: v_cndmask_b32_e64 v8, v3, v11, s[0:1]
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s20, v17, v[1:2]
+; GFX9-NEXT: v_mul_lo_u32 v2, v13, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v15, vcc
+; GFX9-NEXT: v_mul_lo_u32 v3, v17, v1
+; GFX9-NEXT: v_mul_hi_u32 v10, v17, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, v13, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[0:1]
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v10
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT: v_mul_lo_u32 v10, v13, v1
+; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT: v_mul_hi_u32 v3, v17, v1
+; GFX9-NEXT: v_mul_hi_u32 v1, v13, v1
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0
; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v8
-; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7
-; GFX9-NEXT: v_add_u32_e32 v8, v10, v8
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v3, v8, v7, v3
-; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v17, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v12, v3, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v7, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v6, v0, v6, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v9, v1, v9, s[0:1]
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_add_u32_e32 v3, v10, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v17, v0
+; GFX9-NEXT: v_add3_u32 v1, v3, v2, v1
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v10, 0
+; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v13, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v0, v3
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v8, v[0:1]
-; GFX9-NEXT: v_xor_b32_e32 v10, s17, v4
+; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v4, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1]
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v11, v[0:1]
; GFX9-NEXT: v_xor_b32_e32 v5, s16, v5
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v7, v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v11, s17
+; GFX9-NEXT: v_xor_b32_e32 v8, s17, v8
+; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v10, v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v9, s17
; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v5
-; GFX9-NEXT: v_xor_b32_e32 v4, s4, v6
-; GFX9-NEXT: v_mul_lo_u32 v5, v8, v2
-; GFX9-NEXT: v_mul_lo_u32 v6, v7, v3
-; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v10, v11, vcc
-; GFX9-NEXT: v_mul_hi_u32 v10, v7, v2
-; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v10
+; GFX9-NEXT: v_xor_b32_e32 v4, s4, v7
+; GFX9-NEXT: v_mul_lo_u32 v5, v11, v2
+; GFX9-NEXT: v_mul_lo_u32 v7, v10, v3
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v8, v9, vcc
+; GFX9-NEXT: v_mul_hi_u32 v8, v10, v2
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7
+; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v10, v8, v3
-; GFX9-NEXT: v_mul_hi_u32 v2, v8, v2
-; GFX9-NEXT: v_add_u32_e32 v5, v6, v5
-; GFX9-NEXT: v_mul_hi_u32 v6, v7, v3
-; GFX9-NEXT: v_mul_hi_u32 v3, v8, v3
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX9-NEXT: v_mul_lo_u32 v8, v11, v3
+; GFX9-NEXT: v_mul_hi_u32 v2, v11, v2
+; GFX9-NEXT: v_add_u32_e32 v5, v7, v5
+; GFX9-NEXT: v_mul_hi_u32 v7, v10, v3
+; GFX9-NEXT: v_mul_hi_u32 v3, v11, v3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7
+; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
-; GFX9-NEXT: v_add_u32_e32 v6, v10, v6
+; GFX9-NEXT: v_add_u32_e32 v7, v8, v7
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v3, v6, v5, v3
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v7, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc
+; GFX9-NEXT: v_add3_u32 v3, v7, v5, v3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v11, v3, vcc
; GFX9-NEXT: v_mul_lo_u32 v5, s9, v2
-; GFX9-NEXT: v_mul_lo_u32 v6, s8, v3
-; GFX9-NEXT: v_mul_hi_u32 v8, s8, v2
+; GFX9-NEXT: v_mul_lo_u32 v7, s8, v3
+; GFX9-NEXT: v_mul_hi_u32 v9, s8, v2
; GFX9-NEXT: v_mul_hi_u32 v2, s9, v2
; GFX9-NEXT: v_mul_hi_u32 v12, s9, v3
-; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7
+; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v8, s9, v3
-; GFX9-NEXT: v_add_u32_e32 v5, v6, v5
-; GFX9-NEXT: v_mul_hi_u32 v6, s8, v3
-; GFX9-NEXT: v_xor_b32_e32 v9, s4, v9
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX9-NEXT: v_mul_lo_u32 v9, s9, v3
+; GFX9-NEXT: v_add_u32_e32 v5, v7, v5
+; GFX9-NEXT: v_mul_hi_u32 v7, s8, v3
+; GFX9-NEXT: v_xor_b32_e32 v6, s4, v6
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7
+; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v2, v5
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0
+; GFX9-NEXT: v_mov_b32_e32 v8, s4
; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GFX9-NEXT: v_add_u32_e32 v6, v8, v6
-; GFX9-NEXT: v_mov_b32_e32 v7, s4
; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s4, v4
+; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v8, vcc
+; GFX9-NEXT: v_add_u32_e32 v6, v9, v7
; GFX9-NEXT: v_add3_u32 v8, v6, v11, v12
-; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v9, v7, vcc
; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v8, v[3:4]
; GFX9-NEXT: v_mov_b32_e32 v9, s9
; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s8, v2
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 8bb8ecb079a34..bc89a186db010 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -2218,31 +2218,31 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: s_clause 0x1
; GFX1264-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1264-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1264-NEXT: s_mov_b64 s[6:7], exec
-; GFX1264-NEXT: s_mov_b32 s11, 0
-; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1264-NEXT: s_mov_b64 s[8:9], exec
+; GFX1264-NEXT: s_mov_b32 s11, 0
+; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
+; GFX1264-NEXT: s_mov_b64 s[6:7], exec
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0
; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1264-NEXT: s_cbranch_execz .LBB4_2
; GFX1264-NEXT: ; %bb.1:
-; GFX1264-NEXT: s_bcnt1_i32_b64 s10, s[6:7]
-; GFX1264-NEXT: s_mov_b32 s15, 0x31016000
+; GFX1264-NEXT: s_bcnt1_i32_b64 s10, s[8:9]
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: s_mul_u64 s[6:7], s[4:5], s[10:11]
-; GFX1264-NEXT: s_mov_b32 s14, -1
+; GFX1264-NEXT: s_mul_u64 s[8:9], s[4:5], s[10:11]
+; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
; GFX1264-NEXT: s_wait_alu 0xfffe
-; GFX1264-NEXT: v_mov_b32_e32 v0, s6
-; GFX1264-NEXT: v_mov_b32_e32 v1, s7
-; GFX1264-NEXT: s_mov_b32 s12, s2
-; GFX1264-NEXT: s_mov_b32 s13, s3
-; GFX1264-NEXT: buffer_atomic_add_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1264-NEXT: v_mov_b32_e32 v0, s8
+; GFX1264-NEXT: v_mov_b32_e32 v1, s9
+; GFX1264-NEXT: s_mov_b32 s10, -1
+; GFX1264-NEXT: s_mov_b32 s8, s2
+; GFX1264-NEXT: s_mov_b32 s9, s3
+; GFX1264-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1264-NEXT: s_wait_loadcnt 0x0
; GFX1264-NEXT: global_inv scope:SCOPE_DEV
; GFX1264-NEXT: .LBB4_2:
-; GFX1264-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX1264-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX1264-NEXT: s_wait_kmcnt 0x0
; GFX1264-NEXT: v_readfirstlane_b32 s3, v1
; GFX1264-NEXT: v_readfirstlane_b32 s2, v0
@@ -5800,31 +5800,31 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: s_clause 0x1
; GFX1264-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1264-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1264-NEXT: s_mov_b64 s[6:7], exec
-; GFX1264-NEXT: s_mov_b32 s11, 0
-; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1264-NEXT: s_mov_b64 s[8:9], exec
+; GFX1264-NEXT: s_mov_b32 s11, 0
+; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
+; GFX1264-NEXT: s_mov_b64 s[6:7], exec
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0
; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1264-NEXT: s_cbranch_execz .LBB10_2
; GFX1264-NEXT: ; %bb.1:
-; GFX1264-NEXT: s_bcnt1_i32_b64 s10, s[6:7]
-; GFX1264-NEXT: s_mov_b32 s15, 0x31016000
+; GFX1264-NEXT: s_bcnt1_i32_b64 s10, s[8:9]
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: s_mul_u64 s[6:7], s[4:5], s[10:11]
-; GFX1264-NEXT: s_mov_b32 s14, -1
+; GFX1264-NEXT: s_mul_u64 s[8:9], s[4:5], s[10:11]
+; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
; GFX1264-NEXT: s_wait_alu 0xfffe
-; GFX1264-NEXT: v_mov_b32_e32 v0, s6
-; GFX1264-NEXT: v_mov_b32_e32 v1, s7
-; GFX1264-NEXT: s_mov_b32 s12, s2
-; GFX1264-NEXT: s_mov_b32 s13, s3
-; GFX1264-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1264-NEXT: v_mov_b32_e32 v0, s8
+; GFX1264-NEXT: v_mov_b32_e32 v1, s9
+; GFX1264-NEXT: s_mov_b32 s10, -1
+; GFX1264-NEXT: s_mov_b32 s8, s2
+; GFX1264-NEXT: s_mov_b32 s9, s3
+; GFX1264-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1264-NEXT: s_wait_loadcnt 0x0
; GFX1264-NEXT: global_inv scope:SCOPE_DEV
; GFX1264-NEXT: .LBB10_2:
-; GFX1264-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX1264-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX1264-NEXT: s_wait_kmcnt 0x0
; GFX1264-NEXT: v_mad_co_u64_u32 v[3:4], null, s4, v2, 0
; GFX1264-NEXT: v_readfirstlane_b32 s2, v0
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 16fe85bf138b2..9bbecacd6c774 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -1147,12 +1147,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 {
; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT: s_mov_b32 s38, -1
; VI-NEXT: s_mov_b32 s39, 0xe80000
-; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
-; VI-NEXT: s_mov_b32 s0, 0
; VI-NEXT: s_add_u32 s36, s36, s3
+; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: s_mov_b32 s1, s0
; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
@@ -1170,12 +1169,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 {
; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT: s_mov_b32 s38, -1
; CI-NEXT: s_mov_b32 s39, 0xe8f000
-; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
-; CI-NEXT: s_mov_b32 s0, 0
; CI-NEXT: s_add_u32 s36, s36, s3
+; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT: s_mov_b64 s[0:1], 0
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
-; CI-NEXT: s_mov_b32 s1, s0
; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
@@ -1193,12 +1191,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 {
; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT: s_mov_b32 s38, -1
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: s_add_u32 s36, s36, s3
+; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: s_mov_b32 s1, s0
; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
@@ -1212,10 +1209,9 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 {
;
; GFX11-LABEL: test_call_external_void_func_v2i64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_mov_b64 s[4:5], 0
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s5, s4
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i64@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i64@rel32@hi+12
@@ -1229,11 +1225,10 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 {
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s6, s6, s9
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT: s_mov_b32 s8, 0
; HSA-NEXT: s_add_u32 s0, s0, s9
+; HSA-NEXT: s_mov_b64 s[8:9], 0
; HSA-NEXT: s_mov_b32 s11, 0x1100f000
; HSA-NEXT: s_mov_b32 s10, -1
-; HSA-NEXT: s_mov_b32 s9, s8
; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; HSA-NEXT: s_addc_u32 s1, s1, 0
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
@@ -1357,12 +1352,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT: s_mov_b32 s38, -1
; VI-NEXT: s_mov_b32 s39, 0xe80000
-; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
-; VI-NEXT: s_mov_b32 s0, 0
; VI-NEXT: s_add_u32 s36, s36, s3
+; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: s_mov_b32 s1, s0
; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
@@ -1382,12 +1376,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT: s_mov_b32 s38, -1
; CI-NEXT: s_mov_b32 s39, 0xe8f000
-; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
-; CI-NEXT: s_mov_b32 s0, 0
; CI-NEXT: s_add_u32 s36, s36, s3
+; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT: s_mov_b64 s[0:1], 0
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
-; CI-NEXT: s_mov_b32 s1, s0
; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
@@ -1407,12 +1400,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT: s_mov_b32 s38, -1
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: s_add_u32 s36, s36, s3
+; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: s_mov_b32 s1, s0
; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
@@ -1428,10 +1420,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
;
; GFX11-LABEL: test_call_external_void_func_v3i64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_mov_b64 s[4:5], 0
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s5, s4
; GFX11-NEXT: v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 2
; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
@@ -1446,11 +1437,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s6, s6, s9
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT: s_mov_b32 s8, 0
; HSA-NEXT: s_add_u32 s0, s0, s9
+; HSA-NEXT: s_mov_b64 s[8:9], 0
; HSA-NEXT: s_mov_b32 s11, 0x1100f000
; HSA-NEXT: s_mov_b32 s10, -1
-; HSA-NEXT: s_mov_b32 s9, s8
; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; HSA-NEXT: s_addc_u32 s1, s1, 0
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
@@ -1477,12 +1467,11 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 {
; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT: s_mov_b32 s38, -1
; VI-NEXT: s_mov_b32 s39, 0xe80000
-; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
-; VI-NEXT: s_mov_b32 s0, 0
; VI-NEXT: s_add_u32 s36, s36, s3
+; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: s_mov_b32 s1, s0
; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
@@ -1504,12 +1493,11 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 {
; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT: s_mov_b32 s38, -1
; CI-NEXT: s_mov_b32 s39, 0xe8f000
-; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
-; CI-NEXT: s_mov_b32 s0, 0
; CI-NEXT: s_add_u32 s36, s36, s3
+; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT: s_mov_b64 s[0:1], 0
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
-; CI-NEXT: s_mov_b32 s1, s0
; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
@@ -1531,12 +1519,11 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 {
; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT: s_mov_b32 s38, -1
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: s_add_u32 s36, s36, s3
+; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: s_mov_b32 s1, s0
; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
@@ -1554,10 +1541,9 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 {
;
; GFX11-LABEL: test_call_external_void_func_v4i64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_mov_b64 s[4:5], 0
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s5, s4
; GFX11-NEXT: v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 2
; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0
; GFX11-NEXT: v_dual_mov_b32 v6, 3 :: v_dual_mov_b32 v7, 4
@@ -1573,11 +1559,10 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 {
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s6, s6, s9
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT: s_mov_b32 s8, 0
; HSA-NEXT: s_add_u32 s0, s0, s9
+; HSA-NEXT: s_mov_b64 s[8:9], 0
; HSA-NEXT: s_mov_b32 s11, 0x1100f000
; HSA-NEXT: s_mov_b32 s10, -1
-; HSA-NEXT: s_mov_b32 s9, s8
; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; HSA-NEXT: s_addc_u32 s1, s1, 0
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
index 56ecfa298a348..1e740b79dcdbb 100644
--- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -1028,18 +1028,17 @@ entry:
define amdgpu_kernel void @amd_kernel_v2i8(<2 x i8> %arg0) {
; SI-LABEL: amd_kernel_v2i8:
; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_load_dword s1, s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: s_load_dword s2, s[4:5], 0x9
+; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_bfe_u32 s2, s1, 0x80008
-; SI-NEXT: s_add_i32 s1, s1, s1
-; SI-NEXT: s_and_b32 s1, s1, 0xff
+; SI-NEXT: s_bfe_u32 s4, s2, 0x80008
; SI-NEXT: s_add_i32 s2, s2, s2
-; SI-NEXT: s_lshl_b32 s2, s2, 8
-; SI-NEXT: s_or_b32 s4, s1, s2
+; SI-NEXT: s_and_b32 s2, s2, 0xff
+; SI-NEXT: s_add_i32 s4, s4, s4
+; SI-NEXT: s_lshl_b32 s4, s4, 8
+; SI-NEXT: s_or_b32 s4, s2, s4
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: s_mov_b32 s1, s0
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
@@ -1084,28 +1083,27 @@ entry:
define amdgpu_kernel void @amd_kernel_v4i8(<4 x i8> %arg0) {
; SI-LABEL: amd_kernel_v4i8:
; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_load_dword s1, s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: s_load_dword s2, s[4:5], 0x9
+; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_lshr_b32 s2, s1, 16
-; SI-NEXT: s_lshr_b32 s4, s1, 24
-; SI-NEXT: s_bfe_u32 s5, s1, 0x80008
-; SI-NEXT: s_add_i32 s1, s1, s1
-; SI-NEXT: s_add_i32 s4, s4, s4
+; SI-NEXT: s_lshr_b32 s4, s2, 16
+; SI-NEXT: s_lshr_b32 s5, s2, 24
+; SI-NEXT: s_bfe_u32 s6, s2, 0x80008
; SI-NEXT: s_add_i32 s2, s2, s2
-; SI-NEXT: s_and_b32 s1, s1, 0xff
; SI-NEXT: s_add_i32 s5, s5, s5
-; SI-NEXT: s_lshl_b32 s4, s4, 24
+; SI-NEXT: s_add_i32 s4, s4, s4
; SI-NEXT: s_and_b32 s2, s2, 0xff
-; SI-NEXT: s_lshl_b32 s5, s5, 8
-; SI-NEXT: s_lshl_b32 s2, s2, 16
-; SI-NEXT: s_or_b32 s1, s1, s5
-; SI-NEXT: s_or_b32 s2, s4, s2
-; SI-NEXT: s_and_b32 s1, s1, 0xffff
-; SI-NEXT: s_or_b32 s4, s1, s2
+; SI-NEXT: s_add_i32 s6, s6, s6
+; SI-NEXT: s_lshl_b32 s5, s5, 24
+; SI-NEXT: s_and_b32 s4, s4, 0xff
+; SI-NEXT: s_lshl_b32 s6, s6, 8
+; SI-NEXT: s_lshl_b32 s4, s4, 16
+; SI-NEXT: s_or_b32 s2, s2, s6
+; SI-NEXT: s_or_b32 s4, s5, s4
+; SI-NEXT: s_and_b32 s2, s2, 0xffff
+; SI-NEXT: s_or_b32 s4, s2, s4
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: s_mov_b32 s1, s0
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
@@ -1171,8 +1169,7 @@ define amdgpu_kernel void @amd_kernel_v3i8(<3 x i8> %arg0) {
; SI-LABEL: amd_kernel_v3i8:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dword s4, s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s1, 0
-; SI-NEXT: s_mov_b32 s0, 2
+; SI-NEXT: s_mov_b64 s[0:1], 2
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -1184,9 +1181,9 @@ define amdgpu_kernel void @amd_kernel_v3i8(<3 x i8> %arg0) {
; SI-NEXT: s_add_i32 s5, s5, s5
; SI-NEXT: s_lshl_b32 s6, s6, 8
; SI-NEXT: v_mov_b32_e32 v0, s5
-; SI-NEXT: s_or_b32 s4, s4, s6
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
-; SI-NEXT: s_mov_b32 s0, s1
+; SI-NEXT: s_or_b32 s4, s4, s6
+; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -1244,8 +1241,7 @@ define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) {
; SI-LABEL: amd_kernel_v5i8:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s1, 0
-; SI-NEXT: s_mov_b32 s0, 4
+; SI-NEXT: s_mov_b64 s[0:1], 4
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -1259,16 +1255,16 @@ define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) {
; SI-NEXT: s_and_b32 s4, s4, 0xff
; SI-NEXT: s_add_i32 s8, s8, s8
; SI-NEXT: v_mov_b32_e32 v0, s5
-; SI-NEXT: s_lshl_b32 s5, s7, 24
-; SI-NEXT: s_and_b32 s6, s6, 0xff
-; SI-NEXT: s_lshl_b32 s7, s8, 8
-; SI-NEXT: s_lshl_b32 s6, s6, 16
-; SI-NEXT: s_or_b32 s4, s4, s7
-; SI-NEXT: s_or_b32 s5, s5, s6
-; SI-NEXT: s_and_b32 s4, s4, 0xffff
-; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
-; SI-NEXT: s_mov_b32 s0, s1
+; SI-NEXT: s_lshl_b32 s0, s7, 24
+; SI-NEXT: s_and_b32 s1, s6, 0xff
+; SI-NEXT: s_lshl_b32 s5, s8, 8
+; SI-NEXT: s_lshl_b32 s1, s1, 16
+; SI-NEXT: s_or_b32 s4, s4, s5
+; SI-NEXT: s_or_b32 s0, s0, s1
+; SI-NEXT: s_and_b32 s1, s4, 0xffff
+; SI-NEXT: s_or_b32 s4, s1, s0
+; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -1345,45 +1341,44 @@ define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) {
; SI-LABEL: amd_kernel_v8i8:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_lshr_b32 s1, s4, 16
-; SI-NEXT: s_lshr_b32 s2, s4, 24
-; SI-NEXT: s_lshr_b32 s6, s5, 16
-; SI-NEXT: s_lshr_b32 s7, s5, 24
-; SI-NEXT: s_bfe_u32 s8, s4, 0x80008
-; SI-NEXT: s_bfe_u32 s9, s5, 0x80008
+; SI-NEXT: s_lshr_b32 s2, s4, 16
+; SI-NEXT: s_lshr_b32 s6, s4, 24
+; SI-NEXT: s_lshr_b32 s7, s5, 16
+; SI-NEXT: s_lshr_b32 s8, s5, 24
+; SI-NEXT: s_bfe_u32 s9, s4, 0x80008
+; SI-NEXT: s_bfe_u32 s10, s5, 0x80008
; SI-NEXT: s_add_i32 s5, s5, s5
; SI-NEXT: s_add_i32 s4, s4, s4
+; SI-NEXT: s_add_i32 s8, s8, s8
; SI-NEXT: s_add_i32 s7, s7, s7
-; SI-NEXT: s_add_i32 s6, s6, s6
; SI-NEXT: s_and_b32 s5, s5, 0xff
-; SI-NEXT: s_add_i32 s9, s9, s9
+; SI-NEXT: s_add_i32 s10, s10, s10
+; SI-NEXT: s_add_i32 s6, s6, s6
; SI-NEXT: s_add_i32 s2, s2, s2
-; SI-NEXT: s_add_i32 s1, s1, s1
; SI-NEXT: s_and_b32 s4, s4, 0xff
-; SI-NEXT: s_add_i32 s8, s8, s8
-; SI-NEXT: s_lshl_b32 s7, s7, 24
-; SI-NEXT: s_and_b32 s6, s6, 0xff
+; SI-NEXT: s_add_i32 s9, s9, s9
+; SI-NEXT: s_lshl_b32 s8, s8, 24
+; SI-NEXT: s_and_b32 s7, s7, 0xff
+; SI-NEXT: s_lshl_b32 s10, s10, 8
+; SI-NEXT: s_lshl_b32 s6, s6, 24
+; SI-NEXT: s_and_b32 s2, s2, 0xff
; SI-NEXT: s_lshl_b32 s9, s9, 8
-; SI-NEXT: s_lshl_b32 s2, s2, 24
-; SI-NEXT: s_and_b32 s1, s1, 0xff
-; SI-NEXT: s_lshl_b32 s8, s8, 8
-; SI-NEXT: s_lshl_b32 s6, s6, 16
-; SI-NEXT: s_or_b32 s5, s5, s9
-; SI-NEXT: s_lshl_b32 s1, s1, 16
-; SI-NEXT: s_or_b32 s4, s4, s8
-; SI-NEXT: s_or_b32 s6, s7, s6
+; SI-NEXT: s_lshl_b32 s7, s7, 16
+; SI-NEXT: s_or_b32 s5, s5, s10
+; SI-NEXT: s_lshl_b32 s2, s2, 16
+; SI-NEXT: s_or_b32 s4, s4, s9
+; SI-NEXT: s_or_b32 s7, s8, s7
; SI-NEXT: s_and_b32 s5, s5, 0xffff
-; SI-NEXT: s_or_b32 s1, s2, s1
-; SI-NEXT: s_and_b32 s2, s4, 0xffff
-; SI-NEXT: s_or_b32 s4, s5, s6
-; SI-NEXT: s_or_b32 s5, s2, s1
+; SI-NEXT: s_or_b32 s2, s6, s2
+; SI-NEXT: s_and_b32 s4, s4, 0xffff
+; SI-NEXT: s_or_b32 s5, s5, s7
+; SI-NEXT: s_or_b32 s4, s4, s2
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: s_mov_b32 s1, s0
-; SI-NEXT: v_mov_b32_e32 v0, s5
-; SI-NEXT: v_mov_b32_e32 v1, s4
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -1481,75 +1476,74 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) {
; SI-LABEL: amd_kernel_v16i8:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s4, 0
+; SI-NEXT: s_mov_b64 s[4:5], 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_lshr_b32 s5, s0, 16
-; SI-NEXT: s_lshr_b32 s6, s0, 24
-; SI-NEXT: s_lshr_b32 s8, s1, 16
-; SI-NEXT: s_lshr_b32 s9, s1, 24
-; SI-NEXT: s_lshr_b32 s10, s2, 16
-; SI-NEXT: s_lshr_b32 s11, s2, 24
-; SI-NEXT: s_lshr_b32 s12, s3, 16
-; SI-NEXT: s_lshr_b32 s13, s3, 24
-; SI-NEXT: s_bfe_u32 s14, s0, 0x80008
-; SI-NEXT: s_bfe_u32 s15, s1, 0x80008
-; SI-NEXT: s_bfe_u32 s16, s2, 0x80008
-; SI-NEXT: s_bfe_u32 s17, s3, 0x80008
+; SI-NEXT: s_lshr_b32 s6, s0, 16
+; SI-NEXT: s_lshr_b32 s8, s0, 24
+; SI-NEXT: s_lshr_b32 s9, s1, 16
+; SI-NEXT: s_lshr_b32 s10, s1, 24
+; SI-NEXT: s_lshr_b32 s11, s2, 16
+; SI-NEXT: s_lshr_b32 s12, s2, 24
+; SI-NEXT: s_lshr_b32 s13, s3, 16
+; SI-NEXT: s_lshr_b32 s14, s3, 24
+; SI-NEXT: s_bfe_u32 s15, s0, 0x80008
+; SI-NEXT: s_bfe_u32 s16, s1, 0x80008
+; SI-NEXT: s_bfe_u32 s17, s2, 0x80008
+; SI-NEXT: s_bfe_u32 s18, s3, 0x80008
; SI-NEXT: s_add_i32 s3, s3, s3
; SI-NEXT: s_add_i32 s2, s2, s2
; SI-NEXT: s_add_i32 s1, s1, s1
; SI-NEXT: s_add_i32 s0, s0, s0
+; SI-NEXT: s_add_i32 s14, s14, s14
; SI-NEXT: s_add_i32 s13, s13, s13
-; SI-NEXT: s_add_i32 s12, s12, s12
; SI-NEXT: s_and_b32 s3, s3, 0xff
-; SI-NEXT: s_add_i32 s17, s17, s17
+; SI-NEXT: s_add_i32 s18, s18, s18
+; SI-NEXT: s_add_i32 s12, s12, s12
; SI-NEXT: s_add_i32 s11, s11, s11
-; SI-NEXT: s_add_i32 s10, s10, s10
; SI-NEXT: s_and_b32 s2, s2, 0xff
-; SI-NEXT: s_add_i32 s16, s16, s16
+; SI-NEXT: s_add_i32 s17, s17, s17
+; SI-NEXT: s_add_i32 s10, s10, s10
; SI-NEXT: s_add_i32 s9, s9, s9
-; SI-NEXT: s_add_i32 s8, s8, s8
; SI-NEXT: s_and_b32 s1, s1, 0xff
-; SI-NEXT: s_add_i32 s15, s15, s15
+; SI-NEXT: s_add_i32 s16, s16, s16
+; SI-NEXT: s_add_i32 s8, s8, s8
; SI-NEXT: s_add_i32 s6, s6, s6
-; SI-NEXT: s_add_i32 s5, s5, s5
; SI-NEXT: s_and_b32 s0, s0, 0xff
-; SI-NEXT: s_add_i32 s14, s14, s14
-; SI-NEXT: s_lshl_b32 s13, s13, 24
-; SI-NEXT: s_and_b32 s12, s12, 0xff
+; SI-NEXT: s_add_i32 s15, s15, s15
+; SI-NEXT: s_lshl_b32 s14, s14, 24
+; SI-NEXT: s_and_b32 s13, s13, 0xff
+; SI-NEXT: s_lshl_b32 s18, s18, 8
+; SI-NEXT: s_lshl_b32 s12, s12, 24
+; SI-NEXT: s_and_b32 s11, s11, 0xff
; SI-NEXT: s_lshl_b32 s17, s17, 8
-; SI-NEXT: s_lshl_b32 s11, s11, 24
-; SI-NEXT: s_and_b32 s10, s10, 0xff
+; SI-NEXT: s_lshl_b32 s10, s10, 24
+; SI-NEXT: s_and_b32 s9, s9, 0xff
; SI-NEXT: s_lshl_b32 s16, s16, 8
-; SI-NEXT: s_lshl_b32 s9, s9, 24
-; SI-NEXT: s_and_b32 s8, s8, 0xff
+; SI-NEXT: s_lshl_b32 s8, s8, 24
+; SI-NEXT: s_and_b32 s6, s6, 0xff
; SI-NEXT: s_lshl_b32 s15, s15, 8
-; SI-NEXT: s_lshl_b32 s6, s6, 24
-; SI-NEXT: s_and_b32 s5, s5, 0xff
-; SI-NEXT: s_lshl_b32 s14, s14, 8
-; SI-NEXT: s_lshl_b32 s12, s12, 16
-; SI-NEXT: s_or_b32 s3, s3, s17
-; SI-NEXT: s_lshl_b32 s10, s10, 16
-; SI-NEXT: s_or_b32 s2, s2, s16
-; SI-NEXT: s_lshl_b32 s8, s8, 16
-; SI-NEXT: s_or_b32 s1, s1, s15
-; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: s_or_b32 s0, s0, s14
-; SI-NEXT: s_or_b32 s12, s13, s12
+; SI-NEXT: s_lshl_b32 s13, s13, 16
+; SI-NEXT: s_or_b32 s3, s3, s18
+; SI-NEXT: s_lshl_b32 s11, s11, 16
+; SI-NEXT: s_or_b32 s2, s2, s17
+; SI-NEXT: s_lshl_b32 s9, s9, 16
+; SI-NEXT: s_or_b32 s1, s1, s16
+; SI-NEXT: s_lshl_b32 s6, s6, 16
+; SI-NEXT: s_or_b32 s0, s0, s15
+; SI-NEXT: s_or_b32 s13, s14, s13
; SI-NEXT: s_and_b32 s3, s3, 0xffff
-; SI-NEXT: s_or_b32 s10, s11, s10
+; SI-NEXT: s_or_b32 s11, s12, s11
; SI-NEXT: s_and_b32 s2, s2, 0xffff
-; SI-NEXT: s_or_b32 s8, s9, s8
+; SI-NEXT: s_or_b32 s9, s10, s9
; SI-NEXT: s_and_b32 s1, s1, 0xffff
-; SI-NEXT: s_or_b32 s5, s6, s5
+; SI-NEXT: s_or_b32 s6, s8, s6
; SI-NEXT: s_and_b32 s0, s0, 0xffff
-; SI-NEXT: s_or_b32 s3, s3, s12
-; SI-NEXT: s_or_b32 s2, s2, s10
-; SI-NEXT: s_or_b32 s1, s1, s8
-; SI-NEXT: s_or_b32 s0, s0, s5
+; SI-NEXT: s_or_b32 s3, s3, s13
+; SI-NEXT: s_or_b32 s2, s2, s11
+; SI-NEXT: s_or_b32 s1, s1, s9
+; SI-NEXT: s_or_b32 s0, s0, s6
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s5, s4
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mov_b32_e32 v2, s2
@@ -1717,8 +1711,7 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
; SI-LABEL: amd_kernel_v32i8:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s9, 0
-; SI-NEXT: s_mov_b32 s8, 16
+; SI-NEXT: s_mov_b64 s[8:9], 16
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -1860,7 +1853,7 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_mov_b32_e32 v3, s3
-; SI-NEXT: s_mov_b32 s8, s9
+; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
index cc2f775ff22bc..7a576b1c58c55 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
@@ -163,33 +163,33 @@ define amdgpu_kernel void @test_copy_v4i8_x3(ptr addrspace(1) %out0, ptr addrspa
define amdgpu_kernel void @test_copy_v4i8_x4(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_copy_v4i8_x4:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x11
-; SI-NEXT: s_mov_b32 s11, 0xf000
-; SI-NEXT: s_mov_b32 s2, 0
-; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x11
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s10, 0
+; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
-; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s10, -1
-; SI-NEXT: s_mov_b32 s14, s10
-; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
+; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; SI-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s14, s2
+; SI-NEXT: s_mov_b32 s15, s3
+; SI-NEXT: s_mov_b32 s18, s2
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s8, s0
-; SI-NEXT: s_mov_b32 s9, s1
-; SI-NEXT: s_mov_b32 s19, s11
-; SI-NEXT: s_mov_b32 s22, s10
-; SI-NEXT: s_mov_b32 s23, s11
-; SI-NEXT: s_mov_b32 s12, s2
-; SI-NEXT: s_mov_b32 s13, s3
-; SI-NEXT: s_mov_b32 s16, s4
-; SI-NEXT: s_mov_b32 s17, s5
-; SI-NEXT: s_mov_b32 s20, s6
-; SI-NEXT: s_mov_b32 s21, s7
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: s_mov_b32 s19, s3
+; SI-NEXT: s_mov_b32 s22, s2
+; SI-NEXT: s_mov_b32 s23, s3
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: s_mov_b32 s16, s8
+; SI-NEXT: s_mov_b32 s17, s9
+; SI-NEXT: s_mov_b32 s20, s10
+; SI-NEXT: s_mov_b32 s21, s11
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0
; SI-NEXT: buffer_store_dword v0, off, s[16:19], 0
; SI-NEXT: buffer_store_dword v0, off, s[20:23], 0
@@ -325,23 +325,23 @@ define amdgpu_kernel void @test_copy_v4i8_extra_use(ptr addrspace(1) %out0, ptr
define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_copy_v4i8_x2_extra_use:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s14, 0
-; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_mov_b32 s15, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b64 s[12:13], s[6:7]
+; SI-NEXT: s_mov_b64 s[12:13], s[10:11]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
-; SI-NEXT: s_mov_b32 s10, -1
-; SI-NEXT: s_mov_b32 s14, s10
-; SI-NEXT: s_mov_b32 s8, s0
-; SI-NEXT: s_mov_b32 s9, s1
-; SI-NEXT: s_mov_b32 s12, s2
-; SI-NEXT: s_mov_b32 s13, s3
-; SI-NEXT: s_mov_b32 s6, s10
-; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s14, s2
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: s_mov_b32 s10, s2
+; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v0
@@ -357,9 +357,9 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(ptr addrspace(1) %out0, p
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v1, vcc, 0x9000000, v1
-; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: buffer_store_dword v1, off, s[12:15], 0
-; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_copy_v4i8_x2_extra_use:
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
index 2258f6a7b5483..3504546801c93 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
@@ -334,58 +334,58 @@ define amdgpu_kernel void @v_ctpop_v4i64(ptr addrspace(1) noalias %out, ptr addr
define amdgpu_kernel void @ctpop_i64_in_br(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %ctpop_arg, i32 %cond) {
; SI-LABEL: ctpop_i64_in_br:
; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_load_dword s6, s[4:5], 0xf
+; SI-NEXT: s_load_dword s8, s[4:5], 0xf
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_cmp_lg_u32 s6, 0
+; SI-NEXT: s_cmp_lg_u32 s8, 0
; SI-NEXT: s_cbranch_scc0 .LBB7_4
; SI-NEXT: ; %bb.1: ; %else
-; SI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x2
+; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2
; SI-NEXT: s_mov_b64 s[2:3], 0
; SI-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 vcc, vcc
; SI-NEXT: s_cbranch_vccnz .LBB7_3
; SI-NEXT: .LBB7_2: ; %if
-; SI-NEXT: s_bcnt1_i32_b64 s6, s[4:5]
-; SI-NEXT: s_mov_b32 s7, 0
+; SI-NEXT: s_bcnt1_i32_b64 s4, s[6:7]
+; SI-NEXT: s_mov_b32 s5, 0
; SI-NEXT: .LBB7_3: ; %endif
-; SI-NEXT: v_mov_b32_e32 v0, s6
+; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_mov_b32_e32 v1, s7
+; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
; SI-NEXT: .LBB7_4:
-; SI-NEXT: ; implicit-def: $sgpr6_sgpr7
+; SI-NEXT: ; implicit-def: $sgpr4_sgpr5
; SI-NEXT: s_branch .LBB7_2
;
; VI-LABEL: ctpop_i64_in_br:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s6, s[4:5], 0x3c
+; VI-NEXT: s_load_dword s8, s[4:5], 0x3c
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u32 s6, 0
+; VI-NEXT: s_cmp_lg_u32 s8, 0
; VI-NEXT: s_cbranch_scc0 .LBB7_4
; VI-NEXT: ; %bb.1: ; %else
-; VI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x8
+; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8
; VI-NEXT: s_cbranch_execnz .LBB7_3
; VI-NEXT: .LBB7_2: ; %if
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bcnt1_i32_b64 s6, s[4:5]
-; VI-NEXT: s_mov_b32 s7, 0
+; VI-NEXT: s_bcnt1_i32_b64 s4, s[6:7]
+; VI-NEXT: s_mov_b32 s5, 0
; VI-NEXT: .LBB7_3: ; %endif
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
; VI-NEXT: .LBB7_4:
-; VI-NEXT: ; implicit-def: $sgpr6_sgpr7
+; VI-NEXT: ; implicit-def: $sgpr4_sgpr5
; VI-NEXT: s_branch .LBB7_2
entry:
%tmp0 = icmp eq i32 %cond, 0
diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
index 41999b249a0e8..8b1d803b32aa6 100644
--- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
@@ -1633,7 +1633,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_and_b32_e32 v8, 1, v8
; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8
; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v35, v1, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v34, v1, 0, s[4:5]
; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
; SDAG-NEXT: v_cndmask_b32_e64 v32, v0, 0, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v27, v17, 0, s[4:5]
@@ -1750,7 +1750,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_lshrrev_b32_e32 v22, 31, v21
; SDAG-NEXT: v_or_b32_e32 v10, v10, v22
; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1
-; SDAG-NEXT: v_or_b32_e32 v35, v19, v11
+; SDAG-NEXT: v_or_b32_e32 v34, v19, v11
; SDAG-NEXT: v_or_b32_e32 v32, v18, v10
; SDAG-NEXT: v_or_b32_e32 v27, v9, v21
; SDAG-NEXT: v_or_b32_e32 v33, v8, v20
@@ -1760,7 +1760,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_sub_i32_e32 v8, vcc, 0, v4
; SDAG-NEXT: v_mov_b32_e32 v18, 0
; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f
-; SDAG-NEXT: v_mov_b32_e32 v34, v26
+; SDAG-NEXT: v_mov_b32_e32 v35, v26
; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v5, vcc
; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v6, vcc
; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7]
@@ -1954,8 +1954,8 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_mul_lo_u32 v12, v33, v3
; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v33, v2, 0
; SDAG-NEXT: v_mul_lo_u32 v24, v27, v2
-; SDAG-NEXT: v_mul_lo_u32 v25, v35, v31
-; SDAG-NEXT: v_mul_lo_u32 v35, v32, v30
+; SDAG-NEXT: v_mul_lo_u32 v25, v34, v31
+; SDAG-NEXT: v_mul_lo_u32 v34, v32, v30
; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v31, v33, 0
; SDAG-NEXT: v_mov_b32_e32 v13, 0
; SDAG-NEXT: v_mul_lo_u32 v38, v14, v7
@@ -1987,7 +1987,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v48, v3
; SDAG-NEXT: v_mov_b32_e32 v12, v16
; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v37, v15, v[12:13]
-; SDAG-NEXT: v_add_i32_e64 v7, s[4:5], v35, v7
+; SDAG-NEXT: v_add_i32_e64 v7, s[4:5], v34, v7
; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v49, v3
; SDAG-NEXT: v_add_i32_e64 v12, s[4:5], v17, v12
; SDAG-NEXT: v_addc_u32_e64 v13, s[4:5], 0, 0, s[4:5]
@@ -2008,14 +2008,14 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v9, v11, vcc
; SDAG-NEXT: v_xor_b32_e32 v6, v6, v26
; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v4, v10, vcc
-; SDAG-NEXT: v_xor_b32_e32 v7, v7, v34
+; SDAG-NEXT: v_xor_b32_e32 v7, v7, v35
; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v12, vcc
; SDAG-NEXT: v_xor_b32_e32 v8, v4, v26
-; SDAG-NEXT: v_xor_b32_e32 v9, v5, v34
+; SDAG-NEXT: v_xor_b32_e32 v9, v5, v35
; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v6, v26
-; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v7, v34, vcc
+; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v7, v35, vcc
; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v8, v26, vcc
-; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v9, v34, vcc
+; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v9, v35, vcc
; SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
index 8100dc522fd97..30fe881d41367 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
@@ -60,13 +60,12 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.v
; CI-NEXT: s_mov_b64 vcc, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s0
-; CI-NEXT: s_mov_b32 s0, 0
-; CI-NEXT: v_div_fmas_f32 v1, v1, v1, v1
; CI-NEXT: v_mov_b32_e32 v2, 0x7b
+; CI-NEXT: v_div_fmas_f32 v1, v1, v1, v1
; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: s_mov_b64 s[0:1], 0
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
-; CI-NEXT: s_mov_b32 s1, s0
; CI-NEXT: ds_write_b32 v0, v2 offset:12
; CI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -143,10 +142,9 @@ define amdgpu_kernel void @write_ds_sub_max_offset_global_clamp_bit(float %dummy
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_div_fmas_f32 v0, v0, v0, v0
-; CI-NEXT: s_mov_b32 s0, 0
+; CI-NEXT: s_mov_b64 s[0:1], 0
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
-; CI-NEXT: s_mov_b32 s1, s0
; CI-NEXT: ds_write_b32 v2, v1
; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -533,14 +531,13 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_
; CI-NEXT: s_mov_b64 vcc, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s0
-; CI-NEXT: s_mov_b32 s0, 0
-; CI-NEXT: v_div_fmas_f32 v1, v1, v1, v1
; CI-NEXT: v_mov_b32_e32 v2, 0x7b
+; CI-NEXT: v_div_fmas_f32 v1, v1, v1, v1
; CI-NEXT: v_mov_b32_e32 v3, 0
; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: s_mov_b64 s[0:1], 0
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
-; CI-NEXT: s_mov_b32 s1, s0
; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1
; CI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; CI-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
index 02524bf71b074..37fc59c664a24 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
@@ -5894,48 +5894,52 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX908-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v5, v1
; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
-; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX908-NEXT: v_mov_b32_e32 v4, v0
+; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
+; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execz .LBB30_4
-; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX908-NEXT: s_cbranch_execnz .LBB30_3
+; GFX908-NEXT: ; %bb.1: ; %Flow3
+; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB30_6
+; GFX908-NEXT: .LBB30_2: ; %atomicrmw.phi
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+; GFX908-NEXT: .LBB30_3: ; %atomicrmw.global
+; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
-; GFX908-NEXT: .LBB30_2: ; %atomicrmw.start
+; GFX908-NEXT: .LBB30_4: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v7, v5
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
-; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
-; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX908-NEXT: v_mov_b32_e32 v9, v1
+; GFX908-NEXT: v_mov_b32_e32 v8, v0
+; GFX908-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3]
+; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT: s_cbranch_execnz .LBB30_2
-; GFX908-NEXT: ; %bb.3: ; %Flow
+; GFX908-NEXT: s_cbranch_execnz .LBB30_4
+; GFX908-NEXT: ; %bb.5: ; %Flow
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX908-NEXT: .LBB30_4: ; %Flow3
; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX908-NEXT: s_cbranch_execz .LBB30_6
-; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private
-; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
-; GFX908-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
-; GFX908-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4
+; GFX908-NEXT: s_cbranch_execz .LBB30_2
+; GFX908-NEXT: .LBB30_6: ; %atomicrmw.private
+; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_add_f64 v[0:1], v[4:5], v[2:3]
-; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
-; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
-; GFX908-NEXT: .LBB30_6: ; %atomicrmw.phi
+; GFX908-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3]
+; GFX908-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GFX908-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v4
-; GFX908-NEXT: v_mov_b32_e32 v1, v5
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
index 6ead5b93a0e39..5ae54926c4eab 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
@@ -5384,50 +5384,54 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
-; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
+; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execz .LBB24_4
-; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX90A-NEXT: s_cbranch_execnz .LBB24_3
+; GFX90A-NEXT: ; %bb.1: ; %Flow2
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB24_6
+; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.phi
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+; GFX90A-NEXT: .LBB24_3: ; %atomicrmw.global
+; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB24_4: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
-; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5]
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
+; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
+; GFX90A-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
+; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: s_cbranch_execnz .LBB24_2
-; GFX90A-NEXT: ; %bb.3: ; %Flow
+; GFX90A-NEXT: s_cbranch_execnz .LBB24_4
+; GFX90A-NEXT: ; %bb.5: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX90A-NEXT: .LBB24_4: ; %Flow2
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX90A-NEXT: s_cbranch_execz .LBB24_6
-; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
-; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: s_cbranch_execz .LBB24_2
+; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.private
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
+; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.phi
+; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3]
+; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
index 1fc9ed70e009c..99aeb8fe1f80e 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
@@ -5384,50 +5384,54 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
-; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
+; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execz .LBB24_4
-; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX90A-NEXT: s_cbranch_execnz .LBB24_3
+; GFX90A-NEXT: ; %bb.1: ; %Flow2
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB24_6
+; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.phi
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+; GFX90A-NEXT: .LBB24_3: ; %atomicrmw.global
+; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB24_4: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
-; GFX90A-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5]
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
+; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
+; GFX90A-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
+; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: s_cbranch_execnz .LBB24_2
-; GFX90A-NEXT: ; %bb.3: ; %Flow
+; GFX90A-NEXT: s_cbranch_execnz .LBB24_4
+; GFX90A-NEXT: ; %bb.5: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX90A-NEXT: .LBB24_4: ; %Flow2
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX90A-NEXT: s_cbranch_execz .LBB24_6
-; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
-; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: s_cbranch_execz .LBB24_2
+; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.private
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
+; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX90A-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.phi
+; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3]
+; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
index 8d2963ce7db35..096fda1710928 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
@@ -3414,95 +3414,103 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 {
; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f64:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execz .LBB16_4
-; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX90A-NEXT: s_cbranch_execnz .LBB16_3
+; GFX90A-NEXT: ; %bb.1: ; %Flow3
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB16_6
+; GFX90A-NEXT: .LBB16_2: ; %atomicrmw.phi
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+; GFX90A-NEXT: .LBB16_3: ; %atomicrmw.global
+; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: .LBB16_2: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB16_4: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3]
+; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: s_cbranch_execnz .LBB16_2
-; GFX90A-NEXT: ; %bb.3: ; %Flow
+; GFX90A-NEXT: s_cbranch_execnz .LBB16_4
+; GFX90A-NEXT: ; %bb.5: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: .LBB16_4: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX90A-NEXT: s_cbranch_execz .LBB16_6
-; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
-; GFX90A-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: s_cbranch_execz .LBB16_2
+; GFX90A-NEXT: .LBB16_6: ; %atomicrmw.private
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_add_f64 v[0:1], v[4:5], -v[2:3]
-; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: .LBB16_6: ; %atomicrmw.phi
+; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3]
+; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fsub_ret_f64:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v5, v1
; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
-; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX908-NEXT: v_mov_b32_e32 v4, v0
+; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
+; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execz .LBB16_4
-; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX908-NEXT: s_cbranch_execnz .LBB16_3
+; GFX908-NEXT: ; %bb.1: ; %Flow3
+; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB16_6
+; GFX908-NEXT: .LBB16_2: ; %atomicrmw.phi
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+; GFX908-NEXT: .LBB16_3: ; %atomicrmw.global
+; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
-; GFX908-NEXT: .LBB16_2: ; %atomicrmw.start
+; GFX908-NEXT: .LBB16_4: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v7, v5
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
-; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
-; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX908-NEXT: v_mov_b32_e32 v9, v1
+; GFX908-NEXT: v_mov_b32_e32 v8, v0
+; GFX908-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3]
+; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT: s_cbranch_execnz .LBB16_2
-; GFX908-NEXT: ; %bb.3: ; %Flow
+; GFX908-NEXT: s_cbranch_execnz .LBB16_4
+; GFX908-NEXT: ; %bb.5: ; %Flow
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX908-NEXT: .LBB16_4: ; %Flow3
; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX908-NEXT: s_cbranch_execz .LBB16_6
-; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private
-; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
-; GFX908-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
-; GFX908-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4
+; GFX908-NEXT: s_cbranch_execz .LBB16_2
+; GFX908-NEXT: .LBB16_6: ; %atomicrmw.private
+; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_add_f64 v[0:1], v[4:5], -v[2:3]
-; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
-; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
-; GFX908-NEXT: .LBB16_6: ; %atomicrmw.phi
+; GFX908-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3]
+; GFX908-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GFX908-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v4
-; GFX908-NEXT: v_mov_b32_e32 v1, v5
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
index 36bddb7ac2fd6..23dfe2f70fa7e 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
@@ -11158,52 +11158,56 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) {
; GCN3-LABEL: flat_atomic_max_i64_ret:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v5, v1
; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
-; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
-; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN3-NEXT: v_mov_b32_e32 v4, v0
+; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
+; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GCN3-NEXT: s_cbranch_execz .LBB82_4
-; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GCN3-NEXT: s_cbranch_execnz .LBB82_3
+; GCN3-NEXT: ; %bb.1: ; %Flow3
+; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB82_6
+; GCN3-NEXT: .LBB82_2: ; %atomicrmw.phi
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+; GCN3-NEXT: .LBB82_3: ; %atomicrmw.global
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
; GCN3-NEXT: s_mov_b64 s[6:7], 0
-; GCN3-NEXT: .LBB82_2: ; %atomicrmw.start
+; GCN3-NEXT: .LBB82_4: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v7, v5
-; GCN3-NEXT: v_mov_b32_e32 v6, v4
-; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
-; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3]
+; GCN3-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN3-NEXT: s_cbranch_execnz .LBB82_2
-; GCN3-NEXT: ; %bb.3: ; %Flow
+; GCN3-NEXT: s_cbranch_execnz .LBB82_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
-; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GCN3-NEXT: .LBB82_4: ; %Flow3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execz .LBB82_6
-; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private
-; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
-; GCN3-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4
+; GCN3-NEXT: s_cbranch_execz .LBB82_2
+; GCN3-NEXT: .LBB82_6: ; %atomicrmw.private
+; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
; GCN3-NEXT: s_waitcnt vmcnt(0)
-; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[2:3]
-; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
-; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
-; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
-; GCN3-NEXT: .LBB82_6: ; %atomicrmw.phi
+; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN3-NEXT: v_mov_b32_e32 v0, v4
-; GCN3-NEXT: v_mov_b32_e32 v1, v5
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw max ptr %ptr, i64 %in seq_cst
@@ -13861,52 +13865,56 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) {
; GCN3-LABEL: flat_atomic_umax_i64_ret:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v5, v1
; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
-; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
-; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN3-NEXT: v_mov_b32_e32 v4, v0
+; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
+; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GCN3-NEXT: s_cbranch_execz .LBB96_4
-; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GCN3-NEXT: s_cbranch_execnz .LBB96_3
+; GCN3-NEXT: ; %bb.1: ; %Flow3
+; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB96_6
+; GCN3-NEXT: .LBB96_2: ; %atomicrmw.phi
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+; GCN3-NEXT: .LBB96_3: ; %atomicrmw.global
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
; GCN3-NEXT: s_mov_b64 s[6:7], 0
-; GCN3-NEXT: .LBB96_2: ; %atomicrmw.start
+; GCN3-NEXT: .LBB96_4: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v7, v5
-; GCN3-NEXT: v_mov_b32_e32 v6, v4
-; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
-; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3]
+; GCN3-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN3-NEXT: s_cbranch_execnz .LBB96_2
-; GCN3-NEXT: ; %bb.3: ; %Flow
+; GCN3-NEXT: s_cbranch_execnz .LBB96_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
-; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GCN3-NEXT: .LBB96_4: ; %Flow3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execz .LBB96_6
-; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private
-; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
-; GCN3-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4
+; GCN3-NEXT: s_cbranch_execz .LBB96_2
+; GCN3-NEXT: .LBB96_6: ; %atomicrmw.private
+; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
; GCN3-NEXT: s_waitcnt vmcnt(0)
-; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[2:3]
-; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
-; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
-; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
-; GCN3-NEXT: .LBB96_6: ; %atomicrmw.phi
+; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN3-NEXT: v_mov_b32_e32 v0, v4
-; GCN3-NEXT: v_mov_b32_e32 v1, v5
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw umax ptr %ptr, i64 %in seq_cst
@@ -16355,52 +16363,56 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) {
; GCN3-LABEL: flat_atomic_umin_i64_ret:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v5, v1
; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
-; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
-; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN3-NEXT: v_mov_b32_e32 v4, v0
+; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
+; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GCN3-NEXT: s_cbranch_execz .LBB109_4
-; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GCN3-NEXT: s_cbranch_execnz .LBB109_3
+; GCN3-NEXT: ; %bb.1: ; %Flow3
+; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB109_6
+; GCN3-NEXT: .LBB109_2: ; %atomicrmw.phi
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+; GCN3-NEXT: .LBB109_3: ; %atomicrmw.global
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
; GCN3-NEXT: s_mov_b64 s[6:7], 0
-; GCN3-NEXT: .LBB109_2: ; %atomicrmw.start
+; GCN3-NEXT: .LBB109_4: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v7, v5
-; GCN3-NEXT: v_mov_b32_e32 v6, v4
-; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
-; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3]
+; GCN3-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN3-NEXT: s_cbranch_execnz .LBB109_2
-; GCN3-NEXT: ; %bb.3: ; %Flow
+; GCN3-NEXT: s_cbranch_execnz .LBB109_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
-; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GCN3-NEXT: .LBB109_4: ; %Flow3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execz .LBB109_6
-; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private
-; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
-; GCN3-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4
+; GCN3-NEXT: s_cbranch_execz .LBB109_2
+; GCN3-NEXT: .LBB109_6: ; %atomicrmw.private
+; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
; GCN3-NEXT: s_waitcnt vmcnt(0)
-; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[4:5], v[2:3]
-; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
-; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
-; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
-; GCN3-NEXT: .LBB109_6: ; %atomicrmw.phi
+; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN3-NEXT: v_mov_b32_e32 v0, v4
-; GCN3-NEXT: v_mov_b32_e32 v1, v5
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw umin ptr %ptr, i64 %in seq_cst
@@ -18206,52 +18218,56 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) {
; GCN3-LABEL: flat_atomic_min_i64_ret:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v5, v1
; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
-; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
-; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN3-NEXT: v_mov_b32_e32 v4, v0
+; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
+; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GCN3-NEXT: s_cbranch_execz .LBB119_4
-; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GCN3-NEXT: s_cbranch_execnz .LBB119_3
+; GCN3-NEXT: ; %bb.1: ; %Flow3
+; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB119_6
+; GCN3-NEXT: .LBB119_2: ; %atomicrmw.phi
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+; GCN3-NEXT: .LBB119_3: ; %atomicrmw.global
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
; GCN3-NEXT: s_mov_b64 s[6:7], 0
-; GCN3-NEXT: .LBB119_2: ; %atomicrmw.start
+; GCN3-NEXT: .LBB119_4: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v7, v5
-; GCN3-NEXT: v_mov_b32_e32 v6, v4
-; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
-; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3]
+; GCN3-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN3-NEXT: s_cbranch_execnz .LBB119_2
-; GCN3-NEXT: ; %bb.3: ; %Flow
+; GCN3-NEXT: s_cbranch_execnz .LBB119_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
-; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GCN3-NEXT: .LBB119_4: ; %Flow3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execz .LBB119_6
-; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private
-; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
-; GCN3-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4
+; GCN3-NEXT: s_cbranch_execz .LBB119_2
+; GCN3-NEXT: .LBB119_6: ; %atomicrmw.private
+; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
; GCN3-NEXT: s_waitcnt vmcnt(0)
-; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[4:5], v[2:3]
-; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
-; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
-; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
-; GCN3-NEXT: .LBB119_6: ; %atomicrmw.phi
+; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN3-NEXT: v_mov_b32_e32 v0, v4
-; GCN3-NEXT: v_mov_b32_e32 v1, v5
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw min ptr %ptr, i64 %in seq_cst
diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
index 3e6b812c12d7f..9c9c0555638fb 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
@@ -739,22 +739,22 @@ define i128 @fptosi_f32_to_i128(float %x) {
; SDAG: ; %bb.0: ; %fp-to-i-entry
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v4, v0
-; SDAG-NEXT: v_bfe_u32 v6, v4, 23, 8
+; SDAG-NEXT: v_bfe_u32 v5, v4, 23, 8
; SDAG-NEXT: s_movk_i32 s4, 0x7e
; SDAG-NEXT: v_mov_b32_e32 v0, 0
; SDAG-NEXT: v_mov_b32_e32 v2, 0
-; SDAG-NEXT: v_mov_b32_e32 v7, 0
+; SDAG-NEXT: v_mov_b32_e32 v6, 0
; SDAG-NEXT: v_mov_b32_e32 v1, 0
; SDAG-NEXT: v_mov_b32_e32 v3, 0
-; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v6
+; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v5
; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc
; SDAG-NEXT: s_cbranch_execz .LBB2_10
; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end
-; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v6
-; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v7, vcc
-; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v7, vcc
+; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5
+; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc
+; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v6, vcc
; SDAG-NEXT: s_movk_i32 s6, 0xff7f
-; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v7, vcc
+; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v6, vcc
; SDAG-NEXT: s_mov_b32 s7, -1
; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], -1, v[2:3]
; SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1]
@@ -770,65 +770,65 @@ define i128 @fptosi_f32_to_i128(float %x) {
; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0
; SDAG-NEXT: s_mov_b64 s[4:5], 0x95
; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v4
-; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[6:7]
-; SDAG-NEXT: v_mov_b32_e32 v5, 0
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6]
+; SDAG-NEXT: v_mov_b32_e32 v7, 0
; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, vcc
; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 1, vcc
-; SDAG-NEXT: v_or_b32_e32 v4, 0x800000, v0
+; SDAG-NEXT: v_or_b32_e32 v6, 0x800000, v0
; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7]
; SDAG-NEXT: s_cbranch_execz .LBB2_4
; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else
-; SDAG-NEXT: v_sub_u32_e32 v0, 0xd6, v6
-; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff2a, v6
-; SDAG-NEXT: v_add_u32_e32 v7, 0xffffff6a, v6
-; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[4:5]
-; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[4:5]
-; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v7
+; SDAG-NEXT: v_sub_u32_e32 v0, 0xd6, v5
+; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff2a, v5
+; SDAG-NEXT: v_add_u32_e32 v4, 0xffffff6a, v5
+; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[6:7]
+; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[6:7]
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v4
; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
-; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v7
+; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v4
; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v1, s[6:7]
; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5]
-; SDAG-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5]
+; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[6:7]
; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7]
; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5]
; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v10, 0
; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2
; SDAG-NEXT: v_mul_lo_u32 v14, v10, v3
-; SDAG-NEXT: v_mov_b32_e32 v4, v1
-; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v10, v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v6, v1
+; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v10, v[6:7]
; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v2, 0
-; SDAG-NEXT: v_mov_b32_e32 v4, v6
-; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v8, v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v6, v4
+; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v8, v[6:7]
; SDAG-NEXT: v_add3_u32 v3, v3, v14, v13
; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[2:3]
-; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v7, v5
-; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT: v_mul_lo_u32 v3, v9, v11
+; SDAG-NEXT: v_add_co_u32_e64 v3, s[4:5], v5, v7
+; SDAG-NEXT: v_addc_co_u32_e64 v4, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT: v_mul_lo_u32 v10, v9, v11
; SDAG-NEXT: v_mul_lo_u32 v9, v9, v12
-; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v8, v[5:6]
-; SDAG-NEXT: ; implicit-def: $vgpr10
+; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v8, v[3:4]
; SDAG-NEXT: ; implicit-def: $vgpr8
-; SDAG-NEXT: v_add3_u32 v3, v9, v2, v3
-; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1
-; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5]
-; SDAG-NEXT: v_mov_b32_e32 v1, v4
+; SDAG-NEXT: v_add3_u32 v5, v9, v2, v10
+; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v3, v1
+; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v4, v5, s[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v1, v6
+; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6
; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7
-; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
+; SDAG-NEXT: ; implicit-def: $vgpr10
; SDAG-NEXT: ; implicit-def: $vgpr9
; SDAG-NEXT: .LBB2_4: ; %Flow
; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[12:13]
; SDAG-NEXT: s_cbranch_execz .LBB2_6
; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12
-; SDAG-NEXT: v_sub_u32_e32 v2, 0x96, v6
-; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[4:5]
+; SDAG-NEXT: v_sub_u32_e32 v2, 0x96, v5
+; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[6:7]
; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v2
; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5]
; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2
-; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v4, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v6, s[4:5]
; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v10, 0
; SDAG-NEXT: v_mov_b32_e32 v2, 0
; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v8, v[1:2]
@@ -944,9 +944,9 @@ define i128 @fptosi_f32_to_i128(float %x) {
; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GISEL-NEXT: v_lshl_or_b32 v9, v0, 16, v0
-; GISEL-NEXT: v_or3_b32 v10, v1, v2, 1
-; GISEL-NEXT: v_or3_b32 v8, v0, v2, 0
+; GISEL-NEXT: v_lshl_or_b32 v10, v0, 16, v0
+; GISEL-NEXT: v_or3_b32 v8, v1, v2, 1
+; GISEL-NEXT: v_or3_b32 v9, v0, v2, 0
; GISEL-NEXT: v_mov_b32_e32 v0, 0x96
; GISEL-NEXT: v_mov_b32_e32 v1, 0
; GISEL-NEXT: v_and_b32_e32 v2, 0x7fffff, v4
@@ -963,33 +963,33 @@ define i128 @fptosi_f32_to_i128(float %x) {
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v9, 0
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v10, 0
; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff2a, v6
; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7
; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5]
; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[0:1]
; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v13, v2, 0, s[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v10, v[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v10, 0
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v8, 0
; GISEL-NEXT: v_mov_b32_e32 v2, v6
-; GISEL-NEXT: v_mul_lo_u32 v6, v11, v9
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v8, v[1:2]
-; GISEL-NEXT: v_mul_lo_u32 v4, v12, v9
+; GISEL-NEXT: v_mul_lo_u32 v6, v11, v10
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v9, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v4, v12, v10
; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v10, v[1:2]
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[1:2]
; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11]
; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[4:5]
; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7]
+; GISEL-NEXT: ; implicit-def: $vgpr10
; GISEL-NEXT: ; implicit-def: $vgpr9
-; GISEL-NEXT: ; implicit-def: $vgpr8
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v10, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7]
; GISEL-NEXT: ; implicit-def: $vgpr6
; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GISEL-NEXT: ; implicit-def: $vgpr10
+; GISEL-NEXT: ; implicit-def: $vgpr8
; GISEL-NEXT: .LBB2_4: ; %Flow
; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[16:17]
; GISEL-NEXT: s_cbranch_execz .LBB2_6
@@ -1000,10 +1000,10 @@ define i128 @fptosi_f32_to_i128(float %x) {
; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v4, v10, 0
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v4, v9, 0
-; GISEL-NEXT: v_mul_lo_u32 v5, v4, v9
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v8, v[1:2]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v4, v8, 0
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v4, v10, 0
+; GISEL-NEXT: v_mul_lo_u32 v5, v4, v10
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v9, v[1:2]
; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
; GISEL-NEXT: .LBB2_6: ; %Flow1
; GISEL-NEXT: s_or_b64 exec, exec, s[6:7]
@@ -1093,22 +1093,22 @@ define i128 @fptoui_f32_to_i128(float %x) {
; SDAG: ; %bb.0: ; %fp-to-i-entry
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v4, v0
-; SDAG-NEXT: v_bfe_u32 v6, v4, 23, 8
+; SDAG-NEXT: v_bfe_u32 v5, v4, 23, 8
; SDAG-NEXT: s_movk_i32 s4, 0x7e
; SDAG-NEXT: v_mov_b32_e32 v0, 0
; SDAG-NEXT: v_mov_b32_e32 v2, 0
-; SDAG-NEXT: v_mov_b32_e32 v7, 0
+; SDAG-NEXT: v_mov_b32_e32 v6, 0
; SDAG-NEXT: v_mov_b32_e32 v1, 0
; SDAG-NEXT: v_mov_b32_e32 v3, 0
-; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v6
+; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v5
; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc
; SDAG-NEXT: s_cbranch_execz .LBB3_10
; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end
-; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v6
-; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v7, vcc
-; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v7, vcc
+; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5
+; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc
+; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v6, vcc
; SDAG-NEXT: s_movk_i32 s6, 0xff7f
-; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v7, vcc
+; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v6, vcc
; SDAG-NEXT: s_mov_b32 s7, -1
; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], -1, v[2:3]
; SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1]
@@ -1124,65 +1124,65 @@ define i128 @fptoui_f32_to_i128(float %x) {
; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0
; SDAG-NEXT: s_mov_b64 s[4:5], 0x95
; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v4
-; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[6:7]
-; SDAG-NEXT: v_mov_b32_e32 v5, 0
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6]
+; SDAG-NEXT: v_mov_b32_e32 v7, 0
; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, vcc
; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 1, vcc
-; SDAG-NEXT: v_or_b32_e32 v4, 0x800000, v0
+; SDAG-NEXT: v_or_b32_e32 v6, 0x800000, v0
; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7]
; SDAG-NEXT: s_cbranch_execz .LBB3_4
; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else
-; SDAG-NEXT: v_sub_u32_e32 v0, 0xd6, v6
-; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff2a, v6
-; SDAG-NEXT: v_add_u32_e32 v7, 0xffffff6a, v6
-; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[4:5]
-; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[4:5]
-; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v7
+; SDAG-NEXT: v_sub_u32_e32 v0, 0xd6, v5
+; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff2a, v5
+; SDAG-NEXT: v_add_u32_e32 v4, 0xffffff6a, v5
+; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[6:7]
+; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[6:7]
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v4
; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
-; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v7
+; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v4
; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v1, s[6:7]
; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5]
-; SDAG-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5]
+; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[6:7]
; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7]
; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5]
; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v10, 0
; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2
; SDAG-NEXT: v_mul_lo_u32 v14, v10, v3
-; SDAG-NEXT: v_mov_b32_e32 v4, v1
-; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v10, v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v6, v1
+; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v10, v[6:7]
; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v2, 0
-; SDAG-NEXT: v_mov_b32_e32 v4, v6
-; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v8, v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v6, v4
+; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v8, v[6:7]
; SDAG-NEXT: v_add3_u32 v3, v3, v14, v13
; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[2:3]
-; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v7, v5
-; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT: v_mul_lo_u32 v3, v9, v11
+; SDAG-NEXT: v_add_co_u32_e64 v3, s[4:5], v5, v7
+; SDAG-NEXT: v_addc_co_u32_e64 v4, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT: v_mul_lo_u32 v10, v9, v11
; SDAG-NEXT: v_mul_lo_u32 v9, v9, v12
-; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v8, v[5:6]
-; SDAG-NEXT: ; implicit-def: $vgpr10
+; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v8, v[3:4]
; SDAG-NEXT: ; implicit-def: $vgpr8
-; SDAG-NEXT: v_add3_u32 v3, v9, v2, v3
-; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1
-; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5]
-; SDAG-NEXT: v_mov_b32_e32 v1, v4
+; SDAG-NEXT: v_add3_u32 v5, v9, v2, v10
+; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v3, v1
+; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v4, v5, s[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v1, v6
+; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6
; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7
-; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
+; SDAG-NEXT: ; implicit-def: $vgpr10
; SDAG-NEXT: ; implicit-def: $vgpr9
; SDAG-NEXT: .LBB3_4: ; %Flow
; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[12:13]
; SDAG-NEXT: s_cbranch_execz .LBB3_6
; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12
-; SDAG-NEXT: v_sub_u32_e32 v2, 0x96, v6
-; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[4:5]
+; SDAG-NEXT: v_sub_u32_e32 v2, 0x96, v5
+; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[6:7]
; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v2
; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5]
; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2
-; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v4, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v6, s[4:5]
; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v10, 0
; SDAG-NEXT: v_mov_b32_e32 v2, 0
; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v8, v[1:2]
@@ -1298,9 +1298,9 @@ define i128 @fptoui_f32_to_i128(float %x) {
; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GISEL-NEXT: v_lshl_or_b32 v9, v0, 16, v0
-; GISEL-NEXT: v_or3_b32 v10, v1, v2, 1
-; GISEL-NEXT: v_or3_b32 v8, v0, v2, 0
+; GISEL-NEXT: v_lshl_or_b32 v10, v0, 16, v0
+; GISEL-NEXT: v_or3_b32 v8, v1, v2, 1
+; GISEL-NEXT: v_or3_b32 v9, v0, v2, 0
; GISEL-NEXT: v_mov_b32_e32 v0, 0x96
; GISEL-NEXT: v_mov_b32_e32 v1, 0
; GISEL-NEXT: v_and_b32_e32 v2, 0x7fffff, v4
@@ -1317,33 +1317,33 @@ define i128 @fptoui_f32_to_i128(float %x) {
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v9, 0
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v10, 0
; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff2a, v6
; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7
; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5]
; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[0:1]
; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v13, v2, 0, s[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v10, v[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v10, 0
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v8, 0
; GISEL-NEXT: v_mov_b32_e32 v2, v6
-; GISEL-NEXT: v_mul_lo_u32 v6, v11, v9
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v8, v[1:2]
-; GISEL-NEXT: v_mul_lo_u32 v4, v12, v9
+; GISEL-NEXT: v_mul_lo_u32 v6, v11, v10
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v9, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v4, v12, v10
; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v10, v[1:2]
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[1:2]
; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11]
; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[4:5]
; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7]
+; GISEL-NEXT: ; implicit-def: $vgpr10
; GISEL-NEXT: ; implicit-def: $vgpr9
-; GISEL-NEXT: ; implicit-def: $vgpr8
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v10, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7]
; GISEL-NEXT: ; implicit-def: $vgpr6
; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GISEL-NEXT: ; implicit-def: $vgpr10
+; GISEL-NEXT: ; implicit-def: $vgpr8
; GISEL-NEXT: .LBB3_4: ; %Flow
; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[16:17]
; GISEL-NEXT: s_cbranch_execz .LBB3_6
@@ -1354,10 +1354,10 @@ define i128 @fptoui_f32_to_i128(float %x) {
; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v4, v10, 0
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v4, v9, 0
-; GISEL-NEXT: v_mul_lo_u32 v5, v4, v9
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v8, v[1:2]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v4, v8, 0
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v4, v10, 0
+; GISEL-NEXT: v_mul_lo_u32 v5, v4, v10
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v9, v[1:2]
; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
; GISEL-NEXT: .LBB3_6: ; %Flow1
; GISEL-NEXT: s_or_b64 exec, exec, s[6:7]
@@ -1475,22 +1475,22 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) {
; SDAG: ; %bb.0: ; %fp-to-i-entry
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v4, v0
-; SDAG-NEXT: v_bfe_u32 v6, v4, 7, 8
+; SDAG-NEXT: v_bfe_u32 v5, v4, 7, 8
; SDAG-NEXT: s_movk_i32 s4, 0x7e
; SDAG-NEXT: v_mov_b32_e32 v0, 0
; SDAG-NEXT: v_mov_b32_e32 v2, 0
-; SDAG-NEXT: v_mov_b32_e32 v7, 0
+; SDAG-NEXT: v_mov_b32_e32 v6, 0
; SDAG-NEXT: v_mov_b32_e32 v1, 0
; SDAG-NEXT: v_mov_b32_e32 v3, 0
-; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v6
+; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v5
; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc
; SDAG-NEXT: s_cbranch_execz .LBB6_10
; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end
-; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v6
-; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v7, vcc
-; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v7, vcc
+; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5
+; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc
+; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v6, vcc
; SDAG-NEXT: s_movk_i32 s6, 0xff7f
-; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v7, vcc
+; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v6, vcc
; SDAG-NEXT: s_mov_b32 s7, -1
; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], -1, v[2:3]
; SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1]
@@ -1505,11 +1505,11 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) {
; SDAG-NEXT: s_movk_i32 s4, 0x7f
; SDAG-NEXT: v_and_b32_sdwa v0, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; SDAG-NEXT: s_mov_b64 s[4:5], 0x85
-; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[6:7]
-; SDAG-NEXT: v_mov_b32_e32 v5, 0
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6]
+; SDAG-NEXT: v_mov_b32_e32 v7, 0
; SDAG-NEXT: v_cndmask_b32_e64 v9, -1, 0, vcc
; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 1, vcc
-; SDAG-NEXT: v_or_b32_e32 v4, 0x80, v0
+; SDAG-NEXT: v_or_b32_e32 v6, 0x80, v0
; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
@@ -1518,52 +1518,52 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) {
; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else
; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SDAG-NEXT: v_add_co_u32_e64 v10, s[4:5], -1, v0
-; SDAG-NEXT: v_sub_u32_e32 v0, 0xc6, v6
-; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff3a, v6
-; SDAG-NEXT: v_add_u32_e32 v7, 0xffffff7a, v6
-; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[4:5]
-; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[4:5]
-; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v7
+; SDAG-NEXT: v_sub_u32_e32 v0, 0xc6, v5
+; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff3a, v5
+; SDAG-NEXT: v_add_u32_e32 v4, 0xffffff7a, v5
+; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[6:7]
+; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[6:7]
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v4
; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
-; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v7
+; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v4
; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v1, s[6:7]
; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5]
-; SDAG-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5]
+; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[6:7]
; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7]
; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5]
; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v8, 0
; SDAG-NEXT: v_mul_lo_u32 v13, v9, v2
; SDAG-NEXT: v_mul_lo_u32 v14, v8, v3
-; SDAG-NEXT: v_mov_b32_e32 v4, v1
-; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v8, v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v6, v1
+; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v8, v[6:7]
; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v2, 0
-; SDAG-NEXT: v_mov_b32_e32 v4, v6
-; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v9, v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v6, v4
+; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v9, v[6:7]
; SDAG-NEXT: v_add3_u32 v3, v3, v14, v13
; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v12, v[2:3]
-; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v7, v5
-; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT: v_mul_lo_u32 v3, v10, v11
-; SDAG-NEXT: v_mul_lo_u32 v8, v10, v12
-; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[5:6]
-; SDAG-NEXT: v_add3_u32 v3, v8, v2, v3
-; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1
-; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5]
-; SDAG-NEXT: v_mov_b32_e32 v1, v4
+; SDAG-NEXT: v_add_co_u32_e64 v3, s[4:5], v5, v7
+; SDAG-NEXT: v_addc_co_u32_e64 v4, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT: v_mul_lo_u32 v8, v10, v11
+; SDAG-NEXT: v_mul_lo_u32 v10, v10, v12
+; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v9, v[3:4]
+; SDAG-NEXT: v_add3_u32 v5, v10, v2, v8
+; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v3, v1
+; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v4, v5, s[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v1, v6
+; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6
; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7
-; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; SDAG-NEXT: ; implicit-def: $vgpr8
; SDAG-NEXT: .LBB6_4: ; %Flow
; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[12:13]
; SDAG-NEXT: s_cbranch_execz .LBB6_6
; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12
-; SDAG-NEXT: v_sub_u32_e32 v2, 0x86, v6
-; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[4:5]
+; SDAG-NEXT: v_sub_u32_e32 v2, 0x86, v5
+; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[6:7]
; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v2
; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5]
; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2
-; SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5]
; SDAG-NEXT: v_mul_hi_i32_i24_e32 v1, v0, v8
; SDAG-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; SDAG-NEXT: v_mul_i32_i24_e32 v0, v0, v8
@@ -1823,22 +1823,22 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) {
; SDAG: ; %bb.0: ; %fp-to-i-entry
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v4, v0
-; SDAG-NEXT: v_bfe_u32 v6, v4, 7, 8
+; SDAG-NEXT: v_bfe_u32 v5, v4, 7, 8
; SDAG-NEXT: s_movk_i32 s4, 0x7e
; SDAG-NEXT: v_mov_b32_e32 v0, 0
; SDAG-NEXT: v_mov_b32_e32 v2, 0
-; SDAG-NEXT: v_mov_b32_e32 v7, 0
+; SDAG-NEXT: v_mov_b32_e32 v6, 0
; SDAG-NEXT: v_mov_b32_e32 v1, 0
; SDAG-NEXT: v_mov_b32_e32 v3, 0
-; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v6
+; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v5
; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc
; SDAG-NEXT: s_cbranch_execz .LBB7_10
; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end
-; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v6
-; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v7, vcc
-; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v7, vcc
+; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5
+; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc
+; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v6, vcc
; SDAG-NEXT: s_movk_i32 s6, 0xff7f
-; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v7, vcc
+; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v6, vcc
; SDAG-NEXT: s_mov_b32 s7, -1
; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], -1, v[2:3]
; SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1]
@@ -1853,11 +1853,11 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) {
; SDAG-NEXT: s_movk_i32 s4, 0x7f
; SDAG-NEXT: v_and_b32_sdwa v0, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; SDAG-NEXT: s_mov_b64 s[4:5], 0x85
-; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[6:7]
-; SDAG-NEXT: v_mov_b32_e32 v5, 0
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6]
+; SDAG-NEXT: v_mov_b32_e32 v7, 0
; SDAG-NEXT: v_cndmask_b32_e64 v9, -1, 0, vcc
; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 1, vcc
-; SDAG-NEXT: v_or_b32_e32 v4, 0x80, v0
+; SDAG-NEXT: v_or_b32_e32 v6, 0x80, v0
; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
@@ -1866,52 +1866,52 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) {
; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else
; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SDAG-NEXT: v_add_co_u32_e64 v10, s[4:5], -1, v0
-; SDAG-NEXT: v_sub_u32_e32 v0, 0xc6, v6
-; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff3a, v6
-; SDAG-NEXT: v_add_u32_e32 v7, 0xffffff7a, v6
-; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[4:5]
-; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[4:5]
-; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v7
+; SDAG-NEXT: v_sub_u32_e32 v0, 0xc6, v5
+; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff3a, v5
+; SDAG-NEXT: v_add_u32_e32 v4, 0xffffff7a, v5
+; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[6:7]
+; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[6:7]
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v4
; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
-; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v7
+; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v4
; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v1, s[6:7]
; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5]
-; SDAG-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5]
+; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[6:7]
; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7]
; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5]
; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v8, 0
; SDAG-NEXT: v_mul_lo_u32 v13, v9, v2
; SDAG-NEXT: v_mul_lo_u32 v14, v8, v3
-; SDAG-NEXT: v_mov_b32_e32 v4, v1
-; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v8, v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v6, v1
+; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v8, v[6:7]
; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v2, 0
-; SDAG-NEXT: v_mov_b32_e32 v4, v6
-; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v9, v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v6, v4
+; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v9, v[6:7]
; SDAG-NEXT: v_add3_u32 v3, v3, v14, v13
; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v12, v[2:3]
-; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v7, v5
-; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT: v_mul_lo_u32 v3, v10, v11
-; SDAG-NEXT: v_mul_lo_u32 v8, v10, v12
-; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[5:6]
-; SDAG-NEXT: v_add3_u32 v3, v8, v2, v3
-; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1
-; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5]
-; SDAG-NEXT: v_mov_b32_e32 v1, v4
+; SDAG-NEXT: v_add_co_u32_e64 v3, s[4:5], v5, v7
+; SDAG-NEXT: v_addc_co_u32_e64 v4, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT: v_mul_lo_u32 v8, v10, v11
+; SDAG-NEXT: v_mul_lo_u32 v10, v10, v12
+; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v9, v[3:4]
+; SDAG-NEXT: v_add3_u32 v5, v10, v2, v8
+; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v3, v1
+; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v4, v5, s[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v1, v6
+; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6
; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7
-; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; SDAG-NEXT: ; implicit-def: $vgpr8
; SDAG-NEXT: .LBB7_4: ; %Flow
; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[12:13]
; SDAG-NEXT: s_cbranch_execz .LBB7_6
; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12
-; SDAG-NEXT: v_sub_u32_e32 v2, 0x86, v6
-; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[4:5]
+; SDAG-NEXT: v_sub_u32_e32 v2, 0x86, v5
+; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[6:7]
; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v2
; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5]
; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2
-; SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5]
; SDAG-NEXT: v_mul_hi_i32_i24_e32 v1, v0, v8
; SDAG-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; SDAG-NEXT: v_mul_i32_i24_e32 v0, v0, v8
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
index 051a0c51b0867..3d3e8bea7e33e 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
@@ -155,61 +155,61 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
;
; VI-SAFE-SDAG-LABEL: fptrunc_f64_to_f16:
; VI-SAFE-SDAG: ; %bb.0:
-; VI-SAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-SAFE-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; VI-SAFE-SDAG-NEXT: s_mov_b32 s6, -1
+; VI-SAFE-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
+; VI-SAFE-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SAFE-SDAG-NEXT: s_mov_b32 s2, -1
; VI-SAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SAFE-SDAG-NEXT: s_mov_b32 s4, s0
-; VI-SAFE-SDAG-NEXT: s_lshr_b32 s0, s3, 8
-; VI-SAFE-SDAG-NEXT: s_and_b32 s8, s0, 0xffe
-; VI-SAFE-SDAG-NEXT: s_and_b32 s0, s3, 0x1ff
-; VI-SAFE-SDAG-NEXT: s_or_b32 s0, s0, s2
-; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s0, 0
-; VI-SAFE-SDAG-NEXT: s_mov_b32 s5, s1
-; VI-SAFE-SDAG-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; VI-SAFE-SDAG-NEXT: v_readfirstlane_b32 s0, v0
-; VI-SAFE-SDAG-NEXT: s_bfe_u32 s1, s3, 0xb0014
-; VI-SAFE-SDAG-NEXT: s_or_b32 s2, s8, s0
-; VI-SAFE-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s1
+; VI-SAFE-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SAFE-SDAG-NEXT: s_lshr_b32 s4, s7, 8
+; VI-SAFE-SDAG-NEXT: s_and_b32 s8, s4, 0xffe
+; VI-SAFE-SDAG-NEXT: s_and_b32 s4, s7, 0x1ff
+; VI-SAFE-SDAG-NEXT: s_or_b32 s4, s4, s6
+; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0
+; VI-SAFE-SDAG-NEXT: s_mov_b32 s1, s5
+; VI-SAFE-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0
+; VI-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; VI-SAFE-SDAG-NEXT: v_readfirstlane_b32 s4, v0
+; VI-SAFE-SDAG-NEXT: s_bfe_u32 s5, s7, 0xb0014
+; VI-SAFE-SDAG-NEXT: s_or_b32 s6, s8, s4
+; VI-SAFE-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s5
; VI-SAFE-SDAG-NEXT: v_med3_i32 v0, s8, 0, 13
-; VI-SAFE-SDAG-NEXT: s_or_b32 s0, s2, 0x1000
+; VI-SAFE-SDAG-NEXT: s_or_b32 s4, s6, 0x1000
; VI-SAFE-SDAG-NEXT: v_readfirstlane_b32 s8, v0
-; VI-SAFE-SDAG-NEXT: s_lshr_b32 s8, s0, s8
+; VI-SAFE-SDAG-NEXT: s_lshr_b32 s8, s4, s8
; VI-SAFE-SDAG-NEXT: v_lshlrev_b32_e64 v0, v0, s8
-; VI-SAFE-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s0, v0
+; VI-SAFE-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0
; VI-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; VI-SAFE-SDAG-NEXT: s_add_i32 s10, s1, 0xfffffc10
-; VI-SAFE-SDAG-NEXT: v_readfirstlane_b32 s0, v0
-; VI-SAFE-SDAG-NEXT: s_lshl_b32 s1, s10, 12
-; VI-SAFE-SDAG-NEXT: s_or_b32 s0, s8, s0
-; VI-SAFE-SDAG-NEXT: s_or_b32 s1, s2, s1
+; VI-SAFE-SDAG-NEXT: s_add_i32 s10, s5, 0xfffffc10
+; VI-SAFE-SDAG-NEXT: v_readfirstlane_b32 s4, v0
+; VI-SAFE-SDAG-NEXT: s_lshl_b32 s5, s10, 12
+; VI-SAFE-SDAG-NEXT: s_or_b32 s4, s8, s4
+; VI-SAFE-SDAG-NEXT: s_or_b32 s5, s6, s5
; VI-SAFE-SDAG-NEXT: s_cmp_lt_i32 s10, 1
-; VI-SAFE-SDAG-NEXT: s_cselect_b32 s11, s0, s1
+; VI-SAFE-SDAG-NEXT: s_cselect_b32 s11, s4, s5
; VI-SAFE-SDAG-NEXT: s_and_b32 s8, s11, 7
; VI-SAFE-SDAG-NEXT: s_cmp_gt_i32 s8, 5
-; VI-SAFE-SDAG-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-SAFE-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0
; VI-SAFE-SDAG-NEXT: s_cmp_eq_u32 s8, 3
; VI-SAFE-SDAG-NEXT: s_cselect_b64 s[8:9], -1, 0
-; VI-SAFE-SDAG-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1]
+; VI-SAFE-SDAG-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
; VI-SAFE-SDAG-NEXT: s_lshr_b32 s8, s11, 2
-; VI-SAFE-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0
-; VI-SAFE-SDAG-NEXT: s_addc_u32 s0, s8, 0
+; VI-SAFE-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0
+; VI-SAFE-SDAG-NEXT: s_addc_u32 s4, s8, 0
; VI-SAFE-SDAG-NEXT: s_cmp_lt_i32 s10, 31
-; VI-SAFE-SDAG-NEXT: s_cselect_b32 s8, s0, 0x7c00
-; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s2, 0
-; VI-SAFE-SDAG-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-SAFE-SDAG-NEXT: s_cselect_b32 s8, s4, 0x7c00
+; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s6, 0
+; VI-SAFE-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0
+; VI-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; VI-SAFE-SDAG-NEXT: v_lshlrev_b32_e32 v0, 9, v0
; VI-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f
; VI-SAFE-SDAG-NEXT: v_or_b32_e32 v0, 0x7c00, v0
; VI-SAFE-SDAG-NEXT: v_mov_b32_e32 v1, s8
; VI-SAFE-SDAG-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-SAFE-SDAG-NEXT: s_lshr_b32 s0, s3, 16
+; VI-SAFE-SDAG-NEXT: s_lshr_b32 s4, s7, 16
; VI-SAFE-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; VI-SAFE-SDAG-NEXT: s_and_b32 s0, s0, 0x8000
-; VI-SAFE-SDAG-NEXT: v_or_b32_e32 v0, s0, v0
-; VI-SAFE-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-SAFE-SDAG-NEXT: s_and_b32 s4, s4, 0x8000
+; VI-SAFE-SDAG-NEXT: v_or_b32_e32 v0, s4, v0
+; VI-SAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-SAFE-SDAG-NEXT: s_endpgm
;
; VI-SAFE-GISEL-LABEL: fptrunc_f64_to_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index 78e521aba120e..d702de02dbefb 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -925,11 +925,10 @@ define void @void_func_v2i8(<2 x i8> %arg0) #0 {
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; CI-NEXT: v_and_b32_e32 v0, 0xff, v0
-; CI-NEXT: s_mov_b32 s4, 0
; CI-NEXT: v_or_b32_e32 v0, v0, v1
+; CI-NEXT: s_mov_b64 s[4:5], 0
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
-; CI-NEXT: s_mov_b32 s5, s4
; CI-NEXT: buffer_store_short v0, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_setpc_b64 s[30:31]
@@ -938,11 +937,10 @@ define void @void_func_v2i8(<2 x i8> %arg0) #0 {
; GFX89: ; %bb.0:
; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX89-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX89-NEXT: s_mov_b32 s4, 0
; GFX89-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX89-NEXT: s_mov_b64 s[4:5], 0
; GFX89-NEXT: s_mov_b32 s7, 0xf000
; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s5, s4
; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: s_setpc_b64 s[30:31]
@@ -952,10 +950,10 @@ define void @void_func_v2i8(<2 x i8> %arg0) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: s_mov_b32 s1, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1002,13 +1000,12 @@ define void @void_func_v3i8(<3 x i8> %arg0) #0 {
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; CI-NEXT: v_and_b32_e32 v0, 0xff, v0
-; CI-NEXT: s_mov_b32 s5, 0
-; CI-NEXT: s_mov_b32 s4, 2
+; CI-NEXT: s_mov_b64 s[4:5], 2
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
; CI-NEXT: v_or_b32_e32 v0, v0, v1
; CI-NEXT: buffer_store_byte v2, off, s[4:7], 0
-; CI-NEXT: s_mov_b32 s4, s5
+; CI-NEXT: s_mov_b64 s[4:5], 0
; CI-NEXT: buffer_store_short v0, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_setpc_b64 s[30:31]
@@ -1017,13 +1014,12 @@ define void @void_func_v3i8(<3 x i8> %arg0) #0 {
; GFX89: ; %bb.0:
; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX89-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX89-NEXT: s_mov_b32 s5, 0
-; GFX89-NEXT: s_mov_b32 s4, 2
+; GFX89-NEXT: s_mov_b64 s[4:5], 2
; GFX89-NEXT: s_mov_b32 s7, 0xf000
; GFX89-NEXT: s_mov_b32 s6, -1
; GFX89-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX89-NEXT: buffer_store_byte v2, off, s[4:7], 0
-; GFX89-NEXT: s_mov_b32 s4, s5
+; GFX89-NEXT: s_mov_b64 s[4:5], 0
; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: s_setpc_b64 s[30:31]
@@ -1033,13 +1029,13 @@ define void @void_func_v3i8(<3 x i8> %arg0) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_mov_b32 s0, 2
+; GFX11-NEXT: s_mov_b64 s[0:1], 2
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-NEXT: buffer_store_b8 v2, off, s[0:3], 0
-; GFX11-NEXT: s_mov_b32 s0, s1
+; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_setpc_b64 s[30:31]
store <3 x i8> %arg0, ptr addrspace(1) null
@@ -1058,11 +1054,10 @@ define void @void_func_v4i8(<4 x i8> %arg0) #0 {
; CI-NEXT: v_or_b32_e32 v0, v0, v1
; CI-NEXT: v_or_b32_e32 v2, v3, v2
; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; CI-NEXT: s_mov_b32 s4, 0
; CI-NEXT: v_or_b32_e32 v0, v0, v2
+; CI-NEXT: s_mov_b64 s[4:5], 0
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
-; CI-NEXT: s_mov_b32 s5, s4
; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_setpc_b64 s[30:31]
@@ -1074,11 +1069,10 @@ define void @void_func_v4i8(<4 x i8> %arg0) #0 {
; GFX89-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX89-NEXT: v_lshlrev_b16_e32 v1, 8, v3
; GFX89-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX89-NEXT: s_mov_b32 s4, 0
; GFX89-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX89-NEXT: s_mov_b64 s[4:5], 0
; GFX89-NEXT: s_mov_b32 s7, 0xf000
; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s5, s4
; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: s_setpc_b64 s[30:31]
@@ -1090,12 +1084,11 @@ define void @void_func_v4i8(<4 x i8> %arg0) #0 {
; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-NEXT: v_lshlrev_b16 v3, 8, v3
; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT: s_mov_b32 s1, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@@ -1119,13 +1112,12 @@ define void @void_func_v5i8(<5 x i8> %arg0) #0 {
; CI-NEXT: v_or_b32_e32 v0, v0, v1
; CI-NEXT: v_or_b32_e32 v2, v3, v2
; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; CI-NEXT: s_mov_b32 s5, 0
-; CI-NEXT: s_mov_b32 s4, 4
+; CI-NEXT: s_mov_b64 s[4:5], 4
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
; CI-NEXT: v_or_b32_e32 v0, v0, v2
; CI-NEXT: buffer_store_byte v4, off, s[4:7], 0
-; CI-NEXT: s_mov_b32 s4, s5
+; CI-NEXT: s_mov_b64 s[4:5], 0
; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_setpc_b64 s[30:31]
@@ -1137,13 +1129,12 @@ define void @void_func_v5i8(<5 x i8> %arg0) #0 {
; GFX89-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX89-NEXT: v_lshlrev_b16_e32 v1, 8, v3
; GFX89-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX89-NEXT: s_mov_b32 s5, 0
-; GFX89-NEXT: s_mov_b32 s4, 4
+; GFX89-NEXT: s_mov_b64 s[4:5], 4
; GFX89-NEXT: s_mov_b32 s7, 0xf000
; GFX89-NEXT: s_mov_b32 s6, -1
; GFX89-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX89-NEXT: buffer_store_byte v4, off, s[4:7], 0
-; GFX89-NEXT: s_mov_b32 s4, s5
+; GFX89-NEXT: s_mov_b64 s[4:5], 0
; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: s_setpc_b64 s[30:31]
@@ -1155,17 +1146,16 @@ define void @void_func_v5i8(<5 x i8> %arg0) #0 {
; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-NEXT: v_lshlrev_b16 v3, 8, v3
; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_mov_b32 s0, 4
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT: s_mov_b64 s[0:1], 4
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: v_or_b32_e32 v1, v2, v3
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-NEXT: buffer_store_b8 v4, off, s[0:3], 0
+; GFX11-NEXT: s_mov_b64 s[0:1], 0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_mov_b32 s0, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1193,12 +1183,11 @@ define void @void_func_v8i8(<8 x i8> %arg0) #0 {
; CI-NEXT: v_and_b32_e32 v4, 0xffff, v4
; CI-NEXT: v_or_b32_e32 v2, v3, v2
; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; CI-NEXT: s_mov_b32 s4, 0
; CI-NEXT: v_or_b32_e32 v4, v4, v6
; CI-NEXT: v_or_b32_e32 v3, v0, v2
+; CI-NEXT: s_mov_b64 s[4:5], 0
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
-; CI-NEXT: s_mov_b32 s5, s4
; CI-NEXT: buffer_store_dwordx2 v[3:4], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_setpc_b64 s[30:31]
@@ -1214,12 +1203,11 @@ define void @void_func_v8i8(<8 x i8> %arg0) #0 {
; GFX89-NEXT: v_lshlrev_b16_e32 v1, 8, v3
; GFX89-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX89-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX89-NEXT: s_mov_b32 s4, 0
; GFX89-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX89-NEXT: v_or_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX89-NEXT: s_mov_b64 s[4:5], 0
; GFX89-NEXT: s_mov_b32 s7, 0xf000
; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s5, s4
; GFX89-NEXT: buffer_store_dwordx2 v[3:4], off, s[4:7], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: s_setpc_b64 s[30:31]
@@ -1238,7 +1226,7 @@ define void @void_func_v8i8(<8 x i8> %arg0) #0 {
; GFX11-NEXT: v_or_b32_e32 v4, v4, v5
; GFX11-NEXT: v_or_b32_e32 v5, v6, v7
; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: v_or_b32_e32 v1, v2, v3
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v4
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v5
@@ -1247,7 +1235,6 @@ define void @void_func_v8i8(<8 x i8> %arg0) #0 {
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v1
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT: s_mov_b32 s1, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_or_b32_e32 v0, v0, v4
; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
@@ -1292,14 +1279,13 @@ define void @void_func_v16i8(<16 x i8> %arg0) #0 {
; CI-NEXT: v_and_b32_e32 v4, 0xffff, v4
; CI-NEXT: v_or_b32_e32 v2, v3, v2
; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; CI-NEXT: s_mov_b32 s4, 0
; CI-NEXT: v_or_b32_e32 v12, v12, v14
; CI-NEXT: v_or_b32_e32 v11, v8, v10
; CI-NEXT: v_or_b32_e32 v10, v4, v6
; CI-NEXT: v_or_b32_e32 v9, v0, v2
+; CI-NEXT: s_mov_b64 s[4:5], 0
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
-; CI-NEXT: s_mov_b32 s5, s4
; CI-NEXT: buffer_store_dwordx4 v[9:12], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_setpc_b64 s[30:31]
@@ -1323,14 +1309,13 @@ define void @void_func_v16i8(<16 x i8> %arg0) #0 {
; GFX89-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX89-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX89-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX89-NEXT: s_mov_b32 s4, 0
; GFX89-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX89-NEXT: v_or_b32_sdwa v11, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX89-NEXT: v_or_b32_sdwa v10, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX89-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX89-NEXT: s_mov_b64 s[4:5], 0
; GFX89-NEXT: s_mov_b32 s7, 0xf000
; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s5, s4
; GFX89-NEXT: buffer_store_dwordx4 v[9:12], off, s[4:7], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: s_setpc_b64 s[30:31]
@@ -1373,11 +1358,10 @@ define void @void_func_v16i8(<16 x i8> %arg0) #0 {
; GFX11-NEXT: v_or_b32_e32 v3, v9, v12
; GFX11-NEXT: v_or_b32_e32 v2, v8, v2
; GFX11-NEXT: v_or_b32_e32 v1, v4, v5
-; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: v_or_b32_e32 v0, v0, v6
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: s_mov_b32 s1, s0
; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
; GFX11-NEXT: s_setpc_b64 s[30:31]
store <16 x i8> %arg0, ptr addrspace(1) null
@@ -1439,8 +1423,7 @@ define void @void_func_v32i8(<32 x i8> %arg0) #0 {
; CI-NEXT: v_or_b32_e32 v2, v8, v9
; CI-NEXT: v_and_b32_e32 v8, 0xff, v20
; CI-NEXT: v_and_b32_e32 v9, 0xff, v16
-; CI-NEXT: s_mov_b32 s5, 0
-; CI-NEXT: s_mov_b32 s4, 16
+; CI-NEXT: s_mov_b64 s[4:5], 16
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -1467,7 +1450,7 @@ define void @void_func_v32i8(<32 x i8> %arg0) #0 {
; CI-NEXT: v_and_b32_e32 v8, 0xffff, v8
; CI-NEXT: v_or_b32_e32 v4, v8, v4
; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
-; CI-NEXT: s_mov_b32 s4, s5
+; CI-NEXT: s_mov_b64 s[4:5], 0
; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_setpc_b64 s[30:31]
@@ -1507,8 +1490,7 @@ define void @void_func_v32i8(<32 x i8> %arg0) #0 {
; GFX89-NEXT: v_or_b32_sdwa v20, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX89-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX89-NEXT: v_or_b32_sdwa v17, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX89-NEXT: s_mov_b32 s5, 0
-; GFX89-NEXT: s_mov_b32 s4, 16
+; GFX89-NEXT: s_mov_b64 s[4:5], 16
; GFX89-NEXT: s_mov_b32 s7, 0xf000
; GFX89-NEXT: s_mov_b32 s6, -1
; GFX89-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -1522,7 +1504,7 @@ define void @void_func_v32i8(<32 x i8> %arg0) #0 {
; GFX89-NEXT: v_or_b32_sdwa v8, v30, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX89-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
-; GFX89-NEXT: s_mov_b32 s4, s5
+; GFX89-NEXT: s_mov_b64 s[4:5], 0
; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: s_setpc_b64 s[30:31]
@@ -1598,8 +1580,7 @@ define void @void_func_v32i8(<32 x i8> %arg0) #0 {
; GFX11-NEXT: v_or_b32_e32 v3, v12, v13
; GFX11-NEXT: v_or_b32_e32 v2, v8, v9
; GFX11-NEXT: v_or_b32_e32 v0, v0, v17
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_mov_b32 s0, 16
+; GFX11-NEXT: s_mov_b64 s[0:1], 16
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1611,7 +1592,7 @@ define void @void_func_v32i8(<32 x i8> %arg0) #0 {
; GFX11-NEXT: v_or_b32_e32 v7, v18, v1
; GFX11-NEXT: v_or_b32_e32 v1, v15, v16
; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0
-; GFX11-NEXT: s_mov_b32 s0, s1
+; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
; GFX11-NEXT: s_setpc_b64 s[30:31]
store <32 x i8> %arg0, ptr addrspace(1) null
diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
index 5dff660912e40..81eac63ae5bdf 100644
--- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
@@ -8,158 +8,115 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
; CHECK-NEXT: v_writelane_b32 v5, s30, 0
; CHECK-NEXT: v_writelane_b32 v5, s31, 1
-; CHECK-NEXT: v_writelane_b32 v5, s36, 2
-; CHECK-NEXT: v_writelane_b32 v5, s37, 3
-; CHECK-NEXT: v_writelane_b32 v5, s38, 4
-; CHECK-NEXT: v_writelane_b32 v5, s39, 5
-; CHECK-NEXT: v_writelane_b32 v5, s40, 6
-; CHECK-NEXT: v_writelane_b32 v5, s41, 7
-; CHECK-NEXT: v_writelane_b32 v5, s42, 8
-; CHECK-NEXT: v_writelane_b32 v5, s43, 9
-; CHECK-NEXT: v_writelane_b32 v5, s44, 10
-; CHECK-NEXT: v_writelane_b32 v5, s45, 11
-; CHECK-NEXT: v_writelane_b32 v5, s46, 12
-; CHECK-NEXT: v_writelane_b32 v5, s47, 13
-; CHECK-NEXT: v_writelane_b32 v5, s48, 14
-; CHECK-NEXT: v_writelane_b32 v5, s49, 15
+; CHECK-NEXT: v_writelane_b32 v5, s34, 2
+; CHECK-NEXT: v_writelane_b32 v5, s35, 3
+; CHECK-NEXT: v_writelane_b32 v5, s36, 4
+; CHECK-NEXT: v_writelane_b32 v5, s37, 5
+; CHECK-NEXT: v_writelane_b32 v5, s38, 6
+; CHECK-NEXT: v_writelane_b32 v5, s39, 7
+; CHECK-NEXT: v_writelane_b32 v5, s40, 8
+; CHECK-NEXT: v_writelane_b32 v5, s41, 9
+; CHECK-NEXT: v_writelane_b32 v5, s42, 10
+; CHECK-NEXT: v_writelane_b32 v5, s43, 11
+; CHECK-NEXT: v_writelane_b32 v5, s44, 12
+; CHECK-NEXT: v_writelane_b32 v5, s45, 13
+; CHECK-NEXT: v_writelane_b32 v5, s46, 14
; CHECK-NEXT: s_getpc_b64 s[24:25]
-; CHECK-NEXT: v_writelane_b32 v5, s50, 16
-; CHECK-NEXT: s_movk_i32 s4, 0xf0
-; CHECK-NEXT: s_mov_b32 s5, s24
-; CHECK-NEXT: v_writelane_b32 v5, s51, 17
-; CHECK-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
-; CHECK-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
-; CHECK-NEXT: s_mov_b64 s[4:5], 0
-; CHECK-NEXT: s_load_dwordx4 s[28:31], s[4:5], 0x0
-; CHECK-NEXT: s_movk_i32 s20, 0x130
+; CHECK-NEXT: v_writelane_b32 v5, s47, 15
+; CHECK-NEXT: s_movk_i32 s20, 0xf0
; CHECK-NEXT: s_mov_b32 s21, s24
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_writelane_b32 v7, s36, 0
-; CHECK-NEXT: v_writelane_b32 v7, s37, 1
-; CHECK-NEXT: v_writelane_b32 v7, s38, 2
-; CHECK-NEXT: v_writelane_b32 v7, s39, 3
-; CHECK-NEXT: v_writelane_b32 v7, s40, 4
-; CHECK-NEXT: v_writelane_b32 v7, s41, 5
-; CHECK-NEXT: v_writelane_b32 v7, s42, 6
-; CHECK-NEXT: v_writelane_b32 v7, s43, 7
-; CHECK-NEXT: v_writelane_b32 v7, s44, 8
-; CHECK-NEXT: v_writelane_b32 v7, s45, 9
-; CHECK-NEXT: v_writelane_b32 v7, s46, 10
+; CHECK-NEXT: v_writelane_b32 v5, s48, 16
; CHECK-NEXT: s_load_dwordx16 s[4:19], s[20:21], 0x0
-; CHECK-NEXT: v_writelane_b32 v7, s47, 11
-; CHECK-NEXT: v_writelane_b32 v7, s48, 12
-; CHECK-NEXT: s_mov_b32 s20, 0
+; CHECK-NEXT: s_mov_b64 s[20:21], 0
+; CHECK-NEXT: v_writelane_b32 v5, s49, 17
+; CHECK-NEXT: s_load_dwordx4 s[20:23], s[20:21], 0x0
+; CHECK-NEXT: v_writelane_b32 v5, s50, 18
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_movk_i32 s22, 0x130
+; CHECK-NEXT: s_mov_b32 s23, s24
+; CHECK-NEXT: v_writelane_b32 v5, s51, 19
+; CHECK-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0x0
+; CHECK-NEXT: s_mov_b32 s28, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 0
-; CHECK-NEXT: v_writelane_b32 v7, s49, 13
-; CHECK-NEXT: v_mov_b32_e32 v2, s28
+; CHECK-NEXT: v_mov_b32_e32 v2, s20
; CHECK-NEXT: v_mov_b32_e32 v3, v1
-; CHECK-NEXT: s_mov_b32 s21, s20
-; CHECK-NEXT: s_mov_b32 s22, s20
-; CHECK-NEXT: s_mov_b32 s23, s20
-; CHECK-NEXT: v_writelane_b32 v7, s50, 14
-; CHECK-NEXT: v_writelane_b32 v7, s51, 15
-; CHECK-NEXT: image_sample_lz v3, v[2:3], s[44:51], s[20:23] dmask:0x1
+; CHECK-NEXT: s_mov_b32 s29, s28
+; CHECK-NEXT: s_mov_b32 s30, s28
+; CHECK-NEXT: s_mov_b32 s31, s28
+; CHECK-NEXT: image_sample_lz v3, v[2:3], s[12:19], s[28:31] dmask:0x1
; CHECK-NEXT: v_mov_b32_e32 v2, v1
+; CHECK-NEXT: ; implicit-def: $vgpr6 : SGPR spill to VGPR lane
+; CHECK-NEXT: v_writelane_b32 v5, s52, 20
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_writelane_b32 v7, s4, 16
-; CHECK-NEXT: v_writelane_b32 v7, s5, 17
-; CHECK-NEXT: v_writelane_b32 v7, s6, 18
-; CHECK-NEXT: v_writelane_b32 v7, s7, 19
-; CHECK-NEXT: v_writelane_b32 v7, s8, 20
-; CHECK-NEXT: v_writelane_b32 v7, s9, 21
-; CHECK-NEXT: image_sample_lz v4, v[1:2], s[4:11], s[20:23] dmask:0x1
-; CHECK-NEXT: v_writelane_b32 v7, s10, 22
-; CHECK-NEXT: v_writelane_b32 v7, s11, 23
-; CHECK-NEXT: v_writelane_b32 v7, s12, 24
-; CHECK-NEXT: v_writelane_b32 v7, s13, 25
-; CHECK-NEXT: v_writelane_b32 v7, s14, 26
-; CHECK-NEXT: v_writelane_b32 v7, s15, 27
-; CHECK-NEXT: v_writelane_b32 v5, s52, 18
-; CHECK-NEXT: v_writelane_b32 v7, s16, 28
-; CHECK-NEXT: v_writelane_b32 v5, s53, 19
-; CHECK-NEXT: v_writelane_b32 v7, s17, 29
-; CHECK-NEXT: v_writelane_b32 v5, s54, 20
-; CHECK-NEXT: v_writelane_b32 v7, s18, 30
-; CHECK-NEXT: s_mov_b32 s26, 48
-; CHECK-NEXT: s_mov_b32 s27, s24
-; CHECK-NEXT: v_writelane_b32 v5, s55, 21
-; CHECK-NEXT: v_writelane_b32 v7, s19, 31
-; CHECK-NEXT: s_load_dwordx8 s[4:11], s[26:27], 0x0
-; CHECK-NEXT: v_writelane_b32 v5, s56, 22
-; CHECK-NEXT: v_writelane_b32 v5, s57, 23
-; CHECK-NEXT: v_writelane_b32 v5, s58, 24
-; CHECK-NEXT: v_writelane_b32 v5, s59, 25
-; CHECK-NEXT: v_writelane_b32 v5, s60, 26
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_writelane_b32 v7, s4, 32
-; CHECK-NEXT: v_writelane_b32 v5, s61, 27
-; CHECK-NEXT: v_writelane_b32 v7, s5, 33
-; CHECK-NEXT: v_writelane_b32 v5, s62, 28
-; CHECK-NEXT: v_writelane_b32 v7, s6, 34
-; CHECK-NEXT: v_writelane_b32 v5, s63, 29
-; CHECK-NEXT: v_writelane_b32 v7, s7, 35
-; CHECK-NEXT: v_writelane_b32 v5, s64, 30
-; CHECK-NEXT: v_writelane_b32 v7, s8, 36
-; CHECK-NEXT: v_writelane_b32 v5, s65, 31
-; CHECK-NEXT: v_writelane_b32 v7, s9, 37
-; CHECK-NEXT: v_writelane_b32 v5, s66, 32
-; CHECK-NEXT: s_movk_i32 s28, 0x1f0
-; CHECK-NEXT: s_movk_i32 s30, 0x2f0
-; CHECK-NEXT: s_mov_b32 s29, s24
-; CHECK-NEXT: s_mov_b32 s31, s24
-; CHECK-NEXT: v_writelane_b32 v7, s10, 38
-; CHECK-NEXT: v_writelane_b32 v5, s67, 33
-; CHECK-NEXT: v_writelane_b32 v7, s11, 39
-; CHECK-NEXT: s_load_dwordx16 s[52:67], s[28:29], 0x0
-; CHECK-NEXT: s_load_dwordx16 s[4:19], s[30:31], 0x0
+; CHECK-NEXT: v_writelane_b32 v6, s36, 0
+; CHECK-NEXT: v_writelane_b32 v5, s53, 21
+; CHECK-NEXT: v_writelane_b32 v5, s54, 22
+; CHECK-NEXT: v_writelane_b32 v5, s55, 23
+; CHECK-NEXT: v_writelane_b32 v5, s56, 24
+; CHECK-NEXT: v_writelane_b32 v6, s37, 1
+; CHECK-NEXT: image_sample_lz v4, v[1:2], s[36:43], s[28:31] dmask:0x1
+; CHECK-NEXT: v_writelane_b32 v5, s57, 25
+; CHECK-NEXT: v_writelane_b32 v6, s38, 2
+; CHECK-NEXT: v_writelane_b32 v5, s58, 26
+; CHECK-NEXT: v_writelane_b32 v6, s39, 3
+; CHECK-NEXT: v_writelane_b32 v5, s59, 27
+; CHECK-NEXT: v_writelane_b32 v6, s40, 4
+; CHECK-NEXT: v_writelane_b32 v5, s60, 28
+; CHECK-NEXT: v_writelane_b32 v6, s41, 5
+; CHECK-NEXT: v_writelane_b32 v5, s61, 29
+; CHECK-NEXT: v_writelane_b32 v6, s42, 6
+; CHECK-NEXT: v_writelane_b32 v5, s62, 30
+; CHECK-NEXT: v_writelane_b32 v6, s43, 7
+; CHECK-NEXT: v_writelane_b32 v5, s63, 31
+; CHECK-NEXT: v_writelane_b32 v6, s44, 8
+; CHECK-NEXT: v_writelane_b32 v5, s64, 32
+; CHECK-NEXT: v_writelane_b32 v6, s45, 9
+; CHECK-NEXT: v_writelane_b32 v5, s65, 33
+; CHECK-NEXT: v_writelane_b32 v6, s46, 10
+; CHECK-NEXT: v_writelane_b32 v5, s66, 34
+; CHECK-NEXT: v_writelane_b32 v6, s47, 11
+; CHECK-NEXT: v_writelane_b32 v5, s67, 35
+; CHECK-NEXT: v_writelane_b32 v6, s48, 12
+; CHECK-NEXT: v_writelane_b32 v5, s68, 36
+; CHECK-NEXT: v_writelane_b32 v6, s49, 13
+; CHECK-NEXT: v_writelane_b32 v5, s69, 37
+; CHECK-NEXT: v_writelane_b32 v6, s50, 14
+; CHECK-NEXT: s_mov_b32 s34, 48
+; CHECK-NEXT: s_movk_i32 s52, 0x1f0
+; CHECK-NEXT: s_movk_i32 s68, 0x2f0
+; CHECK-NEXT: s_mov_b32 s35, s24
+; CHECK-NEXT: s_mov_b32 s53, s24
+; CHECK-NEXT: s_mov_b32 s69, s24
+; CHECK-NEXT: v_writelane_b32 v6, s51, 15
+; CHECK-NEXT: s_load_dwordx8 s[20:27], s[34:35], 0x0
+; CHECK-NEXT: s_load_dwordx16 s[36:51], s[52:53], 0x0
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
+; CHECK-NEXT: s_load_dwordx16 s[52:67], s[68:69], 0x0
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; CHECK-NEXT: s_xor_b64 s[24:25], vcc, -1
+; CHECK-NEXT: v_writelane_b32 v5, s70, 38
+; CHECK-NEXT: s_xor_b64 s[34:35], vcc, -1
+; CHECK-NEXT: v_writelane_b32 v5, s71, 39
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mul_f32_e32 v0, v4, v3
-; CHECK-NEXT: s_and_saveexec_b64 s[26:27], s[24:25]
-; CHECK-NEXT: s_xor_b64 s[26:27], exec, s[26:27]
+; CHECK-NEXT: s_and_saveexec_b64 vcc, s[34:35]
+; CHECK-NEXT: s_xor_b64 s[68:69], exec, vcc
; CHECK-NEXT: s_cbranch_execz .LBB0_3
; CHECK-NEXT: ; %bb.1: ; %bb48
-; CHECK-NEXT: v_readlane_b32 s36, v7, 0
-; CHECK-NEXT: v_readlane_b32 s44, v7, 8
-; CHECK-NEXT: v_readlane_b32 s45, v7, 9
-; CHECK-NEXT: v_readlane_b32 s46, v7, 10
-; CHECK-NEXT: v_readlane_b32 s47, v7, 11
-; CHECK-NEXT: v_readlane_b32 s48, v7, 12
-; CHECK-NEXT: v_readlane_b32 s49, v7, 13
-; CHECK-NEXT: v_readlane_b32 s50, v7, 14
-; CHECK-NEXT: v_readlane_b32 s51, v7, 15
-; CHECK-NEXT: s_and_b64 vcc, exec, -1
-; CHECK-NEXT: v_readlane_b32 s37, v7, 1
-; CHECK-NEXT: v_readlane_b32 s38, v7, 2
-; CHECK-NEXT: v_readlane_b32 s39, v7, 3
-; CHECK-NEXT: v_readlane_b32 s40, v7, 4
-; CHECK-NEXT: image_sample_lz v3, v[1:2], s[44:51], s[20:23] dmask:0x1
+; CHECK-NEXT: image_sample_lz v3, v[1:2], s[12:19], s[28:31] dmask:0x1
; CHECK-NEXT: v_mov_b32_e32 v2, 0
-; CHECK-NEXT: v_readlane_b32 s41, v7, 5
-; CHECK-NEXT: v_readlane_b32 s42, v7, 6
-; CHECK-NEXT: v_readlane_b32 s43, v7, 7
+; CHECK-NEXT: s_and_b64 vcc, exec, -1
; CHECK-NEXT: .LBB0_2: ; %bb50
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: v_readlane_b32 s36, v7, 32
-; CHECK-NEXT: v_readlane_b32 s40, v7, 36
-; CHECK-NEXT: v_readlane_b32 s41, v7, 37
-; CHECK-NEXT: v_readlane_b32 s42, v7, 38
-; CHECK-NEXT: v_readlane_b32 s43, v7, 39
-; CHECK-NEXT: s_mov_b32 s21, s20
-; CHECK-NEXT: s_mov_b32 s22, s20
-; CHECK-NEXT: s_mov_b32 s23, s20
-; CHECK-NEXT: v_readlane_b32 s37, v7, 33
-; CHECK-NEXT: v_readlane_b32 s38, v7, 34
+; CHECK-NEXT: s_mov_b32 s29, s28
+; CHECK-NEXT: s_mov_b32 s30, s28
+; CHECK-NEXT: s_mov_b32 s31, s28
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: image_sample_lz v4, v[1:2], s[60:67], s[40:43] dmask:0x1
-; CHECK-NEXT: v_readlane_b32 s39, v7, 35
-; CHECK-NEXT: image_sample_lz v1, v[1:2], s[12:19], s[20:23] dmask:0x1
+; CHECK-NEXT: image_sample_lz v4, v[1:2], s[44:51], s[24:27] dmask:0x1
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: image_sample_lz v1, v[1:2], s[60:67], s[28:31] dmask:0x1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_sub_f32_e32 v1, v1, v4
; CHECK-NEXT: v_mul_f32_e32 v1, v1, v0
@@ -167,159 +124,69 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: s_mov_b64 vcc, vcc
; CHECK-NEXT: s_cbranch_vccnz .LBB0_2
; CHECK-NEXT: .LBB0_3: ; %Flow14
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_readlane_b32 s12, v7, 32
-; CHECK-NEXT: v_readlane_b32 s13, v7, 33
-; CHECK-NEXT: v_readlane_b32 s14, v7, 34
-; CHECK-NEXT: v_readlane_b32 s15, v7, 35
-; CHECK-NEXT: v_readlane_b32 s16, v7, 36
-; CHECK-NEXT: v_readlane_b32 s17, v7, 37
-; CHECK-NEXT: v_readlane_b32 s18, v7, 38
-; CHECK-NEXT: v_readlane_b32 s19, v7, 39
-; CHECK-NEXT: v_writelane_b32 v7, s4, 40
-; CHECK-NEXT: v_writelane_b32 v7, s5, 41
-; CHECK-NEXT: v_writelane_b32 v7, s6, 42
-; CHECK-NEXT: v_writelane_b32 v7, s7, 43
-; CHECK-NEXT: v_writelane_b32 v7, s8, 44
-; CHECK-NEXT: v_writelane_b32 v7, s9, 45
-; CHECK-NEXT: v_writelane_b32 v7, s10, 46
-; CHECK-NEXT: v_writelane_b32 v7, s11, 47
-; CHECK-NEXT: v_writelane_b32 v7, s12, 48
-; CHECK-NEXT: v_writelane_b32 v7, s13, 49
-; CHECK-NEXT: v_writelane_b32 v7, s14, 50
-; CHECK-NEXT: v_writelane_b32 v7, s15, 51
-; CHECK-NEXT: v_writelane_b32 v7, s16, 52
-; CHECK-NEXT: v_writelane_b32 v7, s17, 53
-; CHECK-NEXT: v_writelane_b32 v7, s18, 54
-; CHECK-NEXT: v_writelane_b32 v7, s19, 55
-; CHECK-NEXT: ; implicit-def: $vgpr6 : SGPR spill to VGPR lane
-; CHECK-NEXT: v_writelane_b32 v7, s52, 56
-; CHECK-NEXT: v_writelane_b32 v6, s60, 0
-; CHECK-NEXT: v_writelane_b32 v7, s53, 57
-; CHECK-NEXT: v_writelane_b32 v6, s61, 1
-; CHECK-NEXT: v_writelane_b32 v7, s54, 58
-; CHECK-NEXT: v_writelane_b32 v6, s62, 2
-; CHECK-NEXT: v_writelane_b32 v7, s55, 59
-; CHECK-NEXT: v_writelane_b32 v6, s63, 3
-; CHECK-NEXT: v_writelane_b32 v7, s56, 60
-; CHECK-NEXT: v_writelane_b32 v6, s64, 4
-; CHECK-NEXT: v_writelane_b32 v7, s57, 61
-; CHECK-NEXT: v_writelane_b32 v6, s65, 5
-; CHECK-NEXT: v_writelane_b32 v7, s58, 62
-; CHECK-NEXT: v_writelane_b32 v6, s66, 6
-; CHECK-NEXT: v_writelane_b32 v7, s59, 63
-; CHECK-NEXT: v_writelane_b32 v6, s67, 7
-; CHECK-NEXT: s_andn2_saveexec_b64 s[20:21], s[26:27]
+; CHECK-NEXT: s_andn2_saveexec_b64 s[12:13], s[68:69]
; CHECK-NEXT: s_cbranch_execz .LBB0_10
; CHECK-NEXT: ; %bb.4: ; %bb32
-; CHECK-NEXT: s_and_saveexec_b64 s[8:9], s[24:25]
-; CHECK-NEXT: s_xor_b64 s[22:23], exec, s[8:9]
+; CHECK-NEXT: s_and_saveexec_b64 s[14:15], s[34:35]
+; CHECK-NEXT: s_xor_b64 s[14:15], exec, s[14:15]
; CHECK-NEXT: s_cbranch_execz .LBB0_6
; CHECK-NEXT: ; %bb.5: ; %bb43
-; CHECK-NEXT: s_mov_b32 s8, 0
-; CHECK-NEXT: s_mov_b32 s9, s8
-; CHECK-NEXT: v_mov_b32_e32 v0, s8
-; CHECK-NEXT: v_readlane_b32 s36, v7, 0
-; CHECK-NEXT: v_mov_b32_e32 v1, s9
-; CHECK-NEXT: s_mov_b32 s10, s8
-; CHECK-NEXT: s_mov_b32 s11, s8
-; CHECK-NEXT: v_readlane_b32 s37, v7, 1
-; CHECK-NEXT: v_readlane_b32 s38, v7, 2
-; CHECK-NEXT: v_readlane_b32 s39, v7, 3
-; CHECK-NEXT: v_readlane_b32 s40, v7, 4
-; CHECK-NEXT: v_readlane_b32 s41, v7, 5
-; CHECK-NEXT: v_readlane_b32 s42, v7, 6
-; CHECK-NEXT: v_readlane_b32 s43, v7, 7
-; CHECK-NEXT: v_readlane_b32 s44, v7, 8
-; CHECK-NEXT: v_readlane_b32 s45, v7, 9
-; CHECK-NEXT: v_readlane_b32 s46, v7, 10
-; CHECK-NEXT: v_readlane_b32 s47, v7, 11
-; CHECK-NEXT: v_readlane_b32 s48, v7, 12
-; CHECK-NEXT: v_readlane_b32 s49, v7, 13
-; CHECK-NEXT: v_readlane_b32 s50, v7, 14
-; CHECK-NEXT: v_readlane_b32 s51, v7, 15
-; CHECK-NEXT: image_sample_lz v2, v[0:1], s[36:43], s[8:11] dmask:0x1
-; CHECK-NEXT: v_readlane_b32 s36, v7, 16
-; CHECK-NEXT: v_readlane_b32 s44, v7, 24
-; CHECK-NEXT: v_readlane_b32 s45, v7, 25
-; CHECK-NEXT: v_readlane_b32 s46, v7, 26
-; CHECK-NEXT: v_readlane_b32 s47, v7, 27
-; CHECK-NEXT: v_readlane_b32 s48, v7, 28
-; CHECK-NEXT: v_readlane_b32 s49, v7, 29
-; CHECK-NEXT: v_readlane_b32 s50, v7, 30
-; CHECK-NEXT: v_readlane_b32 s51, v7, 31
-; CHECK-NEXT: v_mov_b32_e32 v3, 0
-; CHECK-NEXT: v_mov_b32_e32 v4, v3
-; CHECK-NEXT: v_readlane_b32 s37, v7, 17
-; CHECK-NEXT: v_readlane_b32 s38, v7, 18
-; CHECK-NEXT: v_readlane_b32 s39, v7, 19
-; CHECK-NEXT: image_sample_lz v0, v[0:1], s[44:51], s[12:15] dmask:0x1
-; CHECK-NEXT: v_readlane_b32 s40, v7, 20
-; CHECK-NEXT: v_readlane_b32 s41, v7, 21
-; CHECK-NEXT: v_readlane_b32 s42, v7, 22
-; CHECK-NEXT: v_readlane_b32 s43, v7, 23
+; CHECK-NEXT: s_mov_b32 s16, 0
+; CHECK-NEXT: s_mov_b32 s17, s16
+; CHECK-NEXT: v_mov_b32_e32 v2, s16
+; CHECK-NEXT: v_mov_b32_e32 v3, s17
+; CHECK-NEXT: s_mov_b32 s18, s16
+; CHECK-NEXT: s_mov_b32 s19, s16
+; CHECK-NEXT: image_sample_lz v1, v[2:3], s[4:11], s[16:19] dmask:0x1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_mov_b64 s[4:5], s[36:37]
+; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
+; CHECK-NEXT: s_mov_b64 s[8:9], s[40:41]
+; CHECK-NEXT: s_mov_b64 s[10:11], s[42:43]
+; CHECK-NEXT: v_readlane_b32 s36, v6, 0
+; CHECK-NEXT: v_readlane_b32 s44, v6, 8
+; CHECK-NEXT: v_readlane_b32 s45, v6, 9
+; CHECK-NEXT: v_readlane_b32 s46, v6, 10
+; CHECK-NEXT: v_readlane_b32 s47, v6, 11
+; CHECK-NEXT: v_readlane_b32 s48, v6, 12
+; CHECK-NEXT: v_readlane_b32 s49, v6, 13
+; CHECK-NEXT: v_readlane_b32 s50, v6, 14
+; CHECK-NEXT: v_readlane_b32 s51, v6, 15
+; CHECK-NEXT: v_readlane_b32 s37, v6, 1
+; CHECK-NEXT: v_readlane_b32 s38, v6, 2
+; CHECK-NEXT: v_readlane_b32 s39, v6, 3
+; CHECK-NEXT: v_readlane_b32 s40, v6, 4
+; CHECK-NEXT: v_readlane_b32 s41, v6, 5
+; CHECK-NEXT: image_sample_lz v0, v[2:3], s[44:51], s[20:23] dmask:0x1
+; CHECK-NEXT: v_readlane_b32 s42, v6, 6
+; CHECK-NEXT: v_readlane_b32 s43, v6, 7
+; CHECK-NEXT: v_mov_b32_e32 v2, 0
+; CHECK-NEXT: s_mov_b64 s[42:43], s[10:11]
+; CHECK-NEXT: v_mov_b32_e32 v3, v2
+; CHECK-NEXT: s_mov_b64 s[40:41], s[8:9]
+; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
+; CHECK-NEXT: s_mov_b64 s[36:37], s[4:5]
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: buffer_store_dwordx3 v[2:4], off, s[8:11], 0
+; CHECK-NEXT: buffer_store_dwordx3 v[1:3], off, s[16:19], 0
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
; CHECK-NEXT: ; implicit-def: $vgpr0
; CHECK-NEXT: .LBB0_6: ; %Flow12
-; CHECK-NEXT: s_or_saveexec_b64 s[4:5], s[22:23]
-; CHECK-NEXT: v_readlane_b32 s52, v7, 40
-; CHECK-NEXT: v_readlane_b32 s53, v7, 41
-; CHECK-NEXT: v_readlane_b32 s54, v7, 42
-; CHECK-NEXT: v_readlane_b32 s55, v7, 43
-; CHECK-NEXT: v_readlane_b32 s56, v7, 44
-; CHECK-NEXT: v_readlane_b32 s57, v7, 45
-; CHECK-NEXT: v_readlane_b32 s58, v7, 46
-; CHECK-NEXT: v_readlane_b32 s59, v7, 47
-; CHECK-NEXT: v_readlane_b32 s60, v7, 48
-; CHECK-NEXT: v_readlane_b32 s61, v7, 49
-; CHECK-NEXT: v_readlane_b32 s62, v7, 50
-; CHECK-NEXT: v_readlane_b32 s63, v7, 51
-; CHECK-NEXT: v_readlane_b32 s64, v7, 52
-; CHECK-NEXT: v_readlane_b32 s65, v7, 53
-; CHECK-NEXT: v_readlane_b32 s66, v7, 54
-; CHECK-NEXT: v_readlane_b32 s67, v7, 55
-; CHECK-NEXT: s_xor_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[14:15]
; CHECK-NEXT: s_cbranch_execz .LBB0_9
; CHECK-NEXT: ; %bb.7: ; %bb33.preheader
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: s_mov_b32 s6, s8
; CHECK-NEXT: s_mov_b32 s7, s8
; CHECK-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-NEXT: v_readlane_b32 s36, v7, 56
; CHECK-NEXT: s_mov_b32 s9, s8
; CHECK-NEXT: s_mov_b32 s10, s8
; CHECK-NEXT: s_mov_b32 s11, s8
; CHECK-NEXT: v_mov_b32_e32 v2, s7
-; CHECK-NEXT: v_readlane_b32 s37, v7, 57
-; CHECK-NEXT: v_readlane_b32 s38, v7, 58
-; CHECK-NEXT: v_readlane_b32 s39, v7, 59
-; CHECK-NEXT: v_readlane_b32 s40, v7, 60
-; CHECK-NEXT: v_readlane_b32 s41, v7, 61
-; CHECK-NEXT: v_readlane_b32 s42, v7, 62
-; CHECK-NEXT: v_readlane_b32 s43, v7, 63
-; CHECK-NEXT: s_nop 4
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: image_sample_lz v3, v[1:2], s[36:43], s[8:11] dmask:0x1
; CHECK-NEXT: image_sample_lz v4, v[1:2], s[52:59], s[8:11] dmask:0x1
-; CHECK-NEXT: ; kill: killed $vgpr1_vgpr2
-; CHECK-NEXT: s_mov_b64 s[12:13], s[36:37]
; CHECK-NEXT: s_and_b64 vcc, exec, 0
-; CHECK-NEXT: v_readlane_b32 s44, v6, 0
-; CHECK-NEXT: v_readlane_b32 s45, v6, 1
-; CHECK-NEXT: v_readlane_b32 s46, v6, 2
-; CHECK-NEXT: v_readlane_b32 s47, v6, 3
-; CHECK-NEXT: v_readlane_b32 s48, v6, 4
-; CHECK-NEXT: v_readlane_b32 s49, v6, 5
-; CHECK-NEXT: v_readlane_b32 s50, v6, 6
-; CHECK-NEXT: v_readlane_b32 s51, v6, 7
-; CHECK-NEXT: s_mov_b64 s[14:15], s[38:39]
-; CHECK-NEXT: s_mov_b64 s[16:17], s[40:41]
-; CHECK-NEXT: s_mov_b64 s[18:19], s[42:43]
-; CHECK-NEXT: ; kill: killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
-; CHECK-NEXT: ; kill: killed $sgpr8_sgpr9_sgpr10 killed $sgpr11
-; CHECK-NEXT: ; kill: killed $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_sub_f32_e32 v1, v4, v3
; CHECK-NEXT: v_mul_f32_e32 v0, v1, v0
@@ -333,45 +200,51 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: .LBB0_9: ; %Flow13
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
; CHECK-NEXT: .LBB0_10: ; %UnifiedReturnBlock
-; CHECK-NEXT: s_or_b64 exec, exec, s[20:21]
-; CHECK-NEXT: v_readlane_b32 s67, v5, 33
-; CHECK-NEXT: v_readlane_b32 s66, v5, 32
-; CHECK-NEXT: v_readlane_b32 s65, v5, 31
-; CHECK-NEXT: v_readlane_b32 s64, v5, 30
-; CHECK-NEXT: v_readlane_b32 s63, v5, 29
-; CHECK-NEXT: v_readlane_b32 s62, v5, 28
-; CHECK-NEXT: v_readlane_b32 s61, v5, 27
-; CHECK-NEXT: v_readlane_b32 s60, v5, 26
-; CHECK-NEXT: v_readlane_b32 s59, v5, 25
-; CHECK-NEXT: v_readlane_b32 s58, v5, 24
-; CHECK-NEXT: v_readlane_b32 s57, v5, 23
-; CHECK-NEXT: v_readlane_b32 s56, v5, 22
-; CHECK-NEXT: v_readlane_b32 s55, v5, 21
-; CHECK-NEXT: v_readlane_b32 s54, v5, 20
-; CHECK-NEXT: v_readlane_b32 s53, v5, 19
-; CHECK-NEXT: v_readlane_b32 s52, v5, 18
-; CHECK-NEXT: v_readlane_b32 s51, v5, 17
-; CHECK-NEXT: v_readlane_b32 s50, v5, 16
-; CHECK-NEXT: v_readlane_b32 s49, v5, 15
-; CHECK-NEXT: v_readlane_b32 s48, v5, 14
-; CHECK-NEXT: v_readlane_b32 s47, v5, 13
-; CHECK-NEXT: v_readlane_b32 s46, v5, 12
-; CHECK-NEXT: v_readlane_b32 s45, v5, 11
-; CHECK-NEXT: v_readlane_b32 s44, v5, 10
-; CHECK-NEXT: v_readlane_b32 s43, v5, 9
-; CHECK-NEXT: v_readlane_b32 s42, v5, 8
-; CHECK-NEXT: v_readlane_b32 s41, v5, 7
-; CHECK-NEXT: v_readlane_b32 s40, v5, 6
-; CHECK-NEXT: v_readlane_b32 s39, v5, 5
-; CHECK-NEXT: v_readlane_b32 s38, v5, 4
-; CHECK-NEXT: v_readlane_b32 s37, v5, 3
-; CHECK-NEXT: v_readlane_b32 s36, v5, 2
+; CHECK-NEXT: s_or_b64 exec, exec, s[12:13]
+; CHECK-NEXT: v_readlane_b32 s71, v5, 39
+; CHECK-NEXT: v_readlane_b32 s70, v5, 38
+; CHECK-NEXT: v_readlane_b32 s69, v5, 37
+; CHECK-NEXT: v_readlane_b32 s68, v5, 36
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_readlane_b32 s67, v5, 35
+; CHECK-NEXT: v_readlane_b32 s66, v5, 34
+; CHECK-NEXT: v_readlane_b32 s65, v5, 33
+; CHECK-NEXT: v_readlane_b32 s64, v5, 32
+; CHECK-NEXT: v_readlane_b32 s63, v5, 31
+; CHECK-NEXT: v_readlane_b32 s62, v5, 30
+; CHECK-NEXT: v_readlane_b32 s61, v5, 29
+; CHECK-NEXT: v_readlane_b32 s60, v5, 28
+; CHECK-NEXT: v_readlane_b32 s59, v5, 27
+; CHECK-NEXT: v_readlane_b32 s58, v5, 26
+; CHECK-NEXT: v_readlane_b32 s57, v5, 25
+; CHECK-NEXT: v_readlane_b32 s56, v5, 24
+; CHECK-NEXT: v_readlane_b32 s55, v5, 23
+; CHECK-NEXT: v_readlane_b32 s54, v5, 22
+; CHECK-NEXT: v_readlane_b32 s53, v5, 21
+; CHECK-NEXT: v_readlane_b32 s52, v5, 20
+; CHECK-NEXT: v_readlane_b32 s51, v5, 19
+; CHECK-NEXT: v_readlane_b32 s50, v5, 18
+; CHECK-NEXT: v_readlane_b32 s49, v5, 17
+; CHECK-NEXT: v_readlane_b32 s48, v5, 16
+; CHECK-NEXT: v_readlane_b32 s47, v5, 15
+; CHECK-NEXT: v_readlane_b32 s46, v5, 14
+; CHECK-NEXT: v_readlane_b32 s45, v5, 13
+; CHECK-NEXT: v_readlane_b32 s44, v5, 12
+; CHECK-NEXT: v_readlane_b32 s43, v5, 11
+; CHECK-NEXT: v_readlane_b32 s42, v5, 10
+; CHECK-NEXT: v_readlane_b32 s41, v5, 9
+; CHECK-NEXT: v_readlane_b32 s40, v5, 8
+; CHECK-NEXT: v_readlane_b32 s39, v5, 7
+; CHECK-NEXT: v_readlane_b32 s38, v5, 6
+; CHECK-NEXT: v_readlane_b32 s37, v5, 5
+; CHECK-NEXT: v_readlane_b32 s36, v5, 4
+; CHECK-NEXT: v_readlane_b32 s35, v5, 3
+; CHECK-NEXT: v_readlane_b32 s34, v5, 2
; CHECK-NEXT: v_readlane_b32 s31, v5, 1
; CHECK-NEXT: v_readlane_b32 s30, v5, 0
; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index dd29970af52fd..6c472123ee766 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -404,20 +404,20 @@ entry:
define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1,
; GFX7-LABEL: idot4_acc8:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, s3
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s10, 0
+; GFX7-NEXT: s_mov_b32 s11, s7
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_mov_b32 s6, -1
+; GFX7-NEXT: buffer_load_ubyte v1, off, s[4:7], 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2
; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8
@@ -433,7 +433,7 @@ define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: idot4_acc8:
@@ -1402,20 +1402,20 @@ entry:
define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
; GFX7-LABEL: idot4_acc32_3ele:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, s3
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s10, 0
+; GFX7-NEXT: s_mov_b32 s11, s7
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8
; GFX7-NEXT: v_bfe_i32 v3, v2, 8, 8
@@ -1423,12 +1423,12 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
; GFX7-NEXT: v_bfe_i32 v4, v0, 0, 8
; GFX7-NEXT: v_bfe_i32 v5, v0, 8, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_i32_i24 v1, v1, v4, s4
+; GFX7-NEXT: v_mad_i32_i24 v1, v1, v4, s0
; GFX7-NEXT: v_bfe_i32 v2, v2, 16, 8
; GFX7-NEXT: v_bfe_i32 v0, v0, 16, 8
; GFX7-NEXT: v_mad_i32_i24 v1, v3, v5, v1
; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, v1
-; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: idot4_acc32_3ele:
@@ -1589,20 +1589,20 @@ entry:
define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
; GFX7-LABEL: idot4_acc32_3ele_permuted:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, s3
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s10, 0
+; GFX7-NEXT: s_mov_b32 s11, s7
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_ashrrev_i32_e32 v1, 24, v2
; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 8
@@ -1610,12 +1610,12 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
; GFX7-NEXT: v_ashrrev_i32_e32 v4, 24, v0
; GFX7-NEXT: v_bfe_i32 v5, v0, 0, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_i32_i24 v1, v1, v4, s4
+; GFX7-NEXT: v_mad_i32_i24 v1, v1, v4, s0
; GFX7-NEXT: v_bfe_i32 v2, v2, 16, 8
; GFX7-NEXT: v_bfe_i32 v0, v0, 16, 8
; GFX7-NEXT: v_mad_i32_i24 v1, v3, v5, v1
; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, v1
-; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: idot4_acc32_3ele_permuted:
@@ -1775,19 +1775,19 @@ entry:
define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
; GFX7-LABEL: idot4_acc32_opt:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, s3
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s10, 0
+; GFX7-NEXT: s_mov_b32 s11, s7
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v3, v2, 8, 8
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8
@@ -1802,7 +1802,7 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
; GFX7-NEXT: v_ashrrev_i32_e32 v0, 24, v0
; GFX7-NEXT: v_mad_i32_i24 v1, v4, v7, v1
; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, v1
-; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: idot4_acc32_opt:
@@ -2572,20 +2572,20 @@ entry:
define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1,
; GFX7-LABEL: idot4_commutative:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xf
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, s3
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xf
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s10, 0
+; GFX7-NEXT: s_mov_b32 s11, s7
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8
; GFX7-NEXT: v_bfe_i32 v3, v2, 8, 8
@@ -2593,12 +2593,12 @@ define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1,
; GFX7-NEXT: v_bfe_i32 v4, v0, 0, 8
; GFX7-NEXT: v_bfe_i32 v5, v0, 8, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_i32_i24 v1, v1, v4, s4
+; GFX7-NEXT: v_mad_i32_i24 v1, v1, v4, s0
; GFX7-NEXT: v_bfe_i32 v0, v0, 16, 8
; GFX7-NEXT: v_bfe_i32 v2, v2, 16, 8
; GFX7-NEXT: v_mad_i32_i24 v1, v3, v5, v1
; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, v1
-; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: idot4_commutative:
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index 3a97724d81fbe..0693119af4b31 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -194,20 +194,20 @@ entry:
define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1,
; GFX7-LABEL: udot4_acc16:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, s3
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s10, 0
+; GFX7-NEXT: s_mov_b32 s11, s7
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_mov_b32 s6, -1
+; GFX7-NEXT: buffer_load_ushort v1, off, s[4:7], 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2
; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8
@@ -223,7 +223,7 @@ define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: udot4_acc16:
@@ -390,20 +390,20 @@ entry:
define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1,
; GFX7-LABEL: udot4_acc8:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, s3
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s10, 0
+; GFX7-NEXT: s_mov_b32 s11, s7
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_mov_b32 s6, -1
+; GFX7-NEXT: buffer_load_ubyte v1, off, s[4:7], 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2
; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8
@@ -419,7 +419,7 @@ define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: udot4_acc8:
@@ -732,20 +732,20 @@ entry:
define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1,
; GFX7-LABEL: udot4_CommutationInsideMAD:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, s3
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s10, 0
+; GFX7-NEXT: s_mov_b32 s11, s7
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_mov_b32 s6, -1
+; GFX7-NEXT: buffer_load_ubyte v1, off, s[4:7], 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2
; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8
@@ -761,7 +761,7 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX7-NEXT: v_mad_u32_u24 v1, v8, v5, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1
-; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: udot4_CommutationInsideMAD:
@@ -910,20 +910,20 @@ entry:
define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1,
; GFX7-LABEL: udot4_CommutationAccrossMADs:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, s3
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s10, 0
+; GFX7-NEXT: s_mov_b32 s11, s7
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_mov_b32 s6, -1
+; GFX7-NEXT: buffer_load_ubyte v1, off, s[4:7], 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8
; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2
@@ -939,7 +939,7 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX7-NEXT: v_mad_u32_u24 v1, v8, v5, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1
-; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: udot4_CommutationAccrossMADs:
@@ -2434,20 +2434,20 @@ entry:
define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
; GFX7-LABEL: udot4_acc8_vecMul:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, s3
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s10, 0
+; GFX7-NEXT: s_mov_b32 s11, s7
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_mov_b32 s6, -1
+; GFX7-NEXT: buffer_load_ubyte v1, off, s[4:7], 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v2
; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 8
@@ -2463,7 +2463,7 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v3, v6, v0
-; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: udot4_acc8_vecMul:
@@ -2840,20 +2840,20 @@ entry:
define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
; GFX7-LABEL: idot4_acc32_3ele:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, s3
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s10, 0
+; GFX7-NEXT: s_mov_b32 s11, s7
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8
@@ -2861,12 +2861,12 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0
; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v1, v4, s4
+; GFX7-NEXT: v_mad_u32_u24 v1, v1, v4, s0
; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8
; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8
; GFX7-NEXT: v_mad_u32_u24 v1, v3, v5, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: idot4_acc32_3ele:
@@ -3025,20 +3025,20 @@ entry:
define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
; GFX7-LABEL: idot4_acc32_3ele_permuted:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, s3
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s10, 0
+; GFX7-NEXT: s_mov_b32 s11, s7
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v2
; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2
@@ -3046,12 +3046,12 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0
; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v1, v4, s4
+; GFX7-NEXT: v_mad_u32_u24 v1, v1, v4, s0
; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8
; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8
; GFX7-NEXT: v_mad_u32_u24 v1, v3, v5, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: idot4_acc32_3ele_permuted:
@@ -3211,19 +3211,19 @@ entry:
define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
; GFX7-LABEL: idot4_acc32_opt:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, s3
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s10, 0
+; GFX7-NEXT: s_mov_b32 s11, s7
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8
; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
@@ -3238,7 +3238,7 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: idot4_acc32_opt:
@@ -4008,20 +4008,20 @@ entry:
define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1,
; GFX7-LABEL: udot4_commutative:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xf
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, s3
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xf
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s10, 0
+; GFX7-NEXT: s_mov_b32 s11, s7
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8
@@ -4029,12 +4029,12 @@ define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1,
; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0
; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v1, v4, s4
+; GFX7-NEXT: v_mad_u32_u24 v1, v1, v4, s0
; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8
; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8
; GFX7-NEXT: v_mad_u32_u24 v1, v3, v5, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: udot4_commutative:
@@ -5080,19 +5080,19 @@ entry:
define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1,
; GFX7-LABEL: idot4_acc32_lohi:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, s3
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s10, 0
+; GFX7-NEXT: s_mov_b32 s11, s7
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4
-; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 offset:4
+; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8
; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
@@ -5107,7 +5107,7 @@ define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1,
; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: idot4_acc32_lohi:
@@ -5266,19 +5266,19 @@ entry:
define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1,
; GFX7-LABEL: idot4_acc32_hihi:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, s3
+; GFX7-NEXT: s_mov_b32 s10, 0
+; GFX7-NEXT: s_mov_b32 s11, s7
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 offset:4
-; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4
-; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
+; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 offset:4
+; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_u32 v3, v2, 16, 8
; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
@@ -5293,7 +5293,7 @@ define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1,
; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8
; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: idot4_acc32_hihi:
@@ -5610,22 +5610,22 @@ entry:
define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1,
; GFX7-LABEL: idot4_acc32_v16i8:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, s3
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s10, 0
+; GFX7-NEXT: s_mov_b32 s11, s7
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v0
; GFX7-NEXT: v_mov_b32_e32 v2, 0
-; GFX7-NEXT: s_mov_b64 s[8:9], s[10:11]
-; GFX7-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX7-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX7-NEXT: s_mov_b64 s[2:3], s[10:11]
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[1:2], s[4:7], 0 addr64
-; GFX7-NEXT: buffer_load_dword v0, v[4:5], s[8:11], 0 addr64
-; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[1:2], s[8:11], 0 addr64
+; GFX7-NEXT: buffer_load_dword v0, v[4:5], s[0:3], 0 addr64
+; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8
@@ -5640,7 +5640,7 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX7-NEXT: v_mad_u32_u24 v1, v6, v7, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v3, v0, v1
-; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: idot4_acc32_v16i8:
@@ -5809,21 +5809,21 @@ entry:
define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1,
; GFX7-LABEL: idot4_acc32_v256i8:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, s3
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s10, 0
+; GFX7-NEXT: s_mov_b32 s11, s7
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v0
; GFX7-NEXT: v_mov_b32_e32 v2, 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
-; GFX7-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GFX7-NEXT: s_mov_b64 s[2:3], s[10:11]
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_load_dword v0, v[1:2], s[8:11], 0 addr64 offset:252
-; GFX7-NEXT: buffer_load_dword v1, v[3:4], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: buffer_load_dword v0, v[1:2], s[0:3], 0 addr64 offset:252
+; GFX7-NEXT: buffer_load_dword v1, v[3:4], s[8:11], 0 addr64
+; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_u32 v4, v0, 16, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -5838,7 +5838,7 @@ define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v1
; GFX7-NEXT: v_mad_u32_u24 v2, v6, v7, v2
; GFX7-NEXT: v_mad_u32_u24 v0, v0, v1, v2
-; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: idot4_acc32_v256i8:
diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll
index 069bebdf3c469..50f0a39802270 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll
@@ -2555,19 +2555,19 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1,
;
; GFX9-LABEL: udot8_acc8_vecMul:
; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-NEXT: global_load_ubyte v4, v3, s[6:7]
; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX9-NEXT: s_mov_b32 s14, -1
; GFX9-NEXT: s_mov_b32 s15, 0xe00000
; GFX9-NEXT: s_add_u32 s12, s12, s11
-; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[8:9]
-; GFX9-NEXT: global_load_dword v2, v0, s[10:11]
-; GFX9-NEXT: global_load_ubyte v4, v3, s[0:1]
; GFX9-NEXT: s_addc_u32 s13, s13, 0
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_bfe_u32 v0, v1, 4, 4
@@ -2615,24 +2615,24 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX9-NEXT: v_add_u16_e32 v0, v0, v8
; GFX9-NEXT: v_mad_legacy_u16 v0, v9, v16, v0
; GFX9-NEXT: v_add_u16_e32 v0, v0, v7
-; GFX9-NEXT: global_store_byte v3, v0, s[0:1]
+; GFX9-NEXT: global_store_byte v3, v0, s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot8_acc8_vecMul:
; GFX9-DL: ; %bb.0: ; %entry
+; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_ubyte v4, v3, s[6:7]
; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT: s_mov_b32 s14, -1
; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000
; GFX9-DL-NEXT: s_add_u32 s12, s12, s11
-; GFX9-DL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
-; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[8:9]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[10:11]
-; GFX9-DL-NEXT: global_load_ubyte v4, v3, s[0:1]
; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
; GFX9-DL-NEXT: v_bfe_u32 v0, v1, 4, 4
@@ -2680,7 +2680,7 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v8
; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v9, v16, v0
; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v7
-; GFX9-DL-NEXT: global_store_byte v3, v0, s[0:1]
+; GFX9-DL-NEXT: global_store_byte v3, v0, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot8_acc8_vecMul:
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index 74020c43a3ca3..0adae5e50cda7 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -8043,29 +8043,29 @@ define amdgpu_kernel void @extract_largest_inbounds_offset(ptr addrspace(1) %out
;
; SI-MOVREL-LABEL: extract_largest_inbounds_offset:
; SI-MOVREL: ; %bb.0: ; %entry
-; SI-MOVREL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
+; SI-MOVREL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-MOVREL-NEXT: s_load_dword s12, s[4:5], 0xd
-; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
-; SI-MOVREL-NEXT: s_mov_b32 s2, -1
-; SI-MOVREL-NEXT: s_mov_b32 s6, s2
+; SI-MOVREL-NEXT: s_mov_b32 s7, 0xf000
+; SI-MOVREL-NEXT: s_mov_b32 s6, -1
+; SI-MOVREL-NEXT: s_mov_b32 s10, s6
; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
-; SI-MOVREL-NEXT: s_mov_b32 s4, s10
-; SI-MOVREL-NEXT: s_mov_b32 s5, s11
-; SI-MOVREL-NEXT: s_mov_b32 s7, s3
-; SI-MOVREL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 glc
+; SI-MOVREL-NEXT: s_mov_b32 s8, s2
+; SI-MOVREL-NEXT: s_mov_b32 s9, s3
+; SI-MOVREL-NEXT: s_mov_b32 s11, s7
+; SI-MOVREL-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 glc
; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
-; SI-MOVREL-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 glc
+; SI-MOVREL-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 glc
; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
-; SI-MOVREL-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 glc
+; SI-MOVREL-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 glc
; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
-; SI-MOVREL-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 glc
+; SI-MOVREL-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 glc
; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT: s_add_i32 s12, s12, 15
; SI-MOVREL-NEXT: s_mov_b32 m0, s12
-; SI-MOVREL-NEXT: s_mov_b32 s0, s8
-; SI-MOVREL-NEXT: s_mov_b32 s1, s9
+; SI-MOVREL-NEXT: s_mov_b32 s4, s0
+; SI-MOVREL-NEXT: s_mov_b32 s5, s1
; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0
-; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-MOVREL-NEXT: s_endpgm
;
; VI-MOVREL-LABEL: extract_largest_inbounds_offset:
@@ -8307,29 +8307,29 @@ define amdgpu_kernel void @extract_out_of_bounds_offset(ptr addrspace(1) %out, p
;
; SI-MOVREL-LABEL: extract_out_of_bounds_offset:
; SI-MOVREL: ; %bb.0: ; %entry
-; SI-MOVREL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
+; SI-MOVREL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-MOVREL-NEXT: s_load_dword s12, s[4:5], 0xd
-; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
-; SI-MOVREL-NEXT: s_mov_b32 s2, -1
-; SI-MOVREL-NEXT: s_mov_b32 s6, s2
+; SI-MOVREL-NEXT: s_mov_b32 s7, 0xf000
+; SI-MOVREL-NEXT: s_mov_b32 s6, -1
+; SI-MOVREL-NEXT: s_mov_b32 s10, s6
; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
-; SI-MOVREL-NEXT: s_mov_b32 s4, s10
-; SI-MOVREL-NEXT: s_mov_b32 s5, s11
-; SI-MOVREL-NEXT: s_mov_b32 s7, s3
-; SI-MOVREL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 glc
+; SI-MOVREL-NEXT: s_mov_b32 s8, s2
+; SI-MOVREL-NEXT: s_mov_b32 s9, s3
+; SI-MOVREL-NEXT: s_mov_b32 s11, s7
+; SI-MOVREL-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 glc
; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
-; SI-MOVREL-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 glc
+; SI-MOVREL-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 glc
; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
-; SI-MOVREL-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 glc
+; SI-MOVREL-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 glc
; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
-; SI-MOVREL-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 glc
+; SI-MOVREL-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 glc
; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
; SI-MOVREL-NEXT: s_add_i32 s12, s12, 16
; SI-MOVREL-NEXT: s_mov_b32 m0, s12
-; SI-MOVREL-NEXT: s_mov_b32 s0, s8
-; SI-MOVREL-NEXT: s_mov_b32 s1, s9
+; SI-MOVREL-NEXT: s_mov_b32 s4, s0
+; SI-MOVREL-NEXT: s_mov_b32 s5, s1
; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0
-; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-MOVREL-NEXT: s_endpgm
;
; VI-MOVREL-LABEL: extract_out_of_bounds_offset:
@@ -8572,29 +8572,29 @@ define amdgpu_kernel void @extractelement_v16i32_or_index(ptr addrspace(1) %out,
;
; SI-MOVREL-LABEL: extractelement_v16i32_or_index:
; SI-MOVREL: ; %bb.0: ; %entry
-; SI-MOVREL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
+; SI-MOVREL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-MOVREL-NEXT: s_load_dword s12, s[4:5], 0xd
-; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
-; SI-MOVREL-NEXT: s_mov_b32 s2, -1
-; SI-MOVREL-NEXT: s_mov_b32 s6, s2
+; SI-MOVREL-NEXT: s_mov_b32 s7, 0xf000
+; SI-MOVREL-NEXT: s_mov_b32 s6, -1
+; SI-MOVREL-NEXT: s_mov_b32 s10, s6
; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
-; SI-MOVREL-NEXT: s_mov_b32 s4, s10
-; SI-MOVREL-NEXT: s_mov_b32 s5, s11
-; SI-MOVREL-NEXT: s_mov_b32 s7, s3
-; SI-MOVREL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 glc
+; SI-MOVREL-NEXT: s_mov_b32 s8, s2
+; SI-MOVREL-NEXT: s_mov_b32 s9, s3
+; SI-MOVREL-NEXT: s_mov_b32 s11, s7
+; SI-MOVREL-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 glc
; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
-; SI-MOVREL-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 glc
+; SI-MOVREL-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 glc
; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
-; SI-MOVREL-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 glc
+; SI-MOVREL-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 glc
; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
-; SI-MOVREL-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 glc
+; SI-MOVREL-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 glc
; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
-; SI-MOVREL-NEXT: s_lshl_b32 s4, s12, 2
-; SI-MOVREL-NEXT: s_mov_b32 m0, s4
-; SI-MOVREL-NEXT: s_mov_b32 s0, s8
-; SI-MOVREL-NEXT: s_mov_b32 s1, s9
+; SI-MOVREL-NEXT: s_mov_b32 s4, s0
+; SI-MOVREL-NEXT: s_lshl_b32 s0, s12, 2
+; SI-MOVREL-NEXT: s_mov_b32 m0, s0
+; SI-MOVREL-NEXT: s_mov_b32 s5, s1
; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v1
-; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-MOVREL-NEXT: s_endpgm
;
; VI-MOVREL-LABEL: extractelement_v16i32_or_index:
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 72cda5c718f5b..f213b0635c8ac 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -677,25 +677,25 @@ define amdgpu_kernel void @dynamic_insertelement_v8f32(ptr addrspace(1) %out, <8
;
; VI-LABEL: dynamic_insertelement_v8f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x20
-; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_load_dword s4, s[8:9], 0x40
+; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x20
+; VI-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0
+; VI-NEXT: s_load_dword s8, s[8:9], 0x40
; VI-NEXT: v_mov_b32_e32 v8, 0x40a00000
-; VI-NEXT: s_mov_b32 s3, 0x1100f000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s15, 0x1100f000
+; VI-NEXT: s_mov_b32 s14, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s12
-; VI-NEXT: v_mov_b32_e32 v1, s13
-; VI-NEXT: v_mov_b32_e32 v2, s14
-; VI-NEXT: v_mov_b32_e32 v3, s15
-; VI-NEXT: v_mov_b32_e32 v4, s16
-; VI-NEXT: v_mov_b32_e32 v5, s17
-; VI-NEXT: v_mov_b32_e32 v6, s18
-; VI-NEXT: v_mov_b32_e32 v7, s19
-; VI-NEXT: s_mov_b32 m0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v6, s6
+; VI-NEXT: v_mov_b32_e32 v7, s7
+; VI-NEXT: s_mov_b32 m0, s8
; VI-NEXT: v_movreld_b32_e32 v0, v8
-; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; VI-NEXT: s_endpgm
%vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b
store <8 x float> %vecins, ptr addrspace(1) %out, align 32
@@ -705,54 +705,55 @@ define amdgpu_kernel void @dynamic_insertelement_v8f32(ptr addrspace(1) %out, <8
define amdgpu_kernel void @dynamic_insertelement_v9f32(ptr addrspace(1) %out, <9 x float> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v9f32:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; SI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10
-; SI-NEXT: s_load_dword s4, s[8:9], 0x18
-; SI-NEXT: s_load_dword s5, s[8:9], 0x20
+; SI-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0
+; SI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10
+; SI-NEXT: s_load_dword s10, s[8:9], 0x18
+; SI-NEXT: s_load_dword s8, s[8:9], 0x20
; SI-NEXT: v_mov_b32_e32 v9, 0x40a00000
-; SI-NEXT: s_mov_b32 s3, 0x100f000
+; SI-NEXT: s_mov_b32 s15, 0x100f000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s12
-; SI-NEXT: v_mov_b32_e32 v1, s13
-; SI-NEXT: v_mov_b32_e32 v2, s14
-; SI-NEXT: v_mov_b32_e32 v3, s15
-; SI-NEXT: v_mov_b32_e32 v4, s16
-; SI-NEXT: v_mov_b32_e32 v5, s17
-; SI-NEXT: v_mov_b32_e32 v6, s18
-; SI-NEXT: v_mov_b32_e32 v7, s19
-; SI-NEXT: v_mov_b32_e32 v8, s4
-; SI-NEXT: s_mov_b32 m0, s5
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: v_mov_b32_e32 v2, s2
+; SI-NEXT: v_mov_b32_e32 v3, s3
+; SI-NEXT: v_mov_b32_e32 v4, s4
+; SI-NEXT: v_mov_b32_e32 v5, s5
+; SI-NEXT: v_mov_b32_e32 v6, s6
+; SI-NEXT: v_mov_b32_e32 v7, s7
+; SI-NEXT: v_mov_b32_e32 v8, s10
+; SI-NEXT: s_mov_b32 m0, s8
+; SI-NEXT: s_mov_b32 s14, -1
; SI-NEXT: v_movreld_b32_e32 v0, v9
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32
-; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:32
+; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v9f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x40
-; VI-NEXT: s_load_dword s4, s[8:9], 0x60
-; VI-NEXT: s_load_dword s5, s[8:9], 0x80
-; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x40
+; VI-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0
; VI-NEXT: v_mov_b32_e32 v9, 0x40a00000
+; VI-NEXT: s_mov_b32 s15, 0x1100f000
+; VI-NEXT: s_mov_b32 s14, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s12
-; VI-NEXT: v_mov_b32_e32 v1, s13
-; VI-NEXT: v_mov_b32_e32 v2, s14
-; VI-NEXT: v_mov_b32_e32 v3, s15
-; VI-NEXT: v_mov_b32_e32 v4, s16
-; VI-NEXT: v_mov_b32_e32 v5, s17
-; VI-NEXT: v_mov_b32_e32 v6, s18
-; VI-NEXT: v_mov_b32_e32 v7, s19
-; VI-NEXT: v_mov_b32_e32 v8, s4
-; VI-NEXT: s_mov_b32 m0, s5
-; VI-NEXT: s_mov_b32 s3, 0x1100f000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_load_dword s0, s[8:9], 0x60
+; VI-NEXT: s_load_dword s1, s[8:9], 0x80
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v6, s6
+; VI-NEXT: v_mov_b32_e32 v7, s7
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v8, s0
+; VI-NEXT: s_mov_b32 m0, s1
; VI-NEXT: v_movreld_b32_e32 v0, v9
-; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32
-; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:32
+; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; VI-NEXT: s_endpgm
%vecins = insertelement <9 x float> %a, float 5.000000e+00, i32 %b
store <9 x float> %vecins, ptr addrspace(1) %out, align 32
@@ -762,29 +763,29 @@ define amdgpu_kernel void @dynamic_insertelement_v9f32(ptr addrspace(1) %out, <9
define amdgpu_kernel void @dynamic_insertelement_v10f32(ptr addrspace(1) %out, <10 x float> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v10f32:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; SI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10
-; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18
-; SI-NEXT: s_load_dword s6, s[8:9], 0x20
+; SI-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0
+; SI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10
+; SI-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x18
+; SI-NEXT: s_load_dword s8, s[8:9], 0x20
; SI-NEXT: v_mov_b32_e32 v10, 0x40a00000
-; SI-NEXT: s_mov_b32 s3, 0x100f000
+; SI-NEXT: s_mov_b32 s15, 0x100f000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s12
-; SI-NEXT: v_mov_b32_e32 v1, s13
-; SI-NEXT: v_mov_b32_e32 v2, s14
-; SI-NEXT: v_mov_b32_e32 v3, s15
-; SI-NEXT: v_mov_b32_e32 v4, s16
-; SI-NEXT: v_mov_b32_e32 v5, s17
-; SI-NEXT: v_mov_b32_e32 v6, s18
-; SI-NEXT: v_mov_b32_e32 v7, s19
-; SI-NEXT: v_mov_b32_e32 v8, s4
-; SI-NEXT: v_mov_b32_e32 v9, s5
-; SI-NEXT: s_mov_b32 m0, s6
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: v_mov_b32_e32 v2, s2
+; SI-NEXT: v_mov_b32_e32 v3, s3
+; SI-NEXT: v_mov_b32_e32 v4, s4
+; SI-NEXT: v_mov_b32_e32 v5, s5
+; SI-NEXT: v_mov_b32_e32 v6, s6
+; SI-NEXT: v_mov_b32_e32 v7, s7
+; SI-NEXT: v_mov_b32_e32 v8, s10
+; SI-NEXT: v_mov_b32_e32 v9, s11
+; SI-NEXT: s_mov_b32 m0, s8
+; SI-NEXT: s_mov_b32 s14, -1
; SI-NEXT: v_movreld_b32_e32 v0, v10
-; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-; SI-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 offset:32
+; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
+; SI-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v10f32:
@@ -1175,24 +1176,24 @@ define amdgpu_kernel void @dynamic_insertelement_v8i32(ptr addrspace(1) %out, <8
;
; VI-LABEL: dynamic_insertelement_v8i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x20
-; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_load_dword s4, s[8:9], 0x40
-; VI-NEXT: s_mov_b32 s3, 0x1100f000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x20
+; VI-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0
+; VI-NEXT: s_load_dword s8, s[8:9], 0x40
+; VI-NEXT: s_mov_b32 s15, 0x1100f000
+; VI-NEXT: s_mov_b32 s14, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s12
-; VI-NEXT: v_mov_b32_e32 v1, s13
-; VI-NEXT: v_mov_b32_e32 v2, s14
-; VI-NEXT: v_mov_b32_e32 v3, s15
-; VI-NEXT: v_mov_b32_e32 v4, s16
-; VI-NEXT: v_mov_b32_e32 v5, s17
-; VI-NEXT: v_mov_b32_e32 v6, s18
-; VI-NEXT: v_mov_b32_e32 v7, s19
-; VI-NEXT: s_mov_b32 m0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v6, s6
+; VI-NEXT: v_mov_b32_e32 v7, s7
+; VI-NEXT: s_mov_b32 m0, s8
; VI-NEXT: v_movreld_b32_e32 v0, 5
-; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; VI-NEXT: s_endpgm
%vecins = insertelement <8 x i32> %a, i32 5, i32 %b
store <8 x i32> %vecins, ptr addrspace(1) %out, align 32
@@ -1202,52 +1203,53 @@ define amdgpu_kernel void @dynamic_insertelement_v8i32(ptr addrspace(1) %out, <8
define amdgpu_kernel void @dynamic_insertelement_v9i32(ptr addrspace(1) %out, <9 x i32> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v9i32:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; SI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10
-; SI-NEXT: s_load_dword s4, s[8:9], 0x18
-; SI-NEXT: s_load_dword s5, s[8:9], 0x20
-; SI-NEXT: s_mov_b32 s3, 0x100f000
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0
+; SI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10
+; SI-NEXT: s_load_dword s10, s[8:9], 0x18
+; SI-NEXT: s_load_dword s8, s[8:9], 0x20
+; SI-NEXT: s_mov_b32 s15, 0x100f000
+; SI-NEXT: s_mov_b32 s14, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s12
-; SI-NEXT: v_mov_b32_e32 v1, s13
-; SI-NEXT: v_mov_b32_e32 v2, s14
-; SI-NEXT: v_mov_b32_e32 v3, s15
-; SI-NEXT: v_mov_b32_e32 v4, s16
-; SI-NEXT: v_mov_b32_e32 v5, s17
-; SI-NEXT: v_mov_b32_e32 v6, s18
-; SI-NEXT: v_mov_b32_e32 v7, s19
-; SI-NEXT: v_mov_b32_e32 v8, s4
-; SI-NEXT: s_mov_b32 m0, s5
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: v_mov_b32_e32 v2, s2
+; SI-NEXT: v_mov_b32_e32 v3, s3
+; SI-NEXT: v_mov_b32_e32 v4, s4
+; SI-NEXT: v_mov_b32_e32 v5, s5
+; SI-NEXT: v_mov_b32_e32 v6, s6
+; SI-NEXT: v_mov_b32_e32 v7, s7
+; SI-NEXT: v_mov_b32_e32 v8, s10
+; SI-NEXT: s_mov_b32 m0, s8
; SI-NEXT: v_movreld_b32_e32 v0, 5
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32
-; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:32
+; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v9i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x40
-; VI-NEXT: s_load_dword s4, s[8:9], 0x60
-; VI-NEXT: s_load_dword s5, s[8:9], 0x80
-; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_mov_b32 s3, 0x1100f000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x40
+; VI-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0
+; VI-NEXT: s_mov_b32 s15, 0x1100f000
+; VI-NEXT: s_mov_b32 s14, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s12
-; VI-NEXT: v_mov_b32_e32 v1, s13
-; VI-NEXT: v_mov_b32_e32 v2, s14
-; VI-NEXT: v_mov_b32_e32 v3, s15
-; VI-NEXT: v_mov_b32_e32 v4, s16
-; VI-NEXT: v_mov_b32_e32 v5, s17
-; VI-NEXT: v_mov_b32_e32 v6, s18
-; VI-NEXT: v_mov_b32_e32 v7, s19
-; VI-NEXT: v_mov_b32_e32 v8, s4
-; VI-NEXT: s_mov_b32 m0, s5
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_load_dword s0, s[8:9], 0x60
+; VI-NEXT: s_load_dword s1, s[8:9], 0x80
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v6, s6
+; VI-NEXT: v_mov_b32_e32 v7, s7
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v8, s0
+; VI-NEXT: s_mov_b32 m0, s1
; VI-NEXT: v_movreld_b32_e32 v0, 5
-; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32
-; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:32
+; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; VI-NEXT: s_endpgm
%vecins = insertelement <9 x i32> %a, i32 5, i32 %b
store <9 x i32> %vecins, ptr addrspace(1) %out, align 32
@@ -1257,28 +1259,28 @@ define amdgpu_kernel void @dynamic_insertelement_v9i32(ptr addrspace(1) %out, <9
define amdgpu_kernel void @dynamic_insertelement_v10i32(ptr addrspace(1) %out, <10 x i32> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v10i32:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; SI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10
-; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18
-; SI-NEXT: s_load_dword s6, s[8:9], 0x20
-; SI-NEXT: s_mov_b32 s3, 0x100f000
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0
+; SI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10
+; SI-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x18
+; SI-NEXT: s_load_dword s8, s[8:9], 0x20
+; SI-NEXT: s_mov_b32 s15, 0x100f000
+; SI-NEXT: s_mov_b32 s14, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s12
-; SI-NEXT: v_mov_b32_e32 v1, s13
-; SI-NEXT: v_mov_b32_e32 v2, s14
-; SI-NEXT: v_mov_b32_e32 v3, s15
-; SI-NEXT: v_mov_b32_e32 v4, s16
-; SI-NEXT: v_mov_b32_e32 v5, s17
-; SI-NEXT: v_mov_b32_e32 v6, s18
-; SI-NEXT: v_mov_b32_e32 v7, s19
-; SI-NEXT: v_mov_b32_e32 v8, s4
-; SI-NEXT: v_mov_b32_e32 v9, s5
-; SI-NEXT: s_mov_b32 m0, s6
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: v_mov_b32_e32 v2, s2
+; SI-NEXT: v_mov_b32_e32 v3, s3
+; SI-NEXT: v_mov_b32_e32 v4, s4
+; SI-NEXT: v_mov_b32_e32 v5, s5
+; SI-NEXT: v_mov_b32_e32 v6, s6
+; SI-NEXT: v_mov_b32_e32 v7, s7
+; SI-NEXT: v_mov_b32_e32 v8, s10
+; SI-NEXT: v_mov_b32_e32 v9, s11
+; SI-NEXT: s_mov_b32 m0, s8
; SI-NEXT: v_movreld_b32_e32 v0, 5
-; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-; SI-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 offset:32
+; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
+; SI-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v10i32:
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index 121891adef182..f4dba49d4bcdf 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -4577,10 +4577,9 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32,
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xb
; SI-NEXT: s_load_dword s9, s[4:5], 0xf
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x11
-; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: s_mov_b32 s1, s0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -4716,8 +4715,7 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
; SI-NEXT: buffer_load_ubyte v6, off, s[4:7], 0 offset:51
; SI-NEXT: buffer_load_ubyte v7, off, s[4:7], 0 offset:52
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:53
-; SI-NEXT: s_mov_b32 s4, 0
-; SI-NEXT: s_mov_b32 s5, s4
+; SI-NEXT: s_mov_b64 s[4:5], 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
@@ -4895,10 +4893,9 @@ define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8,
; SI-NEXT: s_load_dword s13, s[4:5], 0xf
; SI-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x11
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x15
-; SI-NEXT: s_mov_b32 s4, 0
+; SI-NEXT: s_mov_b64 s[4:5], 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s5, s4
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
index 9377da89a6b05..37f335561a52c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
@@ -625,102 +625,102 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad
; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_add_u32_e32 v1, s0, v0
-; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112
-; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:96
-; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80
-; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:64
-; GCN-NEXT: ds_read_b128 a[0:3], v1
-; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:16
-; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:32
-; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:48
-; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:8304
-; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:8288
-; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:8272
-; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:8256
-; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:8240
-; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:8224
-; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:8208
-; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:8192
+; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:112
+; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:96
+; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:80
+; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:64
+; GCN-NEXT: ds_read_b128 a[128:131], v1
+; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:16
+; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:32
+; GCN-NEXT: ds_read_b128 a[140:143], v1 offset:48
+; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:8304
+; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:8288
+; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:8272
+; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:8256
+; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:8240
+; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:8224
+; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:8208
+; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:8192
; GCN-NEXT: v_add_u32_e32 v2, 0x6000, v1
-; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:24688
-; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:24672
-; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:24656
-; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:24640
-; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:24624
-; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:24608
-; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:24592
-; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:24576
-; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:49264
-; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:49248
-; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:49232
-; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:49216
-; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:49200
-; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:49184
-; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:49168
-; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:49152
+; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:24688
+; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:24672
+; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:24656
+; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:24640
+; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:24624
+; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:24608
+; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:24592
+; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:24576
+; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:49264
+; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:49248
+; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:49232
+; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:49216
+; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:49200
+; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:49184
+; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:49168
+; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:49152
; GCN-NEXT: v_mov_b32_e32 v1, 1.0
-; GCN-NEXT: ds_read_b128 a[156:159], v2 offset:57456
-; GCN-NEXT: ds_read_b128 a[152:155], v2 offset:57440
-; GCN-NEXT: ds_read_b128 a[148:151], v2 offset:57424
-; GCN-NEXT: ds_read_b128 a[144:147], v2 offset:57408
-; GCN-NEXT: ds_read_b128 a[128:131], v2 offset:57344
-; GCN-NEXT: ds_read_b128 a[132:135], v2 offset:57360
-; GCN-NEXT: ds_read_b128 a[136:139], v2 offset:57376
-; GCN-NEXT: ds_read_b128 a[140:143], v2 offset:57392
+; GCN-NEXT: ds_read_b128 a[60:63], v2 offset:57456
+; GCN-NEXT: ds_read_b128 a[56:59], v2 offset:57440
+; GCN-NEXT: ds_read_b128 a[52:55], v2 offset:57424
+; GCN-NEXT: ds_read_b128 a[48:51], v2 offset:57408
+; GCN-NEXT: ds_read_b128 a[32:35], v2 offset:57344
+; GCN-NEXT: ds_read_b128 a[36:39], v2 offset:57360
+; GCN-NEXT: ds_read_b128 a[40:43], v2 offset:57376
+; GCN-NEXT: ds_read_b128 a[44:47], v2 offset:57392
; GCN-NEXT: v_mov_b32_e32 v2, 2.0
; GCN-NEXT: v_add_u32_e32 v0, s1, v0
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0)
; GCN-NEXT: s_waitcnt lgkmcnt(14)
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v1, v2, a[128:159]
; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63]
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95]
-; GCN-NEXT: s_waitcnt lgkmcnt(8)
; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v1, v2, a[96:127]
+; GCN-NEXT: s_waitcnt lgkmcnt(8)
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v1, v2, a[128:159]
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 4
-; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112
-; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96
-; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:80
-; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:64
-; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:48
-; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32
-; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16
-; GCN-NEXT: ds_write_b128 v0, a[0:3]
+; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:112
+; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:96
+; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:80
+; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:64
+; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:48
+; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:32
+; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:16
+; GCN-NEXT: ds_write_b128 v0, a[128:131]
; GCN-NEXT: v_mov_b32_e32 v0, s1
-; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:8288
-; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:8304
-; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:8256
-; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:8272
-; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:8224
-; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:8240
-; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:8192
-; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:8208
-; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:16480
-; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:16496
-; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:16448
-; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:16464
-; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:16416
-; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:16432
-; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:16384
-; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:16400
-; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:24672
-; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:24688
-; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:24640
-; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:24656
-; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:24608
-; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:24624
-; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:24576
-; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:24592
-; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:32864
-; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:32880
-; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:32832
-; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:32848
-; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:32800
-; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:32816
-; GCN-NEXT: ds_write_b128 v0, a[128:131] offset:32768
-; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:32784
+; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:8288
+; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:8304
+; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:8256
+; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:8272
+; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:8224
+; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:8240
+; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:8192
+; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:8208
+; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:16480
+; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:16496
+; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:16448
+; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:16464
+; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:16416
+; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:16432
+; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:16384
+; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:16400
+; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:24672
+; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:24688
+; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:24640
+; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:24656
+; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:24608
+; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:24624
+; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:24576
+; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:24592
+; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:32864
+; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:32880
+; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:32832
+; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:32848
+; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:32800
+; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:32816
+; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:32768
+; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:32784
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(5) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(40) SyncID(0)
; GCN-NEXT: s_endpgm
@@ -732,102 +732,102 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad
; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x1ff80, v0
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s0, v0
-; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:112
-; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:96
-; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:80
-; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:64
-; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1
-; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:16
-; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:32
-; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:48
-; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v1 offset:8304
-; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v1 offset:8288
-; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v1 offset:8272
-; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v1 offset:8256
-; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v1 offset:8240
-; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v1 offset:8224
-; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v1 offset:8208
-; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v1 offset:8192
+; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v1 offset:112
+; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v1 offset:96
+; EXACTCUTOFF-NEXT: ds_read_b128 a[148:151], v1 offset:80
+; EXACTCUTOFF-NEXT: ds_read_b128 a[144:147], v1 offset:64
+; EXACTCUTOFF-NEXT: ds_read_b128 a[128:131], v1
+; EXACTCUTOFF-NEXT: ds_read_b128 a[132:135], v1 offset:16
+; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v1 offset:32
+; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v1 offset:48
+; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:8304
+; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:8288
+; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:8272
+; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:8256
+; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:8240
+; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:8224
+; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:8208
+; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:8192
; EXACTCUTOFF-NEXT: v_add_u32_e32 v2, 0x6000, v1
-; EXACTCUTOFF-NEXT: ds_read_b128 a[92:95], v1 offset:24688
-; EXACTCUTOFF-NEXT: ds_read_b128 a[88:91], v1 offset:24672
-; EXACTCUTOFF-NEXT: ds_read_b128 a[84:87], v1 offset:24656
-; EXACTCUTOFF-NEXT: ds_read_b128 a[80:83], v1 offset:24640
-; EXACTCUTOFF-NEXT: ds_read_b128 a[76:79], v1 offset:24624
-; EXACTCUTOFF-NEXT: ds_read_b128 a[72:75], v1 offset:24608
-; EXACTCUTOFF-NEXT: ds_read_b128 a[68:71], v1 offset:24592
-; EXACTCUTOFF-NEXT: ds_read_b128 a[64:67], v1 offset:24576
-; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v1 offset:49264
-; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v1 offset:49248
-; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v1 offset:49232
-; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v1 offset:49216
-; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v1 offset:49200
-; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v1 offset:49184
-; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v1 offset:49168
-; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v1 offset:49152
+; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v1 offset:24688
+; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v1 offset:24672
+; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v1 offset:24656
+; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v1 offset:24640
+; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v1 offset:24624
+; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v1 offset:24608
+; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v1 offset:24592
+; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v1 offset:24576
+; EXACTCUTOFF-NEXT: ds_read_b128 a[92:95], v1 offset:49264
+; EXACTCUTOFF-NEXT: ds_read_b128 a[88:91], v1 offset:49248
+; EXACTCUTOFF-NEXT: ds_read_b128 a[84:87], v1 offset:49232
+; EXACTCUTOFF-NEXT: ds_read_b128 a[80:83], v1 offset:49216
+; EXACTCUTOFF-NEXT: ds_read_b128 a[76:79], v1 offset:49200
+; EXACTCUTOFF-NEXT: ds_read_b128 a[72:75], v1 offset:49184
+; EXACTCUTOFF-NEXT: ds_read_b128 a[68:71], v1 offset:49168
+; EXACTCUTOFF-NEXT: ds_read_b128 a[64:67], v1 offset:49152
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v1, 1.0
-; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v2 offset:57456
-; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v2 offset:57440
-; EXACTCUTOFF-NEXT: ds_read_b128 a[148:151], v2 offset:57424
-; EXACTCUTOFF-NEXT: ds_read_b128 a[144:147], v2 offset:57408
-; EXACTCUTOFF-NEXT: ds_read_b128 a[128:131], v2 offset:57344
-; EXACTCUTOFF-NEXT: ds_read_b128 a[132:135], v2 offset:57360
-; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v2 offset:57376
-; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v2 offset:57392
+; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v2 offset:57456
+; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v2 offset:57440
+; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v2 offset:57424
+; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v2 offset:57408
+; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v2 offset:57344
+; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v2 offset:57360
+; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v2 offset:57376
+; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v2 offset:57392
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 2.0
; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v0
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0)
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(14)
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v1, v2, a[128:159]
; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
-; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63]
-; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95]
-; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(8)
; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v1, v2, a[96:127]
+; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(8)
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95]
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v1, v2, a[128:159]
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63]
; EXACTCUTOFF-NEXT: s_nop 7
; EXACTCUTOFF-NEXT: s_nop 4
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:112
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:96
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:80
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:64
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:48
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:16
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3]
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[156:159] offset:112
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[152:155] offset:96
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[148:151] offset:80
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[144:147] offset:64
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[140:143] offset:48
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[136:139] offset:32
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[132:135] offset:16
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[128:131]
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s1
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[56:59] offset:8288
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[60:63] offset:8304
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[48:51] offset:8256
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[52:55] offset:8272
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[40:43] offset:8224
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[44:47] offset:8240
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[32:35] offset:8192
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[36:39] offset:8208
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[88:91] offset:16480
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[92:95] offset:16496
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[80:83] offset:16448
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[84:87] offset:16464
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[72:75] offset:16416
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[76:79] offset:16432
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[64:67] offset:16384
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[68:71] offset:16400
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[120:123] offset:24672
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[124:127] offset:24688
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[112:115] offset:24640
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[116:119] offset:24656
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[104:107] offset:24608
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[108:111] offset:24624
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[96:99] offset:24576
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[100:103] offset:24592
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[152:155] offset:32864
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[156:159] offset:32880
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[144:147] offset:32832
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[148:151] offset:32848
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[136:139] offset:32800
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[140:143] offset:32816
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[128:131] offset:32768
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[132:135] offset:32784
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:8288
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:8304
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:8256
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:8272
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:8224
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:8240
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:8192
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:8208
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[120:123] offset:16480
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[124:127] offset:16496
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[112:115] offset:16448
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[116:119] offset:16464
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[104:107] offset:16416
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[108:111] offset:16432
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[96:99] offset:16384
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[100:103] offset:16400
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[88:91] offset:24672
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[92:95] offset:24688
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[80:83] offset:24640
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[84:87] offset:24656
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[72:75] offset:24608
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[76:79] offset:24624
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[64:67] offset:24576
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[68:71] offset:24592
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[56:59] offset:32864
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[60:63] offset:32880
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[48:51] offset:32832
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[52:55] offset:32848
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[40:43] offset:32800
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[44:47] offset:32816
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[32:35] offset:32768
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[36:39] offset:32784
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(5) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(40) SyncID(0)
; EXACTCUTOFF-NEXT: s_endpgm
@@ -872,68 +872,68 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0x1ff80, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0
; GCN-NEXT: v_mov_b32_e32 v2, 1.0
; GCN-NEXT: v_mov_b32_e32 v3, 2.0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_add_u32_e32 v0, s0, v1
-; GCN-NEXT: ds_read_b128 a[28:31], v0 offset:112
-; GCN-NEXT: ds_read_b128 a[24:27], v0 offset:96
-; GCN-NEXT: ds_read_b128 a[20:23], v0 offset:80
-; GCN-NEXT: ds_read_b128 a[16:19], v0 offset:64
-; GCN-NEXT: ds_read_b128 a[0:3], v0
-; GCN-NEXT: ds_read_b128 a[4:7], v0 offset:16
-; GCN-NEXT: ds_read_b128 a[8:11], v0 offset:32
-; GCN-NEXT: ds_read_b128 a[12:15], v0 offset:48
+; GCN-NEXT: v_add_u32_e32 v1, s0, v0
+; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112
+; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:96
+; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80
+; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:64
+; GCN-NEXT: ds_read_b128 a[0:3], v1
+; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:16
+; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:32
+; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:48
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; GCN-NEXT: v_add_u32_e32 v1, s1, v1
+; GCN-NEXT: v_add_u32_e32 v0, s1, v0
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 1
-; GCN-NEXT: ds_write_b128 v1, a[28:31] offset:112
-; GCN-NEXT: ds_write_b128 v1, a[24:27] offset:96
-; GCN-NEXT: ds_write_b128 v1, a[20:23] offset:80
-; GCN-NEXT: ds_write_b128 v1, a[16:19] offset:64
-; GCN-NEXT: ds_write_b128 v1, a[12:15] offset:48
-; GCN-NEXT: ds_write_b128 v1, a[8:11] offset:32
-; GCN-NEXT: ds_write_b128 v1, a[4:7] offset:16
-; GCN-NEXT: ds_write_b128 v1, a[0:3]
-; GCN-NEXT: ds_read_b128 a[28:31], v0 offset:8304
-; GCN-NEXT: ds_read_b128 a[24:27], v0 offset:8288
-; GCN-NEXT: ds_read_b128 a[20:23], v0 offset:8272
-; GCN-NEXT: ds_read_b128 a[16:19], v0 offset:8256
-; GCN-NEXT: ds_read_b128 a[12:15], v0 offset:8240
-; GCN-NEXT: ds_read_b128 a[8:11], v0 offset:8224
-; GCN-NEXT: ds_read_b128 a[4:7], v0 offset:8208
-; GCN-NEXT: ds_read_b128 a[0:3], v0 offset:8192
+; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112
+; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96
+; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:80
+; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:64
+; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:48
+; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32
+; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16
+; GCN-NEXT: ds_write_b128 v0, a[0:3]
+; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:8304
+; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:8288
+; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:8272
+; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:8256
+; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:8240
+; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:8224
+; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:8208
+; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:8192
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 1
-; GCN-NEXT: ds_write_b128 v1, a[24:27] offset:8288
-; GCN-NEXT: ds_write_b128 v1, a[28:31] offset:8304
-; GCN-NEXT: ds_write_b128 v1, a[16:19] offset:8256
-; GCN-NEXT: ds_write_b128 v1, a[20:23] offset:8272
-; GCN-NEXT: ds_write_b128 v1, a[8:11] offset:8224
-; GCN-NEXT: ds_write_b128 v1, a[12:15] offset:8240
-; GCN-NEXT: ds_write_b128 v1, a[0:3] offset:8192
-; GCN-NEXT: ds_write_b128 v1, a[4:7] offset:8208
-; GCN-NEXT: ds_read_b128 a[28:31], v0 offset:24688
-; GCN-NEXT: ds_read_b128 a[24:27], v0 offset:24672
-; GCN-NEXT: ds_read_b128 a[20:23], v0 offset:24656
-; GCN-NEXT: ds_read_b128 a[16:19], v0 offset:24640
-; GCN-NEXT: ds_read_b128 a[12:15], v0 offset:24624
-; GCN-NEXT: ds_read_b128 a[8:11], v0 offset:24608
-; GCN-NEXT: ds_read_b128 a[4:7], v0 offset:24592
-; GCN-NEXT: ds_read_b128 a[0:3], v0 offset:24576
+; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:8288
+; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:8304
+; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:8256
+; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:8272
+; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:8224
+; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:8240
+; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:8192
+; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:8208
+; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:24688
+; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:24672
+; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:24656
+; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:24640
+; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:24624
+; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:24608
+; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:24592
+; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:24576
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
@@ -942,47 +942,47 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 2
-; GCN-NEXT: ds_write_b128 v1, a[24:27] offset:16480
-; GCN-NEXT: ds_write_b128 v1, a[28:31] offset:16496
-; GCN-NEXT: ds_write_b128 v1, a[16:19] offset:16448
-; GCN-NEXT: ds_write_b128 v1, a[20:23] offset:16464
-; GCN-NEXT: ds_write_b128 v1, a[8:11] offset:16416
-; GCN-NEXT: ds_write_b128 v1, a[12:15] offset:16432
-; GCN-NEXT: ds_write_b128 v1, a[0:3] offset:16384
-; GCN-NEXT: ds_write_b128 v1, a[4:7] offset:16400
-; GCN-NEXT: ds_read_b128 a[28:31], v0 offset:49264
-; GCN-NEXT: ds_read_b128 a[24:27], v0 offset:49248
-; GCN-NEXT: ds_read_b128 a[20:23], v0 offset:49232
-; GCN-NEXT: ds_read_b128 a[16:19], v0 offset:49216
-; GCN-NEXT: ds_read_b128 a[12:15], v0 offset:49200
-; GCN-NEXT: ds_read_b128 a[8:11], v0 offset:49184
-; GCN-NEXT: ds_read_b128 a[4:7], v0 offset:49168
-; GCN-NEXT: ds_read_b128 a[0:3], v0 offset:49152
+; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:16480
+; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:16496
+; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:16448
+; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:16464
+; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:16416
+; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:16432
+; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:16384
+; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16400
+; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:49264
+; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:49248
+; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:49232
+; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:49216
+; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:49200
+; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:49184
+; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:49168
+; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:49152
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; GCN-NEXT: v_add_u32_e32 v0, 0x6000, v0
+; GCN-NEXT: v_add_u32_e32 v1, 0x6000, v1
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 1
-; GCN-NEXT: ds_write_b128 v1, a[24:27] offset:24672
-; GCN-NEXT: ds_write_b128 v1, a[28:31] offset:24688
-; GCN-NEXT: ds_write_b128 v1, a[16:19] offset:24640
-; GCN-NEXT: ds_write_b128 v1, a[20:23] offset:24656
-; GCN-NEXT: ds_write_b128 v1, a[8:11] offset:24608
-; GCN-NEXT: ds_write_b128 v1, a[12:15] offset:24624
-; GCN-NEXT: ds_write_b128 v1, a[0:3] offset:24576
-; GCN-NEXT: ds_write_b128 v1, a[4:7] offset:24592
-; GCN-NEXT: ds_read_b128 a[28:31], v0 offset:57456
-; GCN-NEXT: ds_read_b128 a[24:27], v0 offset:57440
-; GCN-NEXT: ds_read_b128 a[20:23], v0 offset:57424
-; GCN-NEXT: ds_read_b128 a[16:19], v0 offset:57408
-; GCN-NEXT: ds_read_b128 a[0:3], v0 offset:57344
-; GCN-NEXT: ds_read_b128 a[4:7], v0 offset:57360
-; GCN-NEXT: ds_read_b128 a[8:11], v0 offset:57376
-; GCN-NEXT: ds_read_b128 a[12:15], v0 offset:57392
+; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:24672
+; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:24688
+; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:24640
+; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:24656
+; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:24608
+; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:24624
+; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:24576
+; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:24592
+; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:57456
+; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:57440
+; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:57424
+; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:57408
+; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:57344
+; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:57360
+; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:57376
+; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:57392
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
@@ -991,14 +991,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 2
-; GCN-NEXT: ds_write_b128 v1, a[24:27] offset:32864
-; GCN-NEXT: ds_write_b128 v1, a[28:31] offset:32880
-; GCN-NEXT: ds_write_b128 v1, a[16:19] offset:32832
-; GCN-NEXT: ds_write_b128 v1, a[20:23] offset:32848
-; GCN-NEXT: ds_write_b128 v1, a[8:11] offset:32800
-; GCN-NEXT: ds_write_b128 v1, a[12:15] offset:32816
-; GCN-NEXT: ds_write_b128 v1, a[0:3] offset:32768
-; GCN-NEXT: ds_write_b128 v1, a[4:7] offset:32784
+; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:32864
+; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:32880
+; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:32832
+; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:32848
+; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32800
+; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:32816
+; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:32768
+; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:32784
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-NEXT: s_endpgm
;
@@ -1006,68 +1006,68 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; EXACTCUTOFF: ; %bb.0: ; %entry
; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0
-; EXACTCUTOFF-NEXT: v_and_b32_e32 v1, 0x1ff80, v0
+; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x1ff80, v0
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 1.0
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v3, 2.0
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s0, v1
-; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v0 offset:112
-; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v0 offset:96
-; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v0 offset:80
-; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v0 offset:64
-; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v0
-; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v0 offset:16
-; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v0 offset:32
-; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v0 offset:48
+; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s0, v0
+; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:112
+; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:96
+; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:80
+; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:64
+; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1
+; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:16
+; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:32
+; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:48
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s1, v1
+; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v0
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: s_nop 7
; EXACTCUTOFF-NEXT: s_nop 7
; EXACTCUTOFF-NEXT: s_nop 1
-; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[28:31] offset:112
-; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[24:27] offset:96
-; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[20:23] offset:80
-; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[16:19] offset:64
-; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[12:15] offset:48
-; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[8:11] offset:32
-; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[4:7] offset:16
-; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[0:3]
-; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v0 offset:8304
-; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v0 offset:8288
-; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v0 offset:8272
-; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v0 offset:8256
-; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v0 offset:8240
-; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v0 offset:8224
-; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v0 offset:8208
-; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v0 offset:8192
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:112
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:96
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:80
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:64
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:48
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:16
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3]
+; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:8304
+; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:8288
+; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:8272
+; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:8256
+; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:8240
+; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:8224
+; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:8208
+; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:8192
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; EXACTCUTOFF-NEXT: v_mov_b32_e32 v1, s1
+; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s1
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: s_nop 7
; EXACTCUTOFF-NEXT: s_nop 7
; EXACTCUTOFF-NEXT: s_nop 1
-; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[24:27] offset:8288
-; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[28:31] offset:8304
-; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[16:19] offset:8256
-; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[20:23] offset:8272
-; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[8:11] offset:8224
-; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[12:15] offset:8240
-; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[0:3] offset:8192
-; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[4:7] offset:8208
-; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v0 offset:24688
-; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v0 offset:24672
-; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v0 offset:24656
-; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v0 offset:24640
-; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v0 offset:24624
-; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v0 offset:24608
-; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v0 offset:24592
-; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v0 offset:24576
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:8288
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:8304
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:8256
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:8272
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:8224
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:8240
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:8192
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:8208
+; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:24688
+; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:24672
+; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:24656
+; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:24640
+; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:24624
+; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:24608
+; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:24592
+; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:24576
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
@@ -1076,47 +1076,47 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; EXACTCUTOFF-NEXT: s_nop 7
; EXACTCUTOFF-NEXT: s_nop 7
; EXACTCUTOFF-NEXT: s_nop 2
-; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[24:27] offset:16480
-; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[28:31] offset:16496
-; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[16:19] offset:16448
-; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[20:23] offset:16464
-; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[8:11] offset:16416
-; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[12:15] offset:16432
-; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[0:3] offset:16384
-; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[4:7] offset:16400
-; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v0 offset:49264
-; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v0 offset:49248
-; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v0 offset:49232
-; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v0 offset:49216
-; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v0 offset:49200
-; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v0 offset:49184
-; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v0 offset:49168
-; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v0 offset:49152
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:16480
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:16496
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:16448
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:16464
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:16416
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:16432
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:16384
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:16400
+; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:49264
+; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:49248
+; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:49232
+; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:49216
+; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:49200
+; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:49184
+; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:49168
+; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:49152
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, 0x6000, v0
+; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, 0x6000, v1
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: s_nop 7
; EXACTCUTOFF-NEXT: s_nop 7
; EXACTCUTOFF-NEXT: s_nop 1
-; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[24:27] offset:24672
-; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[28:31] offset:24688
-; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[16:19] offset:24640
-; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[20:23] offset:24656
-; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[8:11] offset:24608
-; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[12:15] offset:24624
-; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[0:3] offset:24576
-; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[4:7] offset:24592
-; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v0 offset:57456
-; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v0 offset:57440
-; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v0 offset:57424
-; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v0 offset:57408
-; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v0 offset:57344
-; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v0 offset:57360
-; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v0 offset:57376
-; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v0 offset:57392
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:24672
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:24688
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:24640
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:24656
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:24608
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:24624
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:24576
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:24592
+; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:57456
+; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:57440
+; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:57424
+; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:57408
+; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:57344
+; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:57360
+; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:57376
+; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:57392
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
@@ -1125,14 +1125,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; EXACTCUTOFF-NEXT: s_nop 7
; EXACTCUTOFF-NEXT: s_nop 7
; EXACTCUTOFF-NEXT: s_nop 2
-; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[24:27] offset:32864
-; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[28:31] offset:32880
-; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[16:19] offset:32832
-; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[20:23] offset:32848
-; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[8:11] offset:32800
-; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[12:15] offset:32816
-; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[0:3] offset:32768
-; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[4:7] offset:32784
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:32864
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:32880
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:32832
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:32848
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32800
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:32816
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:32768
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:32784
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; EXACTCUTOFF-NEXT: s_endpgm
entry:
@@ -1214,14 +1214,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA
; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5
; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0
; GCN-NEXT: v_add_u32_e32 v1, s6, v0
-; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112
-; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:96
-; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80
-; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:64
-; GCN-NEXT: ds_read_b128 a[0:3], v1
-; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:16
-; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:32
-; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:48
+; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:112
+; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:96
+; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:80
+; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:64
+; GCN-NEXT: ds_read_b128 a[96:99], v1
+; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:16
+; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:32
+; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:48
; GCN-NEXT: v_mov_b32_e32 v9, 1.0
; GCN-NEXT: v_ldexp_f32 v4, v4, v5
; GCN-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0
@@ -1236,19 +1236,19 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA
; GCN-NEXT: v_fma_f32 v10, s1, v3, -v10
; GCN-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GCN-NEXT: v_fmac_f32_e32 v10, s1, v7
-; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:8304
+; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:8304
; GCN-NEXT: s_waitcnt lgkmcnt(1)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v9, v4, a[0:31]
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v9, v4, a[96:127]
; GCN-NEXT: v_add_f32_e32 v4, v12, v10
; GCN-NEXT: v_exp_f32_e32 v4, v4
; GCN-NEXT: v_cvt_i32_f32_e32 v10, v11
-; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:8288
-; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:8272
-; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:8256
-; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:8240
-; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:8224
-; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:8208
-; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:8192
+; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:8288
+; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:8272
+; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:8256
+; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:8240
+; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:8224
+; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:8208
+; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:8192
; GCN-NEXT: v_ldexp_f32 v4, v4, v10
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s1, v5
; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
@@ -1257,7 +1257,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA
; GCN-NEXT: v_mul_f32_e32 v10, s2, v3
; GCN-NEXT: v_rndne_f32_e32 v11, v10
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v9, v4, a[32:63]
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v9, v4, a[0:31]
; GCN-NEXT: v_fma_f32 v4, s2, v3, -v10
; GCN-NEXT: v_sub_f32_e32 v12, v10, v11
; GCN-NEXT: v_fmac_f32_e32 v4, s2, v7
@@ -1273,14 +1273,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA
; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:24592
; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:24576
; GCN-NEXT: v_add_u32_e32 v2, 0x6000, v1
-; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:49264
-; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:49248
-; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:49232
-; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:49216
-; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:49200
-; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:49184
-; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:49168
-; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:49152
+; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:49264
+; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:49248
+; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:49232
+; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:49216
+; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:49200
+; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:49184
+; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:49168
+; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:49152
; GCN-NEXT: v_ldexp_f32 v1, v4, v10
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -1306,7 +1306,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA
; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
; GCN-NEXT: v_mul_f32_e32 v4, s8, v3
; GCN-NEXT: v_fma_f32 v3, s8, v3, -v4
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v9, v1, a[96:127]
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v9, v1, a[32:63]
; GCN-NEXT: v_rndne_f32_e32 v1, v4
; GCN-NEXT: v_sub_f32_e32 v10, v4, v1
; GCN-NEXT: v_fmac_f32_e32 v3, s8, v7
@@ -1325,16 +1325,16 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s8, v6
; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
; GCN-NEXT: v_add_u32_e32 v0, s7, v0
-; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112
+; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:112
; GCN-NEXT: s_waitcnt lgkmcnt(1)
; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v9, v1, a[128:159]
-; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96
-; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:80
-; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:64
-; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:48
-; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32
-; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16
-; GCN-NEXT: ds_write_b128 v0, a[0:3]
+; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:96
+; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:80
+; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:64
+; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:48
+; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:32
+; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:16
+; GCN-NEXT: ds_write_b128 v0, a[96:99]
; GCN-NEXT: v_mov_b32_e32 v0, s7
; GCN-NEXT: ; kill: killed $sgpr4_sgpr5
; GCN-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0)
@@ -1347,14 +1347,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:8288
-; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:8304
-; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:8256
-; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:8272
-; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:8224
-; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:8240
-; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:8192
-; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:8208
+; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:8288
+; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:8304
+; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:8256
+; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:8272
+; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:8224
+; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:8240
+; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:8192
+; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:8208
; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:16480
; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:16496
; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:16448
@@ -1363,14 +1363,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA
; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:16432
; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:16384
; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:16400
-; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:24672
-; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:24688
-; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:24640
-; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:24656
-; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:24608
-; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:24624
-; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:24576
-; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:24592
+; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:24672
+; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:24688
+; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:24640
+; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:24656
+; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:24608
+; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:24624
+; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:24576
+; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:24592
; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:32864
; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:32880
; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:32832
@@ -1399,14 +1399,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA
; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v5, v5
; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x1ff80, v0
; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s6, v0
-; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:112
-; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:96
-; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:80
-; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:64
-; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1
-; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:16
-; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:32
-; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:48
+; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v1 offset:112
+; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v1 offset:96
+; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v1 offset:80
+; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v1 offset:64
+; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v1
+; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v1 offset:16
+; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v1 offset:32
+; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v1 offset:48
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v9, 1.0
; EXACTCUTOFF-NEXT: v_ldexp_f32 v4, v4, v5
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0
@@ -1421,19 +1421,19 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA
; EXACTCUTOFF-NEXT: v_fma_f32 v10, s1, v3, -v10
; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v10, s1, v7
-; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v1 offset:8304
+; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:8304
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(1)
-; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v9, v4, a[0:31]
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v9, v4, a[96:127]
; EXACTCUTOFF-NEXT: v_add_f32_e32 v4, v12, v10
; EXACTCUTOFF-NEXT: v_exp_f32_e32 v4, v4
; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v10, v11
-; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v1 offset:8288
-; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v1 offset:8272
-; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v1 offset:8256
-; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v1 offset:8240
-; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v1 offset:8224
-; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v1 offset:8208
-; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v1 offset:8192
+; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:8288
+; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:8272
+; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:8256
+; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:8240
+; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:8224
+; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:8208
+; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:8192
; EXACTCUTOFF-NEXT: v_ldexp_f32 v4, v4, v10
; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s1, v5
; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
@@ -1442,7 +1442,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA
; EXACTCUTOFF-NEXT: v_mul_f32_e32 v10, s2, v3
; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v11, v10
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v9, v4, a[32:63]
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v9, v4, a[0:31]
; EXACTCUTOFF-NEXT: v_fma_f32 v4, s2, v3, -v10
; EXACTCUTOFF-NEXT: v_sub_f32_e32 v12, v10, v11
; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v4, s2, v7
@@ -1458,14 +1458,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA
; EXACTCUTOFF-NEXT: ds_read_b128 a[68:71], v1 offset:24592
; EXACTCUTOFF-NEXT: ds_read_b128 a[64:67], v1 offset:24576
; EXACTCUTOFF-NEXT: v_add_u32_e32 v2, 0x6000, v1
-; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v1 offset:49264
-; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v1 offset:49248
-; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v1 offset:49232
-; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v1 offset:49216
-; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v1 offset:49200
-; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v1 offset:49184
-; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v1 offset:49168
-; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v1 offset:49152
+; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v1 offset:49264
+; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v1 offset:49248
+; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v1 offset:49232
+; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v1 offset:49216
+; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v1 offset:49200
+; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v1 offset:49184
+; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v1 offset:49168
+; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v1 offset:49152
; EXACTCUTOFF-NEXT: v_ldexp_f32 v1, v4, v10
; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5
; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -1491,7 +1491,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA
; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
; EXACTCUTOFF-NEXT: v_mul_f32_e32 v4, s8, v3
; EXACTCUTOFF-NEXT: v_fma_f32 v3, s8, v3, -v4
-; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v9, v1, a[96:127]
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v9, v1, a[32:63]
; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v1, v4
; EXACTCUTOFF-NEXT: v_sub_f32_e32 v10, v4, v1
; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v3, s8, v7
@@ -1510,16 +1510,16 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA
; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s8, v6
; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s7, v0
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:112
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[124:127] offset:112
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(1)
; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v9, v1, a[128:159]
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:96
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:80
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:64
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:48
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:16
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3]
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[120:123] offset:96
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[116:119] offset:80
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[112:115] offset:64
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[108:111] offset:48
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[104:107] offset:32
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[100:103] offset:16
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[96:99]
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s7
; EXACTCUTOFF-NEXT: ; kill: killed $sgpr4_sgpr5
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0)
@@ -1532,14 +1532,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[56:59] offset:8288
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[60:63] offset:8304
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[48:51] offset:8256
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[52:55] offset:8272
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[40:43] offset:8224
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[44:47] offset:8240
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[32:35] offset:8192
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[36:39] offset:8208
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:8288
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:8304
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:8256
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:8272
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:8224
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:8240
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:8192
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:8208
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[88:91] offset:16480
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[92:95] offset:16496
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[80:83] offset:16448
@@ -1548,14 +1548,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[76:79] offset:16432
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[64:67] offset:16384
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[68:71] offset:16400
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[120:123] offset:24672
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[124:127] offset:24688
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[112:115] offset:24640
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[116:119] offset:24656
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[104:107] offset:24608
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[108:111] offset:24624
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[96:99] offset:24576
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[100:103] offset:24592
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[56:59] offset:24672
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[60:63] offset:24688
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[48:51] offset:24640
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[52:55] offset:24656
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[40:43] offset:24608
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[44:47] offset:24624
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[32:35] offset:24576
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[36:39] offset:24592
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[152:155] offset:32864
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[156:159] offset:32880
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[144:147] offset:32832
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index 6b922fcd9b550..de528d7259d7b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -1235,17 +1235,17 @@ declare <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32>, <8 x i32>, <16
define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__vgpr:
; SDAG: ; %bb.0: ; %bb
-; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
-; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
-; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
+; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
+; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s2, s[4:5], 0x64
-; SDAG-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
@@ -1255,21 +1255,21 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
; SDAG-NEXT: v_mov_b32_e32 v17, s13
; SDAG-NEXT: v_mov_b32_e32 v18, s14
; SDAG-NEXT: v_mov_b32_e32 v19, s15
-; SDAG-NEXT: v_mov_b32_e32 v20, s16
-; SDAG-NEXT: v_mov_b32_e32 v21, s17
-; SDAG-NEXT: v_mov_b32_e32 v22, s18
-; SDAG-NEXT: v_mov_b32_e32 v23, s19
-; SDAG-NEXT: v_mov_b32_e32 v28, s2
+; SDAG-NEXT: v_mov_b32_e32 v20, s0
+; SDAG-NEXT: v_mov_b32_e32 v21, s1
+; SDAG-NEXT: v_mov_b32_e32 v22, s2
+; SDAG-NEXT: v_mov_b32_e32 v23, s3
+; SDAG-NEXT: v_mov_b32_e32 v28, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
-; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
-; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
-; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
-; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__vgpr:
@@ -2509,17 +2509,17 @@ declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32>, <8 x i3
define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__vgpr:
; SDAG: ; %bb.0: ; %bb
-; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
-; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
-; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
+; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
+; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s2, s[4:5], 0x64
-; SDAG-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
@@ -2529,21 +2529,21 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
; SDAG-NEXT: v_mov_b32_e32 v17, s13
; SDAG-NEXT: v_mov_b32_e32 v18, s14
; SDAG-NEXT: v_mov_b32_e32 v19, s15
-; SDAG-NEXT: v_mov_b32_e32 v20, s16
-; SDAG-NEXT: v_mov_b32_e32 v21, s17
-; SDAG-NEXT: v_mov_b32_e32 v22, s18
-; SDAG-NEXT: v_mov_b32_e32 v23, s19
-; SDAG-NEXT: v_mov_b32_e32 v28, s2
+; SDAG-NEXT: v_mov_b32_e32 v20, s0
+; SDAG-NEXT: v_mov_b32_e32 v21, s1
+; SDAG-NEXT: v_mov_b32_e32 v22, s2
+; SDAG-NEXT: v_mov_b32_e32 v23, s3
+; SDAG-NEXT: v_mov_b32_e32 v28, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
-; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
-; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
-; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
-; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__vgpr:
@@ -2923,17 +2923,17 @@ declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32>, <8 x i3
define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__vgpr:
; SDAG: ; %bb.0: ; %bb
-; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
-; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
-; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
+; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
+; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s2, s[4:5], 0x64
-; SDAG-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
@@ -2943,21 +2943,21 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
; SDAG-NEXT: v_mov_b32_e32 v17, s13
; SDAG-NEXT: v_mov_b32_e32 v18, s14
; SDAG-NEXT: v_mov_b32_e32 v19, s15
-; SDAG-NEXT: v_mov_b32_e32 v20, s16
-; SDAG-NEXT: v_mov_b32_e32 v21, s17
-; SDAG-NEXT: v_mov_b32_e32 v22, s18
-; SDAG-NEXT: v_mov_b32_e32 v23, s19
-; SDAG-NEXT: v_mov_b32_e32 v28, s2
+; SDAG-NEXT: v_mov_b32_e32 v20, s0
+; SDAG-NEXT: v_mov_b32_e32 v21, s1
+; SDAG-NEXT: v_mov_b32_e32 v22, s2
+; SDAG-NEXT: v_mov_b32_e32 v23, s3
+; SDAG-NEXT: v_mov_b32_e32 v28, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
-; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
-; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
-; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
-; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__vgpr:
@@ -3337,17 +3337,17 @@ declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32>, <8 x i3
define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__vgpr:
; SDAG: ; %bb.0: ; %bb
-; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
-; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
-; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
+; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
+; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s2, s[4:5], 0x64
-; SDAG-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
@@ -3357,21 +3357,21 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
; SDAG-NEXT: v_mov_b32_e32 v17, s13
; SDAG-NEXT: v_mov_b32_e32 v18, s14
; SDAG-NEXT: v_mov_b32_e32 v19, s15
-; SDAG-NEXT: v_mov_b32_e32 v20, s16
-; SDAG-NEXT: v_mov_b32_e32 v21, s17
-; SDAG-NEXT: v_mov_b32_e32 v22, s18
-; SDAG-NEXT: v_mov_b32_e32 v23, s19
-; SDAG-NEXT: v_mov_b32_e32 v28, s2
+; SDAG-NEXT: v_mov_b32_e32 v20, s0
+; SDAG-NEXT: v_mov_b32_e32 v21, s1
+; SDAG-NEXT: v_mov_b32_e32 v22, s2
+; SDAG-NEXT: v_mov_b32_e32 v23, s3
+; SDAG-NEXT: v_mov_b32_e32 v28, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
-; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
-; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
-; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
-; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__vgpr:
@@ -3751,17 +3751,17 @@ declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32>, <8 x i3
define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__vgpr:
; SDAG: ; %bb.0: ; %bb
-; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
-; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
-; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
+; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
+; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s2, s[4:5], 0x64
-; SDAG-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
@@ -3771,21 +3771,21 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
; SDAG-NEXT: v_mov_b32_e32 v17, s13
; SDAG-NEXT: v_mov_b32_e32 v18, s14
; SDAG-NEXT: v_mov_b32_e32 v19, s15
-; SDAG-NEXT: v_mov_b32_e32 v20, s16
-; SDAG-NEXT: v_mov_b32_e32 v21, s17
-; SDAG-NEXT: v_mov_b32_e32 v22, s18
-; SDAG-NEXT: v_mov_b32_e32 v23, s19
-; SDAG-NEXT: v_mov_b32_e32 v28, s2
+; SDAG-NEXT: v_mov_b32_e32 v20, s0
+; SDAG-NEXT: v_mov_b32_e32 v21, s1
+; SDAG-NEXT: v_mov_b32_e32 v22, s2
+; SDAG-NEXT: v_mov_b32_e32 v23, s3
+; SDAG-NEXT: v_mov_b32_e32 v28, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
-; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
-; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
-; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
-; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__vgpr:
@@ -4156,4 +4156,4 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg
ret <16 x float> %result
}
-attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
\ No newline at end of file
+attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
index bbade6e7469f7..2b3010acbaf85 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -520,44 +520,44 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
;
; SI-SDAG-LABEL: s_exp_v2f32:
; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f
-; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; SI-SDAG-NEXT: s_mov_b32 s6, -1
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SI-SDAG-NEXT: v_mul_f32_e32 v2, s3, v0
+; SI-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0
; SI-SDAG-NEXT: v_rndne_f32_e32 v3, v2
-; SI-SDAG-NEXT: v_fma_f32 v4, s3, v0, -v2
+; SI-SDAG-NEXT: v_fma_f32 v4, s7, v0, -v2
; SI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3
-; SI-SDAG-NEXT: v_fma_f32 v4, s3, v1, v4
+; SI-SDAG-NEXT: v_fma_f32 v4, s7, v1, v4
; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
-; SI-SDAG-NEXT: v_mul_f32_e32 v5, s2, v0
+; SI-SDAG-NEXT: v_mul_f32_e32 v5, s6, v0
; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2
; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3
; SI-SDAG-NEXT: v_rndne_f32_e32 v6, v5
-; SI-SDAG-NEXT: v_fma_f32 v0, s2, v0, -v5
+; SI-SDAG-NEXT: v_fma_f32 v0, s6, v0, -v5
; SI-SDAG-NEXT: v_sub_f32_e32 v7, v5, v6
-; SI-SDAG-NEXT: v_fma_f32 v0, s2, v1, v0
+; SI-SDAG-NEXT: v_fma_f32 v0, s6, v1, v0
; SI-SDAG-NEXT: v_add_f32_e32 v0, v7, v0
; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v5, v6
; SI-SDAG-NEXT: v_ldexp_f32_e32 v2, v2, v3
; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0xc2ce8ed0
-; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v3
+; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v3
; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0x42b17218
; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; SI-SDAG-NEXT: v_mov_b32_e32 v6, 0x7f800000
-; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v4
+; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s7, v4
; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v5
-; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v3
+; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v3
; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v4
-; SI-SDAG-NEXT: s_mov_b32 s4, s0
-; SI-SDAG-NEXT: s_mov_b32 s5, s1
+; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v4
+; SI-SDAG-NEXT: s_mov_b32 s0, s4
+; SI-SDAG-NEXT: s_mov_b32 s1, s5
; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
-; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-SDAG-NEXT: s_endpgm
;
; SI-GISEL-LABEL: s_exp_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
index 81bb556b8c87b..bbf9329f8514e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
@@ -522,44 +522,44 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
;
; SI-SDAG-LABEL: s_exp10_v2f32:
; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37
-; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; SI-SDAG-NEXT: s_mov_b32 s6, -1
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SI-SDAG-NEXT: v_mul_f32_e32 v2, s3, v0
+; SI-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0
; SI-SDAG-NEXT: v_rndne_f32_e32 v3, v2
-; SI-SDAG-NEXT: v_fma_f32 v4, s3, v0, -v2
+; SI-SDAG-NEXT: v_fma_f32 v4, s7, v0, -v2
; SI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3
-; SI-SDAG-NEXT: v_fma_f32 v4, s3, v1, v4
+; SI-SDAG-NEXT: v_fma_f32 v4, s7, v1, v4
; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
-; SI-SDAG-NEXT: v_mul_f32_e32 v5, s2, v0
+; SI-SDAG-NEXT: v_mul_f32_e32 v5, s6, v0
; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2
; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3
; SI-SDAG-NEXT: v_rndne_f32_e32 v6, v5
-; SI-SDAG-NEXT: v_fma_f32 v0, s2, v0, -v5
+; SI-SDAG-NEXT: v_fma_f32 v0, s6, v0, -v5
; SI-SDAG-NEXT: v_sub_f32_e32 v7, v5, v6
-; SI-SDAG-NEXT: v_fma_f32 v0, s2, v1, v0
+; SI-SDAG-NEXT: v_fma_f32 v0, s6, v1, v0
; SI-SDAG-NEXT: v_add_f32_e32 v0, v7, v0
; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v5, v6
; SI-SDAG-NEXT: v_ldexp_f32_e32 v2, v2, v3
; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0xc23369f4
-; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v3
+; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v3
; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0x421a209b
; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; SI-SDAG-NEXT: v_mov_b32_e32 v6, 0x7f800000
-; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v4
+; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s7, v4
; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v5
-; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v3
+; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v3
; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v4
-; SI-SDAG-NEXT: s_mov_b32 s4, s0
-; SI-SDAG-NEXT: s_mov_b32 s5, s1
+; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v4
+; SI-SDAG-NEXT: s_mov_b32 s0, s4
+; SI-SDAG-NEXT: s_mov_b32 s1, s5
; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
-; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-SDAG-NEXT: s_endpgm
;
; SI-GISEL-LABEL: s_exp10_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
index b378d69fb842f..af914bd4043cf 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
@@ -76,13 +76,12 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
-; SI-NEXT: s_movk_i32 s4, 0xfc01
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s3, 0xfffff
; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_bfe_u32 v4, v3, 20, 11
-; SI-NEXT: v_add_i32_e32 v6, vcc, s4, v4
+; SI-NEXT: v_add_i32_e32 v6, vcc, 0xfffffc01, v4
; SI-NEXT: v_lshr_b64 v[4:5], s[2:3], v6
; SI-NEXT: v_and_b32_e32 v7, 0x80000000, v3
; SI-NEXT: v_not_b32_e32 v5, v5
@@ -575,87 +574,87 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in)
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19
; CI-NEXT: s_brev_b32 s6, -2
-; CI-NEXT: v_mov_b32_e32 v4, 0
+; CI-NEXT: v_mov_b32_e32 v12, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_trunc_f64_e32 v[0:1], s[10:11]
-; CI-NEXT: v_trunc_f64_e32 v[6:7], s[8:9]
+; CI-NEXT: v_trunc_f64_e32 v[4:5], s[8:9]
; CI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1]
-; CI-NEXT: v_add_f64 v[8:9], s[8:9], -v[6:7]
+; CI-NEXT: v_add_f64 v[6:7], s[8:9], -v[4:5]
; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[2:3]|, 0.5
-; CI-NEXT: v_cmp_ge_f64_e64 s[2:3], |v[8:9]|, 0.5
+; CI-NEXT: v_cmp_ge_f64_e64 s[2:3], |v[6:7]|, 0.5
; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec
; CI-NEXT: s_cselect_b32 s7, 0x3ff00000, 0
-; CI-NEXT: v_mov_b32_e32 v5, s11
+; CI-NEXT: v_mov_b32_e32 v8, s11
; CI-NEXT: s_and_b64 s[0:1], s[2:3], exec
; CI-NEXT: v_mov_b32_e32 v2, s7
-; CI-NEXT: v_trunc_f64_e32 v[8:9], s[14:15]
-; CI-NEXT: v_bfi_b32 v5, s6, v2, v5
+; CI-NEXT: v_trunc_f64_e32 v[6:7], s[14:15]
+; CI-NEXT: v_bfi_b32 v13, s6, v2, v8
; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0
-; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5]
-; CI-NEXT: v_mov_b32_e32 v5, s0
-; CI-NEXT: v_mov_b32_e32 v10, s9
-; CI-NEXT: v_add_f64 v[0:1], s[14:15], -v[8:9]
-; CI-NEXT: v_bfi_b32 v5, s6, v5, v10
+; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[12:13]
+; CI-NEXT: v_mov_b32_e32 v8, s0
+; CI-NEXT: v_mov_b32_e32 v9, s9
+; CI-NEXT: v_add_f64 v[0:1], s[14:15], -v[6:7]
+; CI-NEXT: v_bfi_b32 v13, s6, v8, v9
; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[0:1]|, 0.5
-; CI-NEXT: v_add_f64 v[0:1], v[6:7], v[4:5]
-; CI-NEXT: v_trunc_f64_e32 v[6:7], s[12:13]
+; CI-NEXT: v_add_f64 v[0:1], v[4:5], v[12:13]
+; CI-NEXT: v_trunc_f64_e32 v[4:5], s[12:13]
; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec
-; CI-NEXT: v_add_f64 v[10:11], s[12:13], -v[6:7]
+; CI-NEXT: v_add_f64 v[8:9], s[12:13], -v[4:5]
; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0
-; CI-NEXT: v_mov_b32_e32 v5, s0
-; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[10:11]|, 0.5
-; CI-NEXT: v_trunc_f64_e32 v[10:11], s[18:19]
-; CI-NEXT: v_mov_b32_e32 v12, s15
+; CI-NEXT: v_mov_b32_e32 v10, s0
+; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[8:9]|, 0.5
+; CI-NEXT: v_trunc_f64_e32 v[8:9], s[18:19]
+; CI-NEXT: v_mov_b32_e32 v11, s15
+; CI-NEXT: v_bfi_b32 v13, s6, v10, v11
; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec
-; CI-NEXT: v_bfi_b32 v5, s6, v5, v12
+; CI-NEXT: v_add_f64 v[10:11], s[18:19], -v[8:9]
; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0
-; CI-NEXT: v_add_f64 v[12:13], s[18:19], -v[10:11]
-; CI-NEXT: v_add_f64 v[8:9], v[8:9], v[4:5]
-; CI-NEXT: v_mov_b32_e32 v5, s0
+; CI-NEXT: v_add_f64 v[6:7], v[6:7], v[12:13]
+; CI-NEXT: v_mov_b32_e32 v13, s0
; CI-NEXT: v_mov_b32_e32 v14, s13
-; CI-NEXT: v_bfi_b32 v5, s6, v5, v14
-; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[12:13]|, 0.5
+; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[10:11]|, 0.5
+; CI-NEXT: v_bfi_b32 v13, s6, v13, v14
; CI-NEXT: v_trunc_f64_e32 v[14:15], s[16:17]
; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec
-; CI-NEXT: v_add_f64 v[12:13], s[16:17], -v[14:15]
; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0
-; CI-NEXT: v_add_f64 v[6:7], v[6:7], v[4:5]
-; CI-NEXT: v_mov_b32_e32 v5, s0
-; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[12:13]|, 0.5
+; CI-NEXT: v_add_f64 v[10:11], s[16:17], -v[14:15]
+; CI-NEXT: v_add_f64 v[4:5], v[4:5], v[12:13]
+; CI-NEXT: v_mov_b32_e32 v13, s0
; CI-NEXT: v_mov_b32_e32 v16, s19
-; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec
-; CI-NEXT: v_bfi_b32 v5, s6, v5, v16
-; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0
+; CI-NEXT: v_bfi_b32 v13, s6, v13, v16
+; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[10:11]|, 0.5
; CI-NEXT: v_trunc_f64_e32 v[16:17], s[22:23]
-; CI-NEXT: v_add_f64 v[12:13], v[10:11], v[4:5]
-; CI-NEXT: v_mov_b32_e32 v5, s0
-; CI-NEXT: v_mov_b32_e32 v10, s17
-; CI-NEXT: v_bfi_b32 v5, s6, v5, v10
+; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec
; CI-NEXT: v_add_f64 v[18:19], s[22:23], -v[16:17]
-; CI-NEXT: v_add_f64 v[10:11], v[14:15], v[4:5]
-; CI-NEXT: v_trunc_f64_e32 v[14:15], s[20:21]
+; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0
+; CI-NEXT: v_add_f64 v[10:11], v[8:9], v[12:13]
+; CI-NEXT: v_mov_b32_e32 v8, s0
+; CI-NEXT: v_mov_b32_e32 v9, s17
; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[18:19]|, 0.5
-; CI-NEXT: v_add_f64 v[18:19], s[20:21], -v[14:15]
+; CI-NEXT: v_trunc_f64_e32 v[18:19], s[20:21]
+; CI-NEXT: v_bfi_b32 v13, s6, v8, v9
+; CI-NEXT: v_add_f64 v[8:9], v[14:15], v[12:13]
+; CI-NEXT: v_add_f64 v[13:14], s[20:21], -v[18:19]
; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec
-; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[18:19]|, 0.5
+; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[13:14]|, 0.5
; CI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0
; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec
-; CI-NEXT: v_mov_b32_e32 v5, s2
-; CI-NEXT: v_mov_b32_e32 v18, s23
; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0
-; CI-NEXT: v_bfi_b32 v5, s6, v5, v18
-; CI-NEXT: v_mov_b32_e32 v18, s0
+; CI-NEXT: v_mov_b32_e32 v13, s2
+; CI-NEXT: v_mov_b32_e32 v14, s23
+; CI-NEXT: v_mov_b32_e32 v20, s0
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; CI-NEXT: v_mov_b32_e32 v19, s21
-; CI-NEXT: v_add_f64 v[16:17], v[16:17], v[4:5]
-; CI-NEXT: v_bfi_b32 v5, s6, v18, v19
-; CI-NEXT: v_add_f64 v[14:15], v[14:15], v[4:5]
+; CI-NEXT: v_bfi_b32 v13, s6, v13, v14
+; CI-NEXT: v_mov_b32_e32 v21, s21
+; CI-NEXT: v_add_f64 v[14:15], v[16:17], v[12:13]
+; CI-NEXT: v_bfi_b32 v13, s6, v20, v21
+; CI-NEXT: v_add_f64 v[12:13], v[18:19], v[12:13]
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48
-; CI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32
-; CI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16
+; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
+; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; CI-NEXT: s_endpgm
%result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
index 33e34e38a1837..edc67ec7ad8df 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
@@ -1676,9 +1676,8 @@ define amdgpu_kernel void @get_rounding_after_set_rounding_1() {
; GFX6-NEXT: s_add_i32 s1, s0, 4
; GFX6-NEXT: s_cmp_lt_u32 s0, 4
; GFX6-NEXT: s_cselect_b32 s4, s0, s1
-; GFX6-NEXT: s_mov_b32 s0, 0
+; GFX6-NEXT: s_mov_b64 s[0:1], 0
; GFX6-NEXT: s_mov_b32 s2, -1
-; GFX6-NEXT: s_mov_b32 s1, s0
; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -1698,9 +1697,8 @@ define amdgpu_kernel void @get_rounding_after_set_rounding_1() {
; GFX7-NEXT: s_add_i32 s1, s0, 4
; GFX7-NEXT: s_cmp_lt_u32 s0, 4
; GFX7-NEXT: s_cselect_b32 s4, s0, s1
-; GFX7-NEXT: s_mov_b32 s0, 0
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: s_mov_b32 s1, s0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index 458afa4d6aad2..1af026a48b906 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -7321,65 +7321,65 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
;
; GFX12-LABEL: constant_sextload_v16i16_to_v16i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
+; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s30, s5
-; GFX12-NEXT: s_lshr_b32 s34, s5, 16
-; GFX12-NEXT: s_bfe_i64 s[28:29], s[4:5], 0x100000
-; GFX12-NEXT: s_lshr_b32 s4, s4, 16
-; GFX12-NEXT: s_bfe_i64 s[22:23], s[6:7], 0x100000
-; GFX12-NEXT: s_mov_b32 s24, s7
-; GFX12-NEXT: s_lshr_b32 s26, s7, 16
+; GFX12-NEXT: s_mov_b32 s30, s9
+; GFX12-NEXT: s_lshr_b32 s34, s9, 16
+; GFX12-NEXT: s_bfe_i64 s[28:29], s[8:9], 0x100000
+; GFX12-NEXT: s_lshr_b32 s8, s8, 16
+; GFX12-NEXT: s_bfe_i64 s[22:23], s[10:11], 0x100000
+; GFX12-NEXT: s_mov_b32 s24, s11
+; GFX12-NEXT: s_lshr_b32 s26, s11, 16
; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x100000
; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x100000
-; GFX12-NEXT: s_lshr_b32 s6, s6, 16
-; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000
+; GFX12-NEXT: s_lshr_b32 s10, s10, 16
+; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000
; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s29
-; GFX12-NEXT: s_mov_b32 s18, s3
-; GFX12-NEXT: s_lshr_b32 s20, s3, 16
+; GFX12-NEXT: s_mov_b32 s18, s7
+; GFX12-NEXT: s_lshr_b32 s20, s7, 16
; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000
; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x100000
; GFX12-NEXT: v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v9, s31
; GFX12-NEXT: v_dual_mov_b32 v8, s30 :: v_dual_mov_b32 v11, s35
-; GFX12-NEXT: v_dual_mov_b32 v10, s34 :: v_dual_mov_b32 v3, s5
-; GFX12-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x100000
-; GFX12-NEXT: s_lshr_b32 s2, s2, 16
-; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000
+; GFX12-NEXT: v_dual_mov_b32 v10, s34 :: v_dual_mov_b32 v3, s9
+; GFX12-NEXT: s_bfe_i64 s[16:17], s[6:7], 0x100000
+; GFX12-NEXT: s_lshr_b32 s6, s6, 16
+; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000
; GFX12-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v5, s23
-; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v13, s25
-; GFX12-NEXT: s_mov_b32 s12, s1
-; GFX12-NEXT: s_lshr_b32 s14, s1, 16
+; GFX12-NEXT: v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v13, s25
+; GFX12-NEXT: s_mov_b32 s12, s5
+; GFX12-NEXT: s_lshr_b32 s14, s5, 16
; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x100000
; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x100000
; GFX12-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
-; GFX12-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v7, s7
-; GFX12-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x100000
-; GFX12-NEXT: s_lshr_b32 s0, s0, 16
-; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000
-; GFX12-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v17, s19
+; GFX12-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v7, s11
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x100000
+; GFX12-NEXT: s_lshr_b32 s4, s4, 16
+; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000
+; GFX12-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v17, s19
; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000
; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000
; GFX12-NEXT: v_dual_mov_b32 v16, s18 :: v_dual_mov_b32 v19, s21
; GFX12-NEXT: v_mov_b32_e32 v18, s20
-; GFX12-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:80
-; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:64
+; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:80
+; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:64
; GFX12-NEXT: v_dual_mov_b32 v1, s17 :: v_dual_mov_b32 v0, s16
-; GFX12-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
+; GFX12-NEXT: v_dual_mov_b32 v3, s7 :: v_dual_mov_b32 v2, s6
; GFX12-NEXT: v_dual_mov_b32 v9, s13 :: v_dual_mov_b32 v8, s12
; GFX12-NEXT: v_dual_mov_b32 v11, s15 :: v_dual_mov_b32 v10, s14
-; GFX12-NEXT: v_dual_mov_b32 v21, s11 :: v_dual_mov_b32 v20, s10
-; GFX12-NEXT: v_dual_mov_b32 v23, s1 :: v_dual_mov_b32 v22, s0
+; GFX12-NEXT: v_dual_mov_b32 v21, s3 :: v_dual_mov_b32 v20, s2
+; GFX12-NEXT: v_dual_mov_b32 v23, s5 :: v_dual_mov_b32 v22, s4
; GFX12-NEXT: s_clause 0x5
-; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:112
-; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:96
-; GFX12-NEXT: global_store_b128 v24, v[16:19], s[8:9] offset:48
-; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:32
-; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:16
-; GFX12-NEXT: global_store_b128 v24, v[20:23], s[8:9]
+; GFX12-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:112
+; GFX12-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:96
+; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:48
+; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:32
+; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1]
; GFX12-NEXT: s_endpgm
%load = load <16 x i16>, ptr addrspace(4) %in
%ext = sext <16 x i16> %load to <16 x i64>
@@ -8811,7 +8811,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshr_b32 s28, s2, 16
; GFX12-NEXT: s_lshr_b32 s42, s5, 16
-; GFX12-NEXT: s_lshr_b32 s52, s8, 16
+; GFX12-NEXT: s_lshr_b32 s50, s8, 16
; GFX12-NEXT: s_mov_b32 s60, s11
; GFX12-NEXT: s_lshr_b32 s22, s0, 16
; GFX12-NEXT: s_mov_b32 s24, s1
@@ -8820,18 +8820,18 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GFX12-NEXT: s_lshr_b32 s36, s3, 16
; GFX12-NEXT: s_lshr_b32 s38, s4, 16
; GFX12-NEXT: s_mov_b32 s40, s5
-; GFX12-NEXT: s_lshr_b32 s46, s6, 16
-; GFX12-NEXT: s_mov_b32 s48, s7
-; GFX12-NEXT: s_lshr_b32 s50, s7, 16
-; GFX12-NEXT: s_mov_b32 s54, s9
-; GFX12-NEXT: s_lshr_b32 s56, s9, 16
-; GFX12-NEXT: s_bfe_i64 s[44:45], s[10:11], 0x100000
+; GFX12-NEXT: s_lshr_b32 s44, s6, 16
+; GFX12-NEXT: s_mov_b32 s46, s7
+; GFX12-NEXT: s_lshr_b32 s48, s7, 16
+; GFX12-NEXT: s_mov_b32 s52, s9
+; GFX12-NEXT: s_lshr_b32 s54, s9, 16
+; GFX12-NEXT: s_bfe_i64 s[56:57], s[10:11], 0x100000
; GFX12-NEXT: s_lshr_b32 s58, s10, 16
; GFX12-NEXT: s_lshr_b32 s62, s11, 16
; GFX12-NEXT: s_bfe_i64 s[10:11], s[28:29], 0x100000
; GFX12-NEXT: s_bfe_i64 s[28:29], s[42:43], 0x100000
-; GFX12-NEXT: s_bfe_i64 s[42:43], s[52:53], 0x100000
-; GFX12-NEXT: s_bfe_i64 s[52:53], s[60:61], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[42:43], s[50:51], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[50:51], s[60:61], 0x100000
; GFX12-NEXT: s_lshr_b32 s60, s14, 16
; GFX12-NEXT: s_bfe_i64 s[64:65], s[14:15], 0x100000
; GFX12-NEXT: s_mov_b32 s14, s15
@@ -8848,14 +8848,14 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GFX12-NEXT: s_bfe_i64 s[22:23], s[36:37], 0x100000
; GFX12-NEXT: s_bfe_i64 s[26:27], s[38:39], 0x100000
; GFX12-NEXT: s_bfe_i64 s[30:31], s[40:41], 0x100000
-; GFX12-NEXT: s_bfe_i64 s[36:37], s[46:47], 0x100000
-; GFX12-NEXT: s_bfe_i64 s[40:41], s[48:49], 0x100000
-; GFX12-NEXT: s_bfe_i64 s[38:39], s[50:51], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[36:37], s[44:45], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[40:41], s[46:47], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[38:39], s[48:49], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[44:45], s[52:53], 0x100000
; GFX12-NEXT: s_bfe_i64 s[46:47], s[54:55], 0x100000
-; GFX12-NEXT: s_bfe_i64 s[48:49], s[56:57], 0x100000
-; GFX12-NEXT: s_bfe_i64 s[50:51], s[58:59], 0x100000
-; GFX12-NEXT: s_lshr_b32 s54, s12, 16
-; GFX12-NEXT: s_bfe_i64 s[56:57], s[12:13], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[48:49], s[58:59], 0x100000
+; GFX12-NEXT: s_lshr_b32 s52, s12, 16
+; GFX12-NEXT: s_bfe_i64 s[54:55], s[12:13], 0x100000
; GFX12-NEXT: s_mov_b32 s12, s13
; GFX12-NEXT: s_lshr_b32 s58, s13, 16
; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000
@@ -8866,26 +8866,26 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GFX12-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x100000
; GFX12-NEXT: v_dual_mov_b32 v0, s14 :: v_dual_mov_b32 v3, s67
; GFX12-NEXT: v_dual_mov_b32 v2, s66 :: v_dual_mov_b32 v5, s65
-; GFX12-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x100000
; GFX12-NEXT: v_dual_mov_b32 v4, s64 :: v_dual_mov_b32 v7, s61
; GFX12-NEXT: v_dual_mov_b32 v6, s60 :: v_dual_mov_b32 v9, s13
; GFX12-NEXT: v_dual_mov_b32 v8, s12 :: v_dual_mov_b32 v11, s59
-; GFX12-NEXT: v_dual_mov_b32 v10, s58 :: v_dual_mov_b32 v13, s57
-; GFX12-NEXT: v_dual_mov_b32 v12, s56 :: v_dual_mov_b32 v15, s55
-; GFX12-NEXT: v_mov_b32_e32 v14, s54
+; GFX12-NEXT: v_dual_mov_b32 v10, s58 :: v_dual_mov_b32 v13, s55
+; GFX12-NEXT: v_dual_mov_b32 v12, s54 :: v_dual_mov_b32 v15, s53
+; GFX12-NEXT: v_mov_b32_e32 v14, s52
; GFX12-NEXT: s_bfe_i64 s[12:13], s[62:63], 0x100000
; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b128 v24, v[0:3], s[16:17] offset:240
; GFX12-NEXT: global_store_b128 v24, v[4:7], s[16:17] offset:224
; GFX12-NEXT: global_store_b128 v24, v[8:11], s[16:17] offset:208
; GFX12-NEXT: global_store_b128 v24, v[12:15], s[16:17] offset:192
-; GFX12-NEXT: v_dual_mov_b32 v1, s53 :: v_dual_mov_b32 v0, s52
+; GFX12-NEXT: v_dual_mov_b32 v1, s51 :: v_dual_mov_b32 v0, s50
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v3, s13 :: v_dual_mov_b32 v2, s12
-; GFX12-NEXT: v_dual_mov_b32 v5, s45 :: v_dual_mov_b32 v4, s44
-; GFX12-NEXT: v_dual_mov_b32 v7, s51 :: v_dual_mov_b32 v6, s50
-; GFX12-NEXT: v_dual_mov_b32 v9, s47 :: v_dual_mov_b32 v8, s46
-; GFX12-NEXT: v_dual_mov_b32 v11, s49 :: v_dual_mov_b32 v10, s48
+; GFX12-NEXT: v_dual_mov_b32 v5, s57 :: v_dual_mov_b32 v4, s56
+; GFX12-NEXT: v_dual_mov_b32 v7, s49 :: v_dual_mov_b32 v6, s48
+; GFX12-NEXT: v_dual_mov_b32 v9, s45 :: v_dual_mov_b32 v8, s44
+; GFX12-NEXT: v_dual_mov_b32 v11, s47 :: v_dual_mov_b32 v10, s46
; GFX12-NEXT: v_dual_mov_b32 v13, s35 :: v_dual_mov_b32 v12, s34
; GFX12-NEXT: v_dual_mov_b32 v15, s43 :: v_dual_mov_b32 v14, s42
; GFX12-NEXT: v_dual_mov_b32 v17, s41 :: v_dual_mov_b32 v16, s40
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
index 4ce3b46211e64..120f47a277ee6 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
@@ -846,29 +846,29 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs
;
; GFX7-HSA-LABEL: constant_load_v11i32:
; GFX7-HSA: ; %bb.0: ; %entry
-; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-HSA-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x8
-; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
-; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16
-; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s3
-; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s2
+; GFX7-HSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x8
+; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
+; GFX7-HSA-NEXT: s_add_u32 s10, s8, 16
+; GFX7-HSA-NEXT: s_addc_u32 s11, s9, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s10
+; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s11
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s1
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7
-; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s0
-; GFX7-HSA-NEXT: s_add_u32 s0, s0, 32
; GFX7-HSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
-; GFX7-HSA-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s8
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s3
+; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s9
+; GFX7-HSA-NEXT: s_add_u32 s0, s8, 32
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
+; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 573338231bd57..c5771bc73b945 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -7981,48 +7981,48 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v3
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, v7
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v11
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, v15
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, v3
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, v7
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, v11
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v15
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v10
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v8
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v22, 0, 16
-; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[24:25], v[2:3], 48
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:240
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v21, 0, 16
+; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[23:24], v[2:3], 48
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v22, 31, v21
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:240
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[24:25], v[0:1], 48
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v1, 0, 16
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:208
+; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[23:24], v[0:1], 48
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v1, 0, 16
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v22, 31, v21
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:208
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v12
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v26, 0, 16
-; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[24:25], v[6:7], 48
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:176
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v25, 0, 16
+; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[23:24], v[6:7], 48
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v22, 31, v21
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:176
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[24:25], v[4:5], 48
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v5, 0, 16
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:144
+; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[23:24], v[4:5], 48
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v5, 0, 16
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v22, 31, v21
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:144
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[24:25], v[10:11], 48
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v27, 0, 16
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[23:24], v[10:11], 48
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v26, 0, 16
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v22, 31, v21
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:112
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[24:25], v[8:9], 48
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v9, 0, 16
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[23:24], v[8:9], 48
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v9, 0, 16
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v22, 31, v21
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:80
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[23:24], v[14:15], 48
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v21, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v27, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v22, 31, v21
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:48
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index fe693b4af67f3..033a66abcedb9 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -1985,36 +1985,36 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out,
;
; GCNX3-NOHSA-LABEL: global_zextload_v8i32_to_v8i64:
; GCNX3-NOHSA: ; %bb.0:
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v9, 0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v11, v9
-; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
-; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
+; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
+; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1)
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v2
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v3
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:48
; GCNX3-NOHSA-NEXT: s_nop 0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v1
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v6
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v7
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16
; GCNX3-NOHSA-NEXT: s_nop 0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v4
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v5
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v8i32_to_v8i64:
@@ -2373,21 +2373,21 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
-; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 48
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1]
+; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 48
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32
-; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
-; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16
-; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1]
-; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
+; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5
+; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16
+; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
+; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
@@ -2438,30 +2438,31 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou
; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19]
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11]
-; GCNX3-HSA-NEXT: s_waitcnt vmcnt(4)
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v1
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v3
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v2
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v0
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, v0
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, v1
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v2
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v3
+; GCNX3-HSA-NEXT: s_waitcnt vmcnt(5)
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v5
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v7
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v6
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v4
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, v4
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, v5
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v6
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v7
; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s1
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15]
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[8:11]
; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, s0
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v5
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v4
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v4
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v5
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v7
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v6
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v6
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v7
+; GCNX3-HSA-NEXT: s_waitcnt vmcnt(6)
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v1
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v0
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v0
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v1
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v3
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v2
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v2
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, v3
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[22:23], v[8:11]
-; GCNX3-HSA-NEXT: flat_store_dwordx4 v[30:31], v[0:3]
+; GCNX3-HSA-NEXT: flat_store_dwordx4 v[30:31], v[4:7]
; GCNX3-HSA-NEXT: s_endpgm
;
; GCNX3-NOHSA-LABEL: global_sextload_v16i32_to_v16i64:
@@ -2614,115 +2615,60 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou
; EG-NEXT: MOV * T16.Z, T1.Y,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
-; GCN-GFX900-HSA-LABEL: global_sextload_v16i32_to_v16i64:
-; GCN-GFX900-HSA: ; %bb.0:
-; GCN-GFX900-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v36, 0
-; GCN-GFX900-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[0:3], v36, s[2:3] offset:32
-; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[4:7], v36, s[2:3] offset:48
-; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[8:11], v36, s[2:3] offset:16
-; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[12:15], v36, s[2:3]
-; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(3)
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v3
-; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(2)
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v5
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v4
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v24, v4
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v26, v5
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v2
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v7
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v6
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v20, v6
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v22, v7
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v0
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v4, v0
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v6, v1
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v16, v2
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v18, v3
-; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(1)
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v11
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v10
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v9
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v8
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v28, v8
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v30, v9
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v0, v10
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v2, v11
-; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(0)
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v15
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v14
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v35, 31, v13
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v33, 31, v12
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v32, v12
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v34, v13
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v8, v14
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v10, v15
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v36, v[24:27], s[0:1] offset:96
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v36, v[20:23], s[0:1] offset:112
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:64
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:80
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v36, v[28:31], s[0:1] offset:32
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] offset:48
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1]
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:16
-; GCN-GFX900-HSA-NEXT: s_endpgm
-;
-; GCN-GFX908-HSA-LABEL: global_sextload_v16i32_to_v16i64:
-; GCN-GFX908-HSA: ; %bb.0:
-; GCN-GFX908-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v0, 0
-; GCN-GFX908-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3] offset:32
-; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[5:8], v0, s[2:3] offset:48
-; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[9:12], v0, s[2:3] offset:16
-; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[13:16], v0, s[2:3]
-; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(3)
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v4
-; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(2)
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v6
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v5
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v25, v5
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v27, v6
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v3
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v8
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v22, 31, v7
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v21, v7
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v23, v8
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v2
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v6, 31, v1
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v5, v1
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v7, v2
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v17, v3
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v19, v4
-; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(1)
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v12
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v2, 31, v11
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v32, 31, v10
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v30, 31, v9
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v29, v9
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v31, v10
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v1, v11
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v3, v12
-; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(0)
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v16
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v15
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v36, 31, v14
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v34, 31, v13
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v33, v13
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v35, v14
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v9, v15
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v11, v16
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v0, v[25:28], s[0:1] offset:96
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v0, v[21:24], s[0:1] offset:112
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1] offset:64
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v0, v[17:20], s[0:1] offset:80
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v0, v[29:32], s[0:1] offset:32
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] offset:48
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v0, v[33:36], s[0:1]
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v0, v[9:12], s[0:1] offset:16
-; GCN-GFX908-HSA-NEXT: s_endpgm
+; GCN-HSA-LABEL: global_sextload_v16i32_to_v16i64:
+; GCN-HSA: ; %bb.0:
+; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: v_mov_b32_e32 v36, 0
+; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v36, s[2:3] offset:32
+; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v36, s[2:3] offset:48
+; GCN-HSA-NEXT: global_load_dwordx4 v[8:11], v36, s[2:3] offset:16
+; GCN-HSA-NEXT: global_load_dwordx4 v[12:15], v36, s[2:3]
+; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v3
+; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v5
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v4
+; GCN-HSA-NEXT: v_mov_b32_e32 v24, v4
+; GCN-HSA-NEXT: v_mov_b32_e32 v26, v5
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v2
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v7
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v6
+; GCN-HSA-NEXT: v_mov_b32_e32 v20, v6
+; GCN-HSA-NEXT: v_mov_b32_e32 v22, v7
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v0
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, v0
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, v1
+; GCN-HSA-NEXT: v_mov_b32_e32 v16, v2
+; GCN-HSA-NEXT: v_mov_b32_e32 v18, v3
+; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v11
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v10
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v9
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v8
+; GCN-HSA-NEXT: v_mov_b32_e32 v28, v8
+; GCN-HSA-NEXT: v_mov_b32_e32 v30, v9
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, v10
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, v11
+; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v15
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v14
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v35, 31, v13
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v33, 31, v12
+; GCN-HSA-NEXT: v_mov_b32_e32 v32, v12
+; GCN-HSA-NEXT: v_mov_b32_e32 v34, v13
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, v14
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, v15
+; GCN-HSA-NEXT: global_store_dwordx4 v36, v[24:27], s[0:1] offset:96
+; GCN-HSA-NEXT: global_store_dwordx4 v36, v[20:23], s[0:1] offset:112
+; GCN-HSA-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:64
+; GCN-HSA-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:80
+; GCN-HSA-NEXT: global_store_dwordx4 v36, v[28:31], s[0:1] offset:32
+; GCN-HSA-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] offset:48
+; GCN-HSA-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1]
+; GCN-HSA-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:16
+; GCN-HSA-NEXT: s_endpgm
%ld = load <16 x i32>, ptr addrspace(1) %in
%ext = sext <16 x i32> %ld to <16 x i64>
store <16 x i64> %ext, ptr addrspace(1) %out
@@ -3195,26 +3141,26 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 64
-; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1]
+; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1]
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 48
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s5
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s4
-; GCNX3-HSA-NEXT: flat_load_dwordx4 v[16:19], v[12:13]
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4
+; GCNX3-HSA-NEXT: flat_load_dwordx4 v[16:19], v[8:9]
; GCNX3-HSA-NEXT: s_add_u32 s6, s2, 32
; GCNX3-HSA-NEXT: s_addc_u32 s7, s3, 0
; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16
; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s7
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s7
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s6
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s6
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
+; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v37, s1
@@ -3281,26 +3227,26 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[34:35], v[24:27]
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[36:37], v[28:31]
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(10)
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v11
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v10
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v9
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v8
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, v8
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, v9
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, v10
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, v11
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v15
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v14
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v13
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v12
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, v12
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, v13
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, v14
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, v15
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(9)
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v5
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v4
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v4
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v5
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v5
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v4
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, v4
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, v5
; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x90
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27]
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23]
-; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[8:11]
+; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[12:15]
; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60
@@ -3311,58 +3257,58 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[23:26]
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12)
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v17
; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v16
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v16
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, v17
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s3
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s2
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s3
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x70
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v17
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, v17
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCNX3-HSA-NEXT: flat_store_dwordx4 v[16:17], v[4:7]
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s3
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s2
+; GCNX3-HSA-NEXT: flat_store_dwordx4 v[15:16], v[4:7]
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s3
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64
; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v19
; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v18
; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, v18
; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, v19
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCNX3-HSA-NEXT: flat_store_dwordx4 v[16:17], v[23:26]
+; GCNX3-HSA-NEXT: flat_store_dwordx4 v[15:16], v[23:26]
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12)
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v12
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v12
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s3
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s2
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v9
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v8
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, v8
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, v9
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x50
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v13
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v13
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCNX3-HSA-NEXT: flat_store_dwordx4 v[11:12], v[16:19]
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s3
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v15
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v14
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v14
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, v15
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s2
+; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[15:18]
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v1
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v0
-; GCNX3-HSA-NEXT: flat_store_dwordx4 v[11:12], v[4:7]
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, v1
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, v0
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v1
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v0
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v11
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, v11
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, v0
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, v1
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48
-; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[7:10]
+; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14]
; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v22, 31, v3
; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v2
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v10
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v10
; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, v3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s1
+; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[19:22]
; GCNX3-HSA-NEXT: s_endpgm
;
@@ -3999,13 +3945,13 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[16:19], v[0:1]
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s8
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s9
-; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1]
+; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1]
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s4
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s7
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s7
; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s5
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s6
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s6
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
-; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
+; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, 0
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, v1
@@ -4090,29 +4036,29 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v19
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3]
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12)
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v8
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v9
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v12
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v13
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s3
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[30:31], v[0:3]
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v10
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v11
-; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v14
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v15
+; GCNX3-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
+; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12)
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v8
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v9
; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x50
-; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12)
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v12
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v13
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v14
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v15
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v10
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v11
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll
index 6f21df3a06ce7..fab5d386446d3 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll
@@ -28,28 +28,29 @@ define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, ptr addrspace(1
; GCN-NEXT: ; %bb.1: ; %atomic
; GCN-NEXT: s_mov_b32 s8, s10
; GCN-NEXT: s_mov_b32 s9, s10
-; GCN-NEXT: buffer_load_dword v5, v[1:2], s[8:11], 0 addr64 offset:400
+; GCN-NEXT: buffer_load_dword v4, v[1:2], s[8:11], 0 addr64 offset:400
; GCN-NEXT: s_load_dword s4, s[4:5], 0xf
; GCN-NEXT: s_mov_b64 s[2:3], 0
; GCN-NEXT: .LBB0_2: ; %atomicrmw.start
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_max_i32_e32 v4, s4, v5
-; GCN-NEXT: v_mov_b32_e32 v3, v4
-; GCN-NEXT: v_mov_b32_e32 v4, v5
-; GCN-NEXT: buffer_atomic_cmpswap v[3:4], v[1:2], s[8:11], 0 addr64 offset:400 glc
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_max_i32_e32 v3, s4, v4
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v6, v4
+; GCN-NEXT: v_mov_b32_e32 v5, v3
+; GCN-NEXT: buffer_atomic_cmpswap v[5:6], v[1:2], s[8:11], 0 addr64 offset:400 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_wbinvl1
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
; GCN-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GCN-NEXT: v_mov_b32_e32 v5, v3
+; GCN-NEXT: v_mov_b32_e32 v4, v5
; GCN-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN-NEXT: s_cbranch_execnz .LBB0_2
; GCN-NEXT: ; %bb.3: ; %atomicrmw.end
; GCN-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v5, off, s[0:3], 0
; GCN-NEXT: .LBB0_4: ; %exit
; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index 9b4693f61147a..f5bbf8a02e980 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -2653,38 +2653,38 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a,
;
; VI-LABEL: s_mul_i128:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4c
-; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x7c
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
+; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4c
+; VI-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x7c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: v_mov_b32_e32 v5, 0
-; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mad_u64_u32 v[2:3], s[12:13], s8, v0, 0
-; VI-NEXT: s_mul_i32 s3, s8, s3
-; VI-NEXT: v_mov_b32_e32 v6, s8
-; VI-NEXT: v_add_u32_e32 v3, vcc, s3, v3
-; VI-NEXT: s_mul_i32 s12, s9, s2
-; VI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s0, v6, 0
-; VI-NEXT: v_add_u32_e32 v3, vcc, s12, v3
+; VI-NEXT: v_mov_b32_e32 v0, s10
+; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s12, v0, 0
+; VI-NEXT: s_mul_i32 s4, s12, s11
+; VI-NEXT: v_mov_b32_e32 v6, s12
+; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v3
+; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s8, v6, 0
+; VI-NEXT: s_mul_i32 s6, s13, s10
+; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3
; VI-NEXT: v_mov_b32_e32 v4, v1
-; VI-NEXT: v_mad_u64_u32 v[6:7], s[2:3], s1, v6, v[4:5]
-; VI-NEXT: v_mov_b32_e32 v8, s0
-; VI-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s10, v8, v[2:3]
+; VI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s9, v6, v[4:5]
+; VI-NEXT: v_mov_b32_e32 v8, s8
+; VI-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s14, v8, v[2:3]
; VI-NEXT: v_mov_b32_e32 v4, v6
-; VI-NEXT: v_mov_b32_e32 v6, s9
-; VI-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s0, v6, v[4:5]
-; VI-NEXT: s_mul_i32 s8, s11, s0
-; VI-NEXT: v_add_u32_e32 v4, vcc, s8, v9
+; VI-NEXT: v_mov_b32_e32 v6, s13
+; VI-NEXT: v_mad_u64_u32 v[1:2], s[4:5], s8, v6, v[4:5]
+; VI-NEXT: s_mul_i32 s6, s15, s8
+; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v9
; VI-NEXT: v_add_u32_e32 v2, vcc, v7, v2
-; VI-NEXT: v_addc_u32_e64 v3, s[2:3], 0, 0, vcc
-; VI-NEXT: s_mul_i32 s8, s10, s1
-; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s1, v6, v[2:3]
-; VI-NEXT: v_add_u32_e32 v4, vcc, s8, v4
+; VI-NEXT: v_addc_u32_e64 v3, s[4:5], 0, 0, vcc
+; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s9, v6, v[2:3]
+; VI-NEXT: s_mul_i32 s6, s14, s9
+; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v8
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_mul_i128:
diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
index 5452c80a373e6..ad3f36dc3f216 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
@@ -813,17 +813,16 @@ bb7:
define amdgpu_kernel void @test_umul_i24(ptr addrspace(1) %out, i32 %arg) {
; SI-LABEL: test_umul_i24:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dword s1, s[4:5], 0xb
+; SI-NEXT: s_load_dword s2, s[4:5], 0xb
; SI-NEXT: v_mov_b32_e32 v0, 0xff803fe1
-; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_lshr_b32 s1, s1, 9
-; SI-NEXT: v_mul_hi_u32 v0, s1, v0
-; SI-NEXT: s_mul_i32 s1, s1, 0xff803fe1
-; SI-NEXT: v_alignbit_b32 v0, v0, s1, 1
+; SI-NEXT: s_lshr_b32 s2, s2, 9
+; SI-NEXT: v_mul_hi_u32 v0, s2, v0
+; SI-NEXT: s_mul_i32 s2, s2, 0xff803fe1
+; SI-NEXT: v_alignbit_b32 v0, v0, s2, 1
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: s_mov_b32 s1, s0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -836,26 +835,24 @@ define amdgpu_kernel void @test_umul_i24(ptr addrspace(1) %out, i32 %arg) {
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s0, s0, 9
; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s0, v0, 0
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: s_mov_b32 s1, s0
+; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: v_alignbit_b32 v0, v1, v0, 1
-; VI-NEXT: s_nop 1
+; VI-NEXT: s_nop 2
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umul_i24:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dword s1, s[4:5], 0x2c
-; GFX9-NEXT: s_mov_b32 s0, 0
+; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshr_b32 s1, s1, 9
-; GFX9-NEXT: s_mul_hi_u32 s4, s1, 0xff803fe1
-; GFX9-NEXT: s_mul_i32 s1, s1, 0xff803fe1
-; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: s_lshr_b32 s2, s2, 9
+; GFX9-NEXT: s_mul_hi_u32 s4, s2, 0xff803fe1
+; GFX9-NEXT: s_mul_i32 s2, s2, 0xff803fe1
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1
-; GFX9-NEXT: s_mov_b32 s1, s0
+; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll
index 26a4a6743cffa..d345b57d3d08b 100644
--- a/llvm/test/CodeGen/AMDGPU/select.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll
@@ -1067,29 +1067,29 @@ define amdgpu_kernel void @select_v2f16_imm_c(
;
; VI-LABEL: select_v2f16_imm_c:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s15, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0
; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
-; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_dword v2, off, s[8:11], 0
; VI-NEXT: v_mov_b32_e32 v3, 0x3800
; VI-NEXT: v_mov_b32_e32 v4, 0x3900
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -1102,7 +1102,7 @@ define amdgpu_kernel void @select_v2f16_imm_c(
; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: select_v2f16_imm_c:
@@ -1208,29 +1208,29 @@ define amdgpu_kernel void @select_v2f16_imm_d(
;
; VI-LABEL: select_v2f16_imm_d:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s15, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0
; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
-; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_dword v2, off, s[8:11], 0
; VI-NEXT: v_mov_b32_e32 v3, 0x3800
; VI-NEXT: v_mov_b32_e32 v4, 0x3900
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -1243,7 +1243,7 @@ define amdgpu_kernel void @select_v2f16_imm_d(
; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: select_v2f16_imm_d:
diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll
index c9efeeefdf2d8..f213b9a29bfe6 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.ll
@@ -870,28 +870,28 @@ define amdgpu_kernel void @shl_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in
define amdgpu_kernel void @shl_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: shl_v4i64:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: s_mov_b32 s10, s2
-; SI-NEXT: s_mov_b32 s11, s3
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s8, s6
-; SI-NEXT: s_mov_b32 s9, s7
+; SI-NEXT: s_mov_b32 s8, s2
+; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
; SI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0
; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
-; SI-NEXT: s_mov_b32 s0, s4
-; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v6
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v4
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshl_b64 v[9:10], v[9:10], v13
; SI-NEXT: v_lshl_b64 v[7:8], v[7:8], v11
-; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
+; SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: shl_v4i64:
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll
index acc193a9393c1..dc227f745aa9a 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll
@@ -1922,12 +1922,12 @@ define void @v_shuffle_v2i64_v8i64__9_0(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:17]
+; GFX900-NEXT: ; def v[16:31]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, v0
-; GFX900-NEXT: v_mov_b32_e32 v7, v1
+; GFX900-NEXT: v_mov_b32_e32 v4, v16
+; GFX900-NEXT: v_mov_b32_e32 v5, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1978,12 +1978,12 @@ define void @v_shuffle_v2i64_v8i64__10_0(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:17]
+; GFX900-NEXT: ; def v[16:31]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v8, v0
-; GFX900-NEXT: v_mov_b32_e32 v9, v1
+; GFX900-NEXT: v_mov_b32_e32 v6, v16
+; GFX900-NEXT: v_mov_b32_e32 v7, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2034,12 +2034,12 @@ define void @v_shuffle_v2i64_v8i64__11_0(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:17]
+; GFX900-NEXT: ; def v[16:31]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v10, v0
-; GFX900-NEXT: v_mov_b32_e32 v11, v1
+; GFX900-NEXT: v_mov_b32_e32 v8, v16
+; GFX900-NEXT: v_mov_b32_e32 v9, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2090,12 +2090,12 @@ define void @v_shuffle_v2i64_v8i64__12_0(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:17]
+; GFX900-NEXT: ; def v[16:31]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v12, v0
-; GFX900-NEXT: v_mov_b32_e32 v13, v1
+; GFX900-NEXT: v_mov_b32_e32 v10, v16
+; GFX900-NEXT: v_mov_b32_e32 v11, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2146,12 +2146,12 @@ define void @v_shuffle_v2i64_v8i64__13_0(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:17]
+; GFX900-NEXT: ; def v[16:31]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v14, v0
-; GFX900-NEXT: v_mov_b32_e32 v15, v1
+; GFX900-NEXT: v_mov_b32_e32 v12, v16
+; GFX900-NEXT: v_mov_b32_e32 v13, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2202,12 +2202,12 @@ define void @v_shuffle_v2i64_v8i64__14_0(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:17]
+; GFX900-NEXT: ; def v[16:31]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v16, v0
-; GFX900-NEXT: v_mov_b32_e32 v17, v1
+; GFX900-NEXT: v_mov_b32_e32 v14, v16
+; GFX900-NEXT: v_mov_b32_e32 v15, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2687,15 +2687,15 @@ define void @v_shuffle_v2i64_v8i64__9_1(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[14:29]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[4:19]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v8, v2
-; GFX900-NEXT: v_mov_b32_e32 v9, v3
+; GFX900-NEXT: v_mov_b32_e32 v4, v16
+; GFX900-NEXT: v_mov_b32_e32 v5, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2743,15 +2743,15 @@ define void @v_shuffle_v2i64_v8i64__10_1(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[14:29]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[4:19]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v10, v2
-; GFX900-NEXT: v_mov_b32_e32 v11, v3
+; GFX900-NEXT: v_mov_b32_e32 v6, v16
+; GFX900-NEXT: v_mov_b32_e32 v7, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2799,15 +2799,15 @@ define void @v_shuffle_v2i64_v8i64__11_1(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[14:29]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[4:19]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v12, v2
-; GFX900-NEXT: v_mov_b32_e32 v13, v3
+; GFX900-NEXT: v_mov_b32_e32 v8, v16
+; GFX900-NEXT: v_mov_b32_e32 v9, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2855,15 +2855,15 @@ define void @v_shuffle_v2i64_v8i64__12_1(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[14:29]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[4:19]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v14, v2
-; GFX900-NEXT: v_mov_b32_e32 v15, v3
+; GFX900-NEXT: v_mov_b32_e32 v10, v16
+; GFX900-NEXT: v_mov_b32_e32 v11, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2911,15 +2911,15 @@ define void @v_shuffle_v2i64_v8i64__13_1(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[14:29]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[4:19]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v16, v2
-; GFX900-NEXT: v_mov_b32_e32 v17, v3
+; GFX900-NEXT: v_mov_b32_e32 v12, v16
+; GFX900-NEXT: v_mov_b32_e32 v13, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2967,15 +2967,15 @@ define void @v_shuffle_v2i64_v8i64__14_1(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[14:29]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[4:19]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v18, v2
-; GFX900-NEXT: v_mov_b32_e32 v19, v3
+; GFX900-NEXT: v_mov_b32_e32 v14, v16
+; GFX900-NEXT: v_mov_b32_e32 v15, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[16:19], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -3455,15 +3455,15 @@ define void @v_shuffle_v2i64_v8i64__9_2(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[12:27]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[6:21]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v10, v4
-; GFX900-NEXT: v_mov_b32_e32 v11, v5
+; GFX900-NEXT: v_mov_b32_e32 v4, v16
+; GFX900-NEXT: v_mov_b32_e32 v5, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -3511,15 +3511,15 @@ define void @v_shuffle_v2i64_v8i64__10_2(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[12:27]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[6:21]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v12, v4
-; GFX900-NEXT: v_mov_b32_e32 v13, v5
+; GFX900-NEXT: v_mov_b32_e32 v6, v16
+; GFX900-NEXT: v_mov_b32_e32 v7, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -3567,15 +3567,15 @@ define void @v_shuffle_v2i64_v8i64__11_2(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[12:27]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[6:21]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v14, v4
-; GFX900-NEXT: v_mov_b32_e32 v15, v5
+; GFX900-NEXT: v_mov_b32_e32 v8, v16
+; GFX900-NEXT: v_mov_b32_e32 v9, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -3623,15 +3623,15 @@ define void @v_shuffle_v2i64_v8i64__12_2(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[12:27]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[6:21]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v16, v4
-; GFX900-NEXT: v_mov_b32_e32 v17, v5
+; GFX900-NEXT: v_mov_b32_e32 v10, v16
+; GFX900-NEXT: v_mov_b32_e32 v11, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -3679,15 +3679,15 @@ define void @v_shuffle_v2i64_v8i64__13_2(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[12:27]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[6:21]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v18, v4
-; GFX900-NEXT: v_mov_b32_e32 v19, v5
+; GFX900-NEXT: v_mov_b32_e32 v12, v16
+; GFX900-NEXT: v_mov_b32_e32 v13, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[16:19], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -3735,15 +3735,15 @@ define void @v_shuffle_v2i64_v8i64__14_2(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[12:27]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[6:21]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v20, v4
-; GFX900-NEXT: v_mov_b32_e32 v21, v5
+; GFX900-NEXT: v_mov_b32_e32 v14, v16
+; GFX900-NEXT: v_mov_b32_e32 v15, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[18:21], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -4223,15 +4223,15 @@ define void @v_shuffle_v2i64_v8i64__9_3(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[10:25]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[8:23]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v12, v6
-; GFX900-NEXT: v_mov_b32_e32 v13, v7
+; GFX900-NEXT: v_mov_b32_e32 v4, v16
+; GFX900-NEXT: v_mov_b32_e32 v5, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -4279,15 +4279,15 @@ define void @v_shuffle_v2i64_v8i64__10_3(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[10:25]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[8:23]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v14, v6
-; GFX900-NEXT: v_mov_b32_e32 v15, v7
+; GFX900-NEXT: v_mov_b32_e32 v6, v16
+; GFX900-NEXT: v_mov_b32_e32 v7, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -4335,15 +4335,15 @@ define void @v_shuffle_v2i64_v8i64__11_3(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[10:25]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[8:23]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v16, v6
-; GFX900-NEXT: v_mov_b32_e32 v17, v7
+; GFX900-NEXT: v_mov_b32_e32 v8, v16
+; GFX900-NEXT: v_mov_b32_e32 v9, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -4391,15 +4391,15 @@ define void @v_shuffle_v2i64_v8i64__12_3(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[10:25]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[8:23]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v18, v6
-; GFX900-NEXT: v_mov_b32_e32 v19, v7
+; GFX900-NEXT: v_mov_b32_e32 v10, v16
+; GFX900-NEXT: v_mov_b32_e32 v11, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[16:19], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -4447,15 +4447,15 @@ define void @v_shuffle_v2i64_v8i64__13_3(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[10:25]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[8:23]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v20, v6
-; GFX900-NEXT: v_mov_b32_e32 v21, v7
+; GFX900-NEXT: v_mov_b32_e32 v12, v16
+; GFX900-NEXT: v_mov_b32_e32 v13, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[18:21], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -4503,15 +4503,15 @@ define void @v_shuffle_v2i64_v8i64__14_3(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[10:25]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[8:23]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v22, v6
-; GFX900-NEXT: v_mov_b32_e32 v23, v7
+; GFX900-NEXT: v_mov_b32_e32 v14, v16
+; GFX900-NEXT: v_mov_b32_e32 v15, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[20:23], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -4991,15 +4991,15 @@ define void @v_shuffle_v2i64_v8i64__9_4(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[8:23]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[10:25]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v14, v8
-; GFX900-NEXT: v_mov_b32_e32 v15, v9
+; GFX900-NEXT: v_mov_b32_e32 v4, v16
+; GFX900-NEXT: v_mov_b32_e32 v5, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -5047,15 +5047,15 @@ define void @v_shuffle_v2i64_v8i64__10_4(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[8:23]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[10:25]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v16, v8
-; GFX900-NEXT: v_mov_b32_e32 v17, v9
+; GFX900-NEXT: v_mov_b32_e32 v6, v16
+; GFX900-NEXT: v_mov_b32_e32 v7, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -5103,15 +5103,15 @@ define void @v_shuffle_v2i64_v8i64__11_4(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[8:23]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[10:25]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v18, v8
-; GFX900-NEXT: v_mov_b32_e32 v19, v9
+; GFX900-NEXT: v_mov_b32_e32 v8, v16
+; GFX900-NEXT: v_mov_b32_e32 v9, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[16:19], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -5159,15 +5159,15 @@ define void @v_shuffle_v2i64_v8i64__12_4(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[8:23]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[10:25]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v20, v8
-; GFX900-NEXT: v_mov_b32_e32 v21, v9
+; GFX900-NEXT: v_mov_b32_e32 v10, v16
+; GFX900-NEXT: v_mov_b32_e32 v11, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[18:21], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -5215,15 +5215,15 @@ define void @v_shuffle_v2i64_v8i64__13_4(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[8:23]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[10:25]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v22, v8
-; GFX900-NEXT: v_mov_b32_e32 v23, v9
+; GFX900-NEXT: v_mov_b32_e32 v12, v16
+; GFX900-NEXT: v_mov_b32_e32 v13, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[20:23], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -5271,15 +5271,15 @@ define void @v_shuffle_v2i64_v8i64__14_4(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[8:23]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[10:25]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v24, v8
-; GFX900-NEXT: v_mov_b32_e32 v25, v9
+; GFX900-NEXT: v_mov_b32_e32 v14, v16
+; GFX900-NEXT: v_mov_b32_e32 v15, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[22:25], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -5759,15 +5759,15 @@ define void @v_shuffle_v2i64_v8i64__9_5(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[6:21]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[12:27]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v16, v10
-; GFX900-NEXT: v_mov_b32_e32 v17, v11
+; GFX900-NEXT: v_mov_b32_e32 v4, v16
+; GFX900-NEXT: v_mov_b32_e32 v5, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -5815,15 +5815,15 @@ define void @v_shuffle_v2i64_v8i64__10_5(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[6:21]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[12:27]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v18, v10
-; GFX900-NEXT: v_mov_b32_e32 v19, v11
+; GFX900-NEXT: v_mov_b32_e32 v6, v16
+; GFX900-NEXT: v_mov_b32_e32 v7, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[16:19], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -5871,15 +5871,15 @@ define void @v_shuffle_v2i64_v8i64__11_5(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[6:21]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[12:27]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v20, v10
-; GFX900-NEXT: v_mov_b32_e32 v21, v11
+; GFX900-NEXT: v_mov_b32_e32 v8, v16
+; GFX900-NEXT: v_mov_b32_e32 v9, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[18:21], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -5927,15 +5927,15 @@ define void @v_shuffle_v2i64_v8i64__12_5(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[6:21]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[12:27]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v22, v10
-; GFX900-NEXT: v_mov_b32_e32 v23, v11
+; GFX900-NEXT: v_mov_b32_e32 v10, v16
+; GFX900-NEXT: v_mov_b32_e32 v11, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[20:23], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -5983,15 +5983,15 @@ define void @v_shuffle_v2i64_v8i64__13_5(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[6:21]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[12:27]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v24, v10
-; GFX900-NEXT: v_mov_b32_e32 v25, v11
+; GFX900-NEXT: v_mov_b32_e32 v12, v16
+; GFX900-NEXT: v_mov_b32_e32 v13, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[22:25], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -6039,15 +6039,15 @@ define void @v_shuffle_v2i64_v8i64__14_5(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[6:21]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[12:27]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v26, v10
-; GFX900-NEXT: v_mov_b32_e32 v27, v11
+; GFX900-NEXT: v_mov_b32_e32 v14, v16
+; GFX900-NEXT: v_mov_b32_e32 v15, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[24:27], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -6527,15 +6527,15 @@ define void @v_shuffle_v2i64_v8i64__9_6(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[4:19]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[14:29]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v18, v12
-; GFX900-NEXT: v_mov_b32_e32 v19, v13
+; GFX900-NEXT: v_mov_b32_e32 v4, v16
+; GFX900-NEXT: v_mov_b32_e32 v5, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[16:19], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -6583,15 +6583,15 @@ define void @v_shuffle_v2i64_v8i64__10_6(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[4:19]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[14:29]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v20, v12
-; GFX900-NEXT: v_mov_b32_e32 v21, v13
+; GFX900-NEXT: v_mov_b32_e32 v6, v16
+; GFX900-NEXT: v_mov_b32_e32 v7, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[18:21], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -6639,15 +6639,15 @@ define void @v_shuffle_v2i64_v8i64__11_6(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[4:19]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[14:29]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v22, v12
-; GFX900-NEXT: v_mov_b32_e32 v23, v13
+; GFX900-NEXT: v_mov_b32_e32 v8, v16
+; GFX900-NEXT: v_mov_b32_e32 v9, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[20:23], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -6695,15 +6695,15 @@ define void @v_shuffle_v2i64_v8i64__12_6(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[4:19]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[14:29]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v24, v12
-; GFX900-NEXT: v_mov_b32_e32 v25, v13
+; GFX900-NEXT: v_mov_b32_e32 v10, v16
+; GFX900-NEXT: v_mov_b32_e32 v11, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[22:25], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -6751,15 +6751,15 @@ define void @v_shuffle_v2i64_v8i64__13_6(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[4:19]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[14:29]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v26, v12
-; GFX900-NEXT: v_mov_b32_e32 v27, v13
+; GFX900-NEXT: v_mov_b32_e32 v12, v16
+; GFX900-NEXT: v_mov_b32_e32 v13, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[24:27], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -6807,15 +6807,15 @@ define void @v_shuffle_v2i64_v8i64__14_6(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[4:19]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[14:29]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v28, v12
-; GFX900-NEXT: v_mov_b32_e32 v29, v13
+; GFX900-NEXT: v_mov_b32_e32 v14, v16
+; GFX900-NEXT: v_mov_b32_e32 v15, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[26:29], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -7295,15 +7295,15 @@ define void @v_shuffle_v2i64_v8i64__9_7(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[2:17]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[16:31]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v20, v14
-; GFX900-NEXT: v_mov_b32_e32 v21, v15
+; GFX900-NEXT: v_mov_b32_e32 v4, v16
+; GFX900-NEXT: v_mov_b32_e32 v5, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[18:21], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -7351,15 +7351,15 @@ define void @v_shuffle_v2i64_v8i64__10_7(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[2:17]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[16:31]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v22, v14
-; GFX900-NEXT: v_mov_b32_e32 v23, v15
+; GFX900-NEXT: v_mov_b32_e32 v6, v16
+; GFX900-NEXT: v_mov_b32_e32 v7, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[20:23], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -7407,15 +7407,15 @@ define void @v_shuffle_v2i64_v8i64__11_7(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[2:17]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[16:31]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v24, v14
-; GFX900-NEXT: v_mov_b32_e32 v25, v15
+; GFX900-NEXT: v_mov_b32_e32 v8, v16
+; GFX900-NEXT: v_mov_b32_e32 v9, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[22:25], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -7463,15 +7463,15 @@ define void @v_shuffle_v2i64_v8i64__12_7(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[2:17]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[16:31]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v26, v14
-; GFX900-NEXT: v_mov_b32_e32 v27, v15
+; GFX900-NEXT: v_mov_b32_e32 v10, v16
+; GFX900-NEXT: v_mov_b32_e32 v11, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[24:27], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -7519,15 +7519,15 @@ define void @v_shuffle_v2i64_v8i64__13_7(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[2:17]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[16:31]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v28, v14
-; GFX900-NEXT: v_mov_b32_e32 v29, v15
+; GFX900-NEXT: v_mov_b32_e32 v12, v16
+; GFX900-NEXT: v_mov_b32_e32 v13, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[26:29], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -7575,15 +7575,15 @@ define void @v_shuffle_v2i64_v8i64__14_7(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:15]
+; GFX900-NEXT: ; def v[2:17]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[16:31]
+; GFX900-NEXT: ; def v[0:15]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v30, v14
-; GFX900-NEXT: v_mov_b32_e32 v31, v15
+; GFX900-NEXT: v_mov_b32_e32 v14, v16
+; GFX900-NEXT: v_mov_b32_e32 v15, v17
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: global_store_dwordx4 v0, v[28:31], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -8695,15 +8695,15 @@ define void @v_shuffle_v2i64_v8i64__7_9(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[16:31]
+; GFX90A-NEXT: ; def v[2:17]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v32, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:15]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v16, v14
-; GFX90A-NEXT: v_mov_b32_e32 v17, v15
-; GFX90A-NEXT: global_store_dwordx4 v32, v[16:19], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v18, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v16
+; GFX90A-NEXT: v_mov_b32_e32 v1, v17
+; GFX90A-NEXT: global_store_dwordx4 v18, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -8711,16 +8711,16 @@ define void @v_shuffle_v2i64_v8i64__7_9(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[16:31]
+; GFX942-NEXT: ; def v[2:17]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v32, 0
+; GFX942-NEXT: v_mov_b32_e32 v18, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:15]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v16, v14
-; GFX942-NEXT: v_mov_b32_e32 v17, v15
-; GFX942-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v16
+; GFX942-NEXT: v_mov_b32_e32 v1, v17
+; GFX942-NEXT: global_store_dwordx4 v18, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <8 x i64> asm "; def $0", "=v"()
@@ -9499,15 +9499,15 @@ define void @v_shuffle_v2i64_v8i64__7_10(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[16:31]
+; GFX90A-NEXT: ; def v[2:17]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v32, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:15]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v18, v14
-; GFX90A-NEXT: v_mov_b32_e32 v19, v15
-; GFX90A-NEXT: global_store_dwordx4 v32, v[18:21], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v18, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v16
+; GFX90A-NEXT: v_mov_b32_e32 v3, v17
+; GFX90A-NEXT: global_store_dwordx4 v18, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -9515,16 +9515,16 @@ define void @v_shuffle_v2i64_v8i64__7_10(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[16:31]
+; GFX942-NEXT: ; def v[2:17]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v32, 0
+; GFX942-NEXT: v_mov_b32_e32 v18, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:15]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v18, v14
-; GFX942-NEXT: v_mov_b32_e32 v19, v15
-; GFX942-NEXT: global_store_dwordx4 v32, v[18:21], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, v16
+; GFX942-NEXT: v_mov_b32_e32 v3, v17
+; GFX942-NEXT: global_store_dwordx4 v18, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <8 x i64> asm "; def $0", "=v"()
@@ -10303,15 +10303,15 @@ define void @v_shuffle_v2i64_v8i64__7_11(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[16:31]
+; GFX90A-NEXT: ; def v[2:17]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v32, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:15]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v20, v14
-; GFX90A-NEXT: v_mov_b32_e32 v21, v15
-; GFX90A-NEXT: global_store_dwordx4 v32, v[20:23], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v18, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v16
+; GFX90A-NEXT: v_mov_b32_e32 v5, v17
+; GFX90A-NEXT: global_store_dwordx4 v18, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -10319,16 +10319,16 @@ define void @v_shuffle_v2i64_v8i64__7_11(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[16:31]
+; GFX942-NEXT: ; def v[2:17]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v32, 0
+; GFX942-NEXT: v_mov_b32_e32 v18, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:15]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v20, v14
-; GFX942-NEXT: v_mov_b32_e32 v21, v15
-; GFX942-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v16
+; GFX942-NEXT: v_mov_b32_e32 v5, v17
+; GFX942-NEXT: global_store_dwordx4 v18, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <8 x i64> asm "; def $0", "=v"()
@@ -11107,15 +11107,15 @@ define void @v_shuffle_v2i64_v8i64__7_12(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[16:31]
+; GFX90A-NEXT: ; def v[2:17]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v32, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:15]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v22, v14
-; GFX90A-NEXT: v_mov_b32_e32 v23, v15
-; GFX90A-NEXT: global_store_dwordx4 v32, v[22:25], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v18, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v16
+; GFX90A-NEXT: v_mov_b32_e32 v7, v17
+; GFX90A-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -11123,16 +11123,16 @@ define void @v_shuffle_v2i64_v8i64__7_12(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[16:31]
+; GFX942-NEXT: ; def v[2:17]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v32, 0
+; GFX942-NEXT: v_mov_b32_e32 v18, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:15]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v22, v14
-; GFX942-NEXT: v_mov_b32_e32 v23, v15
-; GFX942-NEXT: global_store_dwordx4 v32, v[22:25], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v16
+; GFX942-NEXT: v_mov_b32_e32 v7, v17
+; GFX942-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <8 x i64> asm "; def $0", "=v"()
@@ -11911,15 +11911,15 @@ define void @v_shuffle_v2i64_v8i64__7_13(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[16:31]
+; GFX90A-NEXT: ; def v[2:17]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v32, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:15]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v24, v14
-; GFX90A-NEXT: v_mov_b32_e32 v25, v15
-; GFX90A-NEXT: global_store_dwordx4 v32, v[24:27], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v18, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v16
+; GFX90A-NEXT: v_mov_b32_e32 v9, v17
+; GFX90A-NEXT: global_store_dwordx4 v18, v[8:11], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -11927,16 +11927,16 @@ define void @v_shuffle_v2i64_v8i64__7_13(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[16:31]
+; GFX942-NEXT: ; def v[2:17]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v32, 0
+; GFX942-NEXT: v_mov_b32_e32 v18, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:15]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v24, v14
-; GFX942-NEXT: v_mov_b32_e32 v25, v15
-; GFX942-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v16
+; GFX942-NEXT: v_mov_b32_e32 v9, v17
+; GFX942-NEXT: global_store_dwordx4 v18, v[8:11], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <8 x i64> asm "; def $0", "=v"()
@@ -12715,15 +12715,15 @@ define void @v_shuffle_v2i64_v8i64__7_14(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[16:31]
+; GFX90A-NEXT: ; def v[2:17]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v32, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:15]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v26, v14
-; GFX90A-NEXT: v_mov_b32_e32 v27, v15
-; GFX90A-NEXT: global_store_dwordx4 v32, v[26:29], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v18, 0
+; GFX90A-NEXT: v_mov_b32_e32 v10, v16
+; GFX90A-NEXT: v_mov_b32_e32 v11, v17
+; GFX90A-NEXT: global_store_dwordx4 v18, v[10:13], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -12731,16 +12731,16 @@ define void @v_shuffle_v2i64_v8i64__7_14(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[16:31]
+; GFX942-NEXT: ; def v[2:17]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v32, 0
+; GFX942-NEXT: v_mov_b32_e32 v18, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:15]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v26, v14
-; GFX942-NEXT: v_mov_b32_e32 v27, v15
-; GFX942-NEXT: global_store_dwordx4 v32, v[26:29], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v10, v16
+; GFX942-NEXT: v_mov_b32_e32 v11, v17
+; GFX942-NEXT: global_store_dwordx4 v18, v[10:13], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <8 x i64> asm "; def $0", "=v"()
@@ -13519,15 +13519,15 @@ define void @v_shuffle_v2i64_v8i64__7_15(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[16:31]
+; GFX90A-NEXT: ; def v[2:17]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v32, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:15]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v28, v14
-; GFX90A-NEXT: v_mov_b32_e32 v29, v15
-; GFX90A-NEXT: global_store_dwordx4 v32, v[28:31], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v18, 0
+; GFX90A-NEXT: v_mov_b32_e32 v12, v16
+; GFX90A-NEXT: v_mov_b32_e32 v13, v17
+; GFX90A-NEXT: global_store_dwordx4 v18, v[12:15], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -13535,16 +13535,16 @@ define void @v_shuffle_v2i64_v8i64__7_15(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[16:31]
+; GFX942-NEXT: ; def v[2:17]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v32, 0
+; GFX942-NEXT: v_mov_b32_e32 v18, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:15]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v28, v14
-; GFX942-NEXT: v_mov_b32_e32 v29, v15
-; GFX942-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v12, v16
+; GFX942-NEXT: v_mov_b32_e32 v13, v17
+; GFX942-NEXT: global_store_dwordx4 v18, v[12:15], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <8 x i64> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
index 3a872a6080952..a6117578b399b 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
@@ -4672,15 +4672,14 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v5, 13, v0
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x80
+; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13
; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v2, vcc, s2, v5
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s3
; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v0, vcc
-; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
+; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, 0x80, v2
; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968
-; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13
; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
; GFX9-FLATSCR-NEXT: s_mov_b32 s4, 4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
@@ -4710,12 +4709,11 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4080
+; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v2
; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x74
+; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x100
-; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
-; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968
; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x84
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
@@ -4745,12 +4743,11 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4080
+; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, 0x180, v2
; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf4
+; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x180
-; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
-; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968
; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x104
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
@@ -4780,12 +4777,11 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4080
+; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, 0x200, v2
; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x174
+; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x200
-; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
-; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968
; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x184
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
@@ -4815,12 +4811,11 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4080
+; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, 0x280, v2
; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1f4
+; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x280
-; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
-; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968
; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x204
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
@@ -4850,12 +4845,11 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4080
+; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, 0x300, v2
; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x274
+; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x300
-; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
-; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968
; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x284
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
@@ -4885,12 +4879,11 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4080
+; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, 0x380, v2
; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x2f4
+; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x380
-; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
-; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968
; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x304
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
@@ -4920,12 +4913,11 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4080
+; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, 0x400, v2
; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x374
+; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x400
-; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
-; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968
; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x384
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
@@ -9779,7 +9771,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT: v_lshlrev_b32_e32 v5, 8, v0
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:240
-; GFX6-NEXT: s_mov_b32 s2, 0x86600
+; GFX6-NEXT: s_mov_b32 s2, 0x86a00
; GFX6-NEXT: s_mov_b64 s[8:9], exec
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
@@ -9788,7 +9780,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:224
-; GFX6-NEXT: s_mov_b32 s2, 0x86200
+; GFX6-NEXT: s_mov_b32 s2, 0x86600
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
@@ -9796,7 +9788,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:208
-; GFX6-NEXT: s_mov_b32 s2, 0x85e00
+; GFX6-NEXT: s_mov_b32 s2, 0x86200
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
@@ -9804,7 +9796,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:192
-; GFX6-NEXT: s_mov_b32 s2, 0x85a00
+; GFX6-NEXT: s_mov_b32 s2, 0x85e00
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
@@ -9812,7 +9804,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:176
-; GFX6-NEXT: s_mov_b32 s2, 0x85600
+; GFX6-NEXT: s_mov_b32 s2, 0x85a00
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
@@ -9820,7 +9812,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:160
-; GFX6-NEXT: s_mov_b32 s2, 0x85200
+; GFX6-NEXT: s_mov_b32 s2, 0x85600
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
@@ -9828,7 +9820,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:144
-; GFX6-NEXT: s_mov_b32 s2, 0x84e00
+; GFX6-NEXT: s_mov_b32 s2, 0x85200
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
@@ -9836,7 +9828,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:128
-; GFX6-NEXT: s_mov_b32 s2, 0x84a00
+; GFX6-NEXT: s_mov_b32 s2, 0x84e00
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
@@ -9844,7 +9836,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:112
-; GFX6-NEXT: s_mov_b32 s2, 0x84600
+; GFX6-NEXT: s_mov_b32 s2, 0x84a00
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
@@ -9852,7 +9844,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:96
-; GFX6-NEXT: s_mov_b32 s2, 0x84200
+; GFX6-NEXT: s_mov_b32 s2, 0x84600
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
@@ -9860,7 +9852,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:80
-; GFX6-NEXT: s_mov_b32 s2, 0x83e00
+; GFX6-NEXT: s_mov_b32 s2, 0x84200
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
@@ -9906,7 +9898,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: s_mov_b64 exec, s[8:9]
; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:48
-; GFX6-NEXT: s_mov_b32 s0, 0x86a00
+; GFX6-NEXT: s_mov_b32 s0, 0x83e00
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 13, v0
; GFX6-NEXT: v_add_i32_e32 v4, vcc, 16, v4
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -10187,13 +10179,13 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: s_mov_b64 exec, s[4:5]
-; GFX6-NEXT: s_mov_b32 s0, 0x86600
+; GFX6-NEXT: s_mov_b32 s0, 0x86a00
; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b64 s[38:39], s[2:3]
-; GFX6-NEXT: s_mov_b32 s0, 0x86200
+; GFX6-NEXT: s_mov_b32 s0, 0x86600
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:240
; GFX6-NEXT: s_waitcnt expcnt(0)
@@ -10201,7 +10193,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT: s_mov_b32 s0, 0x85e00
+; GFX6-NEXT: s_mov_b32 s0, 0x86200
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:224
; GFX6-NEXT: s_waitcnt expcnt(0)
@@ -10209,7 +10201,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT: s_mov_b32 s0, 0x85a00
+; GFX6-NEXT: s_mov_b32 s0, 0x85e00
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:208
; GFX6-NEXT: s_waitcnt expcnt(0)
@@ -10217,7 +10209,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT: s_mov_b32 s0, 0x85600
+; GFX6-NEXT: s_mov_b32 s0, 0x85a00
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:192
; GFX6-NEXT: s_waitcnt expcnt(0)
@@ -10225,7 +10217,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT: s_mov_b32 s0, 0x85200
+; GFX6-NEXT: s_mov_b32 s0, 0x85600
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:176
; GFX6-NEXT: s_waitcnt expcnt(0)
@@ -10233,7 +10225,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT: s_mov_b32 s0, 0x84e00
+; GFX6-NEXT: s_mov_b32 s0, 0x85200
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:160
; GFX6-NEXT: s_waitcnt expcnt(0)
@@ -10241,7 +10233,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT: s_mov_b32 s0, 0x84a00
+; GFX6-NEXT: s_mov_b32 s0, 0x84e00
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:144
; GFX6-NEXT: s_waitcnt expcnt(0)
@@ -10249,7 +10241,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT: s_mov_b32 s0, 0x84600
+; GFX6-NEXT: s_mov_b32 s0, 0x84a00
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:128
; GFX6-NEXT: s_waitcnt expcnt(0)
@@ -10257,7 +10249,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT: s_mov_b32 s0, 0x84200
+; GFX6-NEXT: s_mov_b32 s0, 0x84600
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:112
; GFX6-NEXT: s_waitcnt expcnt(0)
@@ -10265,7 +10257,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT: s_mov_b32 s0, 0x83e00
+; GFX6-NEXT: s_mov_b32 s0, 0x84200
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:96
; GFX6-NEXT: s_waitcnt expcnt(0)
@@ -10281,7 +10273,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT: s_mov_b32 s0, 0x86a00
+; GFX6-NEXT: s_mov_b32 s0, 0x83e00
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:64
; GFX6-NEXT: s_waitcnt expcnt(0)
@@ -10324,59 +10316,58 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 16
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[11:14], v5, s[38:39] offset:224
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:208
-; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2030
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:224
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2040
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:192
-; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:208
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2030
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[19:22], v5, s[38:39] offset:192
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[15:18], v5, s[38:39] offset:176
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:160
-; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2040
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:144
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:144
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:128
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:112
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2010
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[19:22], v5, s[38:39]
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:112
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20c0
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(2)
-; GFX9-FLATSCR-NEXT: v_lshl_add_u32 v4, v19, 13, v4
-; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:96
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:96
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20b0
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:80
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:80
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20a0
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:64
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:64
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2090
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:48
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:48
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2080
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:32
-; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[11:14], v5, s[38:39] offset:32
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:16
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2060
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: s_nop 0
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[7:10], v5, s[38:39]
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 1
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: v_lshl_add_u32 v4, v7, 13, v4
+; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; GFX9-FLATSCR-NEXT: scratch_store_dword v4, v6, off
; GFX9-FLATSCR-NEXT: ;;#ASMSTART
; GFX9-FLATSCR-NEXT: ; def s[0:7]
@@ -10406,26 +10397,26 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX9-FLATSCR-NEXT: ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[38:39]
; GFX9-FLATSCR-NEXT: ;;#ASMEND
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20d0
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20e0
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20e0
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20f0
-; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, v19
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v20
-; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v21
-; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v22
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, v11
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v12
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v13
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v14
; GFX9-FLATSCR-NEXT: ;;#ASMSTART
; GFX9-FLATSCR-NEXT: ;;#ASMEND
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[11:14], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20e0
-; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v3
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[15:18], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[19:22], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20d0
-; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v2
-; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v1
-; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v0
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[15:18], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, v3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, v2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, v1
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, v0
; GFX9-FLATSCR-NEXT: ;;#ASMSTART
; GFX9-FLATSCR-NEXT: ;;#ASMEND
; GFX9-FLATSCR-NEXT: ;;#ASMSTART
@@ -10441,53 +10432,53 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX9-FLATSCR-NEXT: .LBB1_2: ; %ret
; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20c0
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20b0
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:112
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:112
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20a0
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:96
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:96
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2090
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:80
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:80
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2080
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:64
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:48
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:64
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2060
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:32
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:48
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[11:14], s[36:37] offset:32
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[11:14], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2050
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:16
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[19:22], s[36:37]
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[19:22], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[11:14], s[36:37] offset:16
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[7:10], s[36:37]
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2040
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:240
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2030
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[19:22], s[36:37] offset:240
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[11:14], s[36:37] offset:224
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[11:14], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:224
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[11:14], s[36:37] offset:208
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[11:14], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2040
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[11:14], s[36:37] offset:192
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:208
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[19:22], s[36:37] offset:192
; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[15:18], s[36:37] offset:176
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[11:14], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:160
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2010
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[11:14], s[36:37] offset:160
; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:144
; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr.ll
index 57496c2be54be..d38238f1f63a9 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr.ll
@@ -85,13 +85,13 @@ define amdgpu_kernel void @max_10_vgprs_spill_v32(ptr addrspace(1) %p) #0 {
; GFX908-DAG: v_accvgpr_read_b32
; GFX900: NumVgprs: 256
-; GFX900: ScratchSize: 148
-; GFX908: NumVgprs: 254
+; GFX900: ScratchSize: 132
+; GFX908: NumVgprs: 252
; GFX908: ScratchSize: 0
; GFX900: VGPRBlocks: 63
-; GFX908: VGPRBlocks: 63
+; GFX908: VGPRBlocks: 62
; GFX900: NumVGPRsForWavesPerEU: 256
-; GFX908: NumVGPRsForWavesPerEU: 254
+; GFX908: NumVGPRsForWavesPerEU: 252
define amdgpu_kernel void @max_256_vgprs_spill_9x32(ptr addrspace(1) %p) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%p1 = getelementptr inbounds <32 x float>, ptr addrspace(1) %p, i32 %tid
@@ -136,13 +136,13 @@ define amdgpu_kernel void @max_256_vgprs_spill_9x32(ptr addrspace(1) %p) #1 {
; GFX908-DAG: v_accvgpr_read_b32
; GFX900: NumVgprs: 256
-; GFX908: NumVgprs: 252
+; GFX908: NumVgprs: 254
; GFX900: ScratchSize: 132
; GFX908: ScratchSize: 0
; GFX900: VGPRBlocks: 63
-; GFX908: VGPRBlocks: 62
+; GFX908: VGPRBlocks: 63
; GFX900: NumVGPRsForWavesPerEU: 256
-; GFX908: NumVGPRsForWavesPerEU: 252
+; GFX908: NumVGPRsForWavesPerEU: 254
define amdgpu_kernel void @max_256_vgprs_spill_9x32_2bb(ptr addrspace(1) %p) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%p1 = getelementptr inbounds <32 x float>, ptr addrspace(1) %p, i32 %tid
@@ -182,9 +182,9 @@ st:
; objects and are processing VGPR spills
; GCN-LABEL: {{^}}stack_args_vgpr_spill:
-; GFX908: v_accvgpr_write_b32
; GFX908: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32
; GFX908: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4
+; GFX908: v_accvgpr_write_b32
define void @stack_args_vgpr_spill(<32 x float> %arg0, <32 x float> %arg1, ptr addrspace(1) %p) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%p1 = getelementptr inbounds <32 x float>, ptr addrspace(1) %p, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll
index 386a046113964..67c51286de216 100644
--- a/llvm/test/CodeGen/AMDGPU/sra.ll
+++ b/llvm/test/CodeGen/AMDGPU/sra.ll
@@ -558,28 +558,28 @@ define amdgpu_kernel void @ashr_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
define amdgpu_kernel void @ashr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: ashr_v4i64:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: s_mov_b32 s10, s2
-; SI-NEXT: s_mov_b32 s11, s3
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s8, s6
-; SI-NEXT: s_mov_b32 s9, s7
+; SI-NEXT: s_mov_b32 s8, s2
+; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
; SI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0
; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
-; SI-NEXT: s_mov_b32 s0, s4
-; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_ashr_i64 v[2:3], v[2:3], v6
; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], v4
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ashr_i64 v[9:10], v[9:10], v13
; SI-NEXT: v_ashr_i64 v[7:8], v[7:8], v11
-; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
+; SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: ashr_v4i64:
diff --git a/llvm/test/CodeGen/AMDGPU/srl.ll b/llvm/test/CodeGen/AMDGPU/srl.ll
index a9b1f7e888567..badb1f6fe9847 100644
--- a/llvm/test/CodeGen/AMDGPU/srl.ll
+++ b/llvm/test/CodeGen/AMDGPU/srl.ll
@@ -258,28 +258,28 @@ define amdgpu_kernel void @lshr_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
define amdgpu_kernel void @lshr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: lshr_v4i64:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: s_mov_b32 s10, s2
-; SI-NEXT: s_mov_b32 s11, s3
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s8, s6
-; SI-NEXT: s_mov_b32 s9, s7
+; SI-NEXT: s_mov_b32 s8, s2
+; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
; SI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0
; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
-; SI-NEXT: s_mov_b32 s0, s4
-; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], v6
; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], v4
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], v13
; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], v11
-; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
+; SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: lshr_v4i64:
diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
index 0d682a6627a1a..95fcc45fe2458 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -2319,10 +2319,9 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) {
define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) nocapture readonly %arg) {
; SI-LABEL: fdiv_test_denormals:
; SI: ; %bb.0: ; %bb
-; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: s_mov_b32 s1, s0
; SI-NEXT: buffer_load_sbyte v0, off, s[0:3], 0
; SI-NEXT: buffer_load_sbyte v1, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(1)
@@ -2345,10 +2344,9 @@ define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) nocapture readon
;
; VI-LABEL: fdiv_test_denormals:
; VI: ; %bb.0: ; %bb
-; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: s_mov_b32 s1, s0
; VI-NEXT: buffer_load_sbyte v0, off, s[0:3], 0
; VI-NEXT: buffer_load_sbyte v1, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(1)
diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
index 8e210b8896954..2d5e1bb483421 100644
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
@@ -549,32 +549,32 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt
define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2 x i64> %in) #0{
; GFX6-LABEL: s_uint_to_fp_v2i64_to_v2f16:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd
-; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX6-NEXT: s_mov_b32 s3, 0xf000
-; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd
+; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_flbit_i32_b32 s4, s11
-; GFX6-NEXT: s_flbit_i32_b32 s5, s9
-; GFX6-NEXT: s_min_u32 s6, s4, 32
-; GFX6-NEXT: s_min_u32 s12, s5, 32
-; GFX6-NEXT: s_lshl_b64 s[4:5], s[10:11], s6
-; GFX6-NEXT: s_sub_i32 s10, 32, s6
-; GFX6-NEXT: s_lshl_b64 s[6:7], s[8:9], s12
-; GFX6-NEXT: s_sub_i32 s8, 32, s12
-; GFX6-NEXT: s_min_u32 s4, s4, 1
-; GFX6-NEXT: s_min_u32 s6, s6, 1
-; GFX6-NEXT: s_or_b32 s4, s5, s4
-; GFX6-NEXT: s_or_b32 s5, s7, s6
-; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s4
-; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s5
-; GFX6-NEXT: v_ldexp_f32_e64 v0, v0, s10
-; GFX6-NEXT: v_ldexp_f32_e64 v1, v1, s8
+; GFX6-NEXT: s_flbit_i32_b32 s8, s3
+; GFX6-NEXT: s_flbit_i32_b32 s9, s1
+; GFX6-NEXT: s_min_u32 s8, s8, 32
+; GFX6-NEXT: s_min_u32 s9, s9, 32
+; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s8
+; GFX6-NEXT: s_sub_i32 s8, 32, s8
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s9
+; GFX6-NEXT: s_sub_i32 s9, 32, s9
+; GFX6-NEXT: s_min_u32 s2, s2, 1
+; GFX6-NEXT: s_min_u32 s0, s0, 1
+; GFX6-NEXT: s_or_b32 s2, s3, s2
+; GFX6-NEXT: s_or_b32 s0, s1, s0
+; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2
+; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s0
+; GFX6-NEXT: v_ldexp_f32_e64 v0, v0, s8
+; GFX6-NEXT: v_ldexp_f32_e64 v1, v1, s9
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: s_uint_to_fp_v2i64_to_v2f16:
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
index 3a49c9b23f59e..64caf95b48029 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
@@ -574,16 +574,8 @@ define protected amdgpu_kernel void @nested_waterfalls(ptr addrspace(1) %tex.coe
; SI-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 3, killed [[COPY1]](s32), implicit $exec
; SI-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR killed %11, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s64) from %ir.idx, addrspace 1)
; SI-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[GLOBAL_LOAD_DWORDX2_SADDR]], 16, 0, implicit $exec :: (invariant load (s128) from %ir.3 + 16, addrspace 4)
- ; SI-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub3
- ; SI-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub2
- ; SI-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub1
- ; SI-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX4_]].sub0
; SI-NEXT: [[GLOBAL_LOAD_DWORDX4_1:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[GLOBAL_LOAD_DWORDX2_SADDR]], 0, 0, implicit $exec :: (invariant load (s128) from %ir.3, align 32, addrspace 4)
- ; SI-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_1]].sub3
- ; SI-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_1]].sub2
- ; SI-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_1]].sub1
- ; SI-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX4_1]].sub0
- ; SI-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_256 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3, killed [[COPY5]], %subreg.sub4, killed [[COPY4]], %subreg.sub5, killed [[COPY3]], %subreg.sub6, killed [[COPY2]], %subreg.sub7
+ ; SI-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_256 = REG_SEQUENCE killed [[GLOBAL_LOAD_DWORDX4_1]].sub0, %subreg.sub0, [[GLOBAL_LOAD_DWORDX4_1]].sub1, %subreg.sub1, [[GLOBAL_LOAD_DWORDX4_1]].sub2, %subreg.sub2, [[GLOBAL_LOAD_DWORDX4_1]].sub3, %subreg.sub3, killed [[GLOBAL_LOAD_DWORDX4_]].sub0, %subreg.sub4, [[GLOBAL_LOAD_DWORDX4_]].sub1, %subreg.sub5, [[GLOBAL_LOAD_DWORDX4_]].sub2, %subreg.sub6, [[GLOBAL_LOAD_DWORDX4_]].sub3, %subreg.sub7
; SI-NEXT: [[GLOBAL_LOAD_DWORDX4_2:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 killed [[GLOBAL_LOAD_DWORDX2_SADDR]], 48, 0, implicit $exec :: (invariant load (s128) from %ir.add.ptr.i, addrspace 4)
; SI-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
; SI-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index b5e4bcd049c42..0d19f4fca8880 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -206,19 +206,19 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
; GFX906-NEXT: v_mov_b32_e32 v9, 0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx4 v[1:4], v10, s[0:1] offset:16
-; GFX906-NEXT: global_load_dwordx4 v[5:8], v10, s[0:1]
+; GFX906-NEXT: global_load_dwordx4 v[5:8], v10, s[0:1] offset:16
+; GFX906-NEXT: global_load_dwordx4 v[1:4], v10, s[0:1]
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB5_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx4 v[1:4], v10, s[2:3] offset:16
-; GFX906-NEXT: global_load_dwordx4 v[5:8], v10, s[2:3]
+; GFX906-NEXT: global_load_dwordx4 v[5:8], v10, s[2:3] offset:16
+; GFX906-NEXT: global_load_dwordx4 v[1:4], v10, s[2:3]
; GFX906-NEXT: .LBB5_2: ; %bb.2
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: global_store_dwordx4 v9, v[1:4], s[6:7] offset:16
+; GFX906-NEXT: global_store_dwordx4 v9, v[5:8], s[6:7] offset:16
; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: global_store_dwordx4 v9, v[5:8], s[6:7]
+; GFX906-NEXT: global_store_dwordx4 v9, v[1:4], s[6:7]
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -246,7 +246,10 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX906-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx4 v[5:8], v61, s[0:1] offset:240
+; GFX906-NEXT: global_load_dwordx4 v[13:16], v61, s[0:1] offset:240
+; GFX906-NEXT: global_load_dwordx4 v[9:12], v61, s[0:1] offset:224
+; GFX906-NEXT: global_load_dwordx4 v[5:8], v61, s[0:1] offset:208
+; GFX906-NEXT: global_load_dwordx4 v[17:20], v61, s[0:1] offset:192
; GFX906-NEXT: s_mov_b32 s14, -1
; GFX906-NEXT: s_mov_b32 s15, 0xe00000
; GFX906-NEXT: s_add_u32 s12, s12, s11
@@ -254,84 +257,81 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: v_mov_b32_e32 v4, 0
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: buffer_store_dword v5, off, s[12:15], 0 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v17, off, s[12:15], 0 ; 4-byte Folded Spill
; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GFX906-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
-; GFX906-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
-; GFX906-NEXT: global_load_dwordx4 v[5:8], v61, s[0:1] offset:224
+; GFX906-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[29:32], v61, s[0:1] offset:176
+; GFX906-NEXT: global_load_dwordx4 v[25:28], v61, s[0:1] offset:160
+; GFX906-NEXT: global_load_dwordx4 v[21:24], v61, s[0:1] offset:144
; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: global_load_dwordx4 v[9:12], v61, s[0:1] offset:208
-; GFX906-NEXT: global_load_dwordx4 v[13:16], v61, s[0:1] offset:192
-; GFX906-NEXT: global_load_dwordx4 v[17:20], v61, s[0:1] offset:176
-; GFX906-NEXT: global_load_dwordx4 v[21:24], v61, s[0:1] offset:160
-; GFX906-NEXT: global_load_dwordx4 v[25:28], v61, s[0:1] offset:144
-; GFX906-NEXT: global_load_dwordx4 v[29:32], v61, s[0:1] offset:128
-; GFX906-NEXT: global_load_dwordx4 v[33:36], v61, s[0:1] offset:112
-; GFX906-NEXT: global_load_dwordx4 v[37:40], v61, s[0:1] offset:96
-; GFX906-NEXT: global_load_dwordx4 v[41:44], v61, s[0:1] offset:80
-; GFX906-NEXT: global_load_dwordx4 v[45:48], v61, s[0:1] offset:64
-; GFX906-NEXT: global_load_dwordx4 v[49:52], v61, s[0:1] offset:48
-; GFX906-NEXT: global_load_dwordx4 v[53:56], v61, s[0:1] offset:32
-; GFX906-NEXT: global_load_dwordx4 v[57:60], v61, s[0:1] offset:16
-; GFX906-NEXT: global_load_dwordx4 v[0:3], v61, s[0:1]
+; GFX906-NEXT: global_load_dwordx4 v[17:20], v61, s[0:1] offset:128
+; GFX906-NEXT: global_load_dwordx4 v[0:3], v61, s[0:1] offset:112
+; GFX906-NEXT: global_load_dwordx4 v[57:60], v61, s[0:1] offset:96
+; GFX906-NEXT: global_load_dwordx4 v[53:56], v61, s[0:1] offset:80
+; GFX906-NEXT: global_load_dwordx4 v[49:52], v61, s[0:1] offset:64
+; GFX906-NEXT: global_load_dwordx4 v[45:48], v61, s[0:1] offset:48
+; GFX906-NEXT: global_load_dwordx4 v[41:44], v61, s[0:1] offset:32
+; GFX906-NEXT: global_load_dwordx4 v[37:40], v61, s[0:1] offset:16
+; GFX906-NEXT: global_load_dwordx4 v[33:36], v61, s[0:1]
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB6_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx4 v[0:3], v61, s[2:3] offset:240
+; GFX906-NEXT: global_load_dwordx4 v[13:16], v61, s[2:3] offset:240
+; GFX906-NEXT: global_load_dwordx4 v[9:12], v61, s[2:3] offset:224
+; GFX906-NEXT: global_load_dwordx4 v[5:8], v61, s[2:3] offset:208
+; GFX906-NEXT: global_load_dwordx4 v[0:3], v61, s[2:3] offset:192
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
; GFX906-NEXT: s_nop 0
; GFX906-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
-; GFX906-NEXT: global_load_dwordx4 v[5:8], v61, s[2:3] offset:224
-; GFX906-NEXT: global_load_dwordx4 v[9:12], v61, s[2:3] offset:208
-; GFX906-NEXT: global_load_dwordx4 v[13:16], v61, s[2:3] offset:192
-; GFX906-NEXT: global_load_dwordx4 v[17:20], v61, s[2:3] offset:176
-; GFX906-NEXT: global_load_dwordx4 v[21:24], v61, s[2:3] offset:160
-; GFX906-NEXT: global_load_dwordx4 v[25:28], v61, s[2:3] offset:144
-; GFX906-NEXT: global_load_dwordx4 v[29:32], v61, s[2:3] offset:128
-; GFX906-NEXT: global_load_dwordx4 v[33:36], v61, s[2:3] offset:112
-; GFX906-NEXT: global_load_dwordx4 v[37:40], v61, s[2:3] offset:96
-; GFX906-NEXT: global_load_dwordx4 v[41:44], v61, s[2:3] offset:80
-; GFX906-NEXT: global_load_dwordx4 v[45:48], v61, s[2:3] offset:64
-; GFX906-NEXT: global_load_dwordx4 v[49:52], v61, s[2:3] offset:48
-; GFX906-NEXT: global_load_dwordx4 v[53:56], v61, s[2:3] offset:32
-; GFX906-NEXT: global_load_dwordx4 v[57:60], v61, s[2:3] offset:16
-; GFX906-NEXT: global_load_dwordx4 v[0:3], v61, s[2:3]
+; GFX906-NEXT: global_load_dwordx4 v[29:32], v61, s[2:3] offset:176
+; GFX906-NEXT: global_load_dwordx4 v[25:28], v61, s[2:3] offset:160
+; GFX906-NEXT: global_load_dwordx4 v[21:24], v61, s[2:3] offset:144
+; GFX906-NEXT: global_load_dwordx4 v[17:20], v61, s[2:3] offset:128
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: global_load_dwordx4 v[0:3], v61, s[2:3] offset:112
+; GFX906-NEXT: global_load_dwordx4 v[57:60], v61, s[2:3] offset:96
+; GFX906-NEXT: global_load_dwordx4 v[53:56], v61, s[2:3] offset:80
+; GFX906-NEXT: global_load_dwordx4 v[49:52], v61, s[2:3] offset:64
+; GFX906-NEXT: global_load_dwordx4 v[45:48], v61, s[2:3] offset:48
+; GFX906-NEXT: global_load_dwordx4 v[41:44], v61, s[2:3] offset:32
+; GFX906-NEXT: global_load_dwordx4 v[37:40], v61, s[2:3] offset:16
+; GFX906-NEXT: global_load_dwordx4 v[33:36], v61, s[2:3]
; GFX906-NEXT: .LBB6_2: ; %bb.2
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX906-NEXT: s_waitcnt vmcnt(7)
-; GFX906-NEXT: global_store_dwordx4 v4, v[33:36], s[6:7] offset:112
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:112
; GFX906-NEXT: s_waitcnt vmcnt(7)
-; GFX906-NEXT: global_store_dwordx4 v4, v[37:40], s[6:7] offset:96
+; GFX906-NEXT: global_store_dwordx4 v4, v[57:60], s[6:7] offset:96
; GFX906-NEXT: s_waitcnt vmcnt(7)
-; GFX906-NEXT: global_store_dwordx4 v4, v[41:44], s[6:7] offset:80
+; GFX906-NEXT: global_store_dwordx4 v4, v[53:56], s[6:7] offset:80
; GFX906-NEXT: s_waitcnt vmcnt(7)
-; GFX906-NEXT: global_store_dwordx4 v4, v[45:48], s[6:7] offset:64
+; GFX906-NEXT: global_store_dwordx4 v4, v[49:52], s[6:7] offset:64
; GFX906-NEXT: s_waitcnt vmcnt(7)
-; GFX906-NEXT: global_store_dwordx4 v4, v[49:52], s[6:7] offset:48
+; GFX906-NEXT: global_store_dwordx4 v4, v[45:48], s[6:7] offset:48
; GFX906-NEXT: s_waitcnt vmcnt(7)
-; GFX906-NEXT: global_store_dwordx4 v4, v[53:56], s[6:7] offset:32
+; GFX906-NEXT: global_store_dwordx4 v4, v[41:44], s[6:7] offset:32
; GFX906-NEXT: s_waitcnt vmcnt(7)
-; GFX906-NEXT: global_store_dwordx4 v4, v[57:60], s[6:7] offset:16
+; GFX906-NEXT: global_store_dwordx4 v4, v[37:40], s[6:7] offset:16
; GFX906-NEXT: s_waitcnt vmcnt(7)
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX906-NEXT: global_store_dwordx4 v4, v[33:36], s[6:7]
+; GFX906-NEXT: global_store_dwordx4 v4, v[13:16], s[6:7] offset:240
+; GFX906-NEXT: global_store_dwordx4 v4, v[9:12], s[6:7] offset:224
+; GFX906-NEXT: global_store_dwordx4 v4, v[5:8], s[6:7] offset:208
; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
-; GFX906-NEXT: s_nop 0
; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:240
-; GFX906-NEXT: global_store_dwordx4 v4, v[5:8], s[6:7] offset:224
-; GFX906-NEXT: global_store_dwordx4 v4, v[9:12], s[6:7] offset:208
-; GFX906-NEXT: global_store_dwordx4 v4, v[13:16], s[6:7] offset:192
-; GFX906-NEXT: global_store_dwordx4 v4, v[17:20], s[6:7] offset:176
-; GFX906-NEXT: global_store_dwordx4 v4, v[21:24], s[6:7] offset:160
-; GFX906-NEXT: global_store_dwordx4 v4, v[25:28], s[6:7] offset:144
-; GFX906-NEXT: global_store_dwordx4 v4, v[29:32], s[6:7] offset:128
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:192
+; GFX906-NEXT: global_store_dwordx4 v4, v[29:32], s[6:7] offset:176
+; GFX906-NEXT: global_store_dwordx4 v4, v[25:28], s[6:7] offset:160
+; GFX906-NEXT: global_store_dwordx4 v4, v[21:24], s[6:7] offset:144
+; GFX906-NEXT: global_store_dwordx4 v4, v[17:20], s[6:7] offset:128
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
index 82fae44e20818..4290590e99711 100644
--- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
@@ -8,14 +8,13 @@ define amdgpu_kernel void @widen_i16_constant_load(ptr addrspace(4) %arg) {
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_load_dword s1, s[0:1], 0x0
-; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: s_load_dword s2, s[0:1], 0x0
+; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_addk_i32 s1, 0x3e7
-; SI-NEXT: s_or_b32 s4, s1, 4
-; SI-NEXT: s_mov_b32 s1, s0
+; SI-NEXT: s_addk_i32 s2, 0x3e7
+; SI-NEXT: s_or_b32 s4, s2, 4
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
@@ -59,15 +58,14 @@ define amdgpu_kernel void @widen_i16_constant_load_zext_i32(ptr addrspace(4) %ar
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_load_dword s1, s[0:1], 0x0
-; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: s_load_dword s2, s[0:1], 0x0
+; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_and_b32 s1, s1, 0xffff
-; SI-NEXT: s_addk_i32 s1, 0x3e7
-; SI-NEXT: s_or_b32 s4, s1, 4
-; SI-NEXT: s_mov_b32 s1, s0
+; SI-NEXT: s_and_b32 s2, s2, 0xffff
+; SI-NEXT: s_addk_i32 s2, 0x3e7
+; SI-NEXT: s_or_b32 s4, s2, 4
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
@@ -115,15 +113,14 @@ define amdgpu_kernel void @widen_i16_constant_load_sext_i32(ptr addrspace(4) %ar
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_load_dword s1, s[0:1], 0x0
-; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: s_load_dword s2, s[0:1], 0x0
+; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_sext_i32_i16 s1, s1
-; SI-NEXT: s_addk_i32 s1, 0x3e7
-; SI-NEXT: s_or_b32 s4, s1, 4
-; SI-NEXT: s_mov_b32 s1, s0
+; SI-NEXT: s_sext_i32_i16 s2, s2
+; SI-NEXT: s_addk_i32 s2, 0x3e7
+; SI-NEXT: s_or_b32 s4, s2, 4
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
@@ -169,26 +166,22 @@ define amdgpu_kernel void @widen_i16_constant_load_sext_i32(ptr addrspace(4) %ar
define amdgpu_kernel void @widen_i17_constant_load(ptr addrspace(4) %arg) {
; SI-LABEL: widen_i17_constant_load:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: s_mov_b32 s1, s0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_load_dword s7, s[4:5], 0x0
-; SI-NEXT: s_mov_b32 s4, 2
-; SI-NEXT: s_mov_b32 s5, s0
-; SI-NEXT: s_mov_b32 s6, s2
+; SI-NEXT: s_load_dword s4, s[0:1], 0x0
+; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_add_i32 s7, s7, 34
-; SI-NEXT: s_or_b32 s7, s7, 4
-; SI-NEXT: v_mov_b32_e32 v0, s7
-; SI-NEXT: s_bfe_u32 s8, s7, 0x10010
+; SI-NEXT: s_add_i32 s4, s4, 34
+; SI-NEXT: s_or_b32 s4, s4, 4
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: s_bfe_u32 s4, s4, 0x10010
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
-; SI-NEXT: s_mov_b32 s7, s3
+; SI-NEXT: s_mov_b64 s[0:1], 2
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s8
-; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: widen_i17_constant_load:
@@ -246,8 +239,7 @@ define amdgpu_kernel void @widen_f16_constant_load(ptr addrspace(4) %arg) {
; SI-NEXT: s_load_dword s0, s[0:1], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, s0
-; SI-NEXT: s_mov_b32 s0, 0
-; SI-NEXT: s_mov_b32 s1, s0
+; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: v_add_f32_e32 v0, 4.0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -288,19 +280,18 @@ define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) {
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_load_dword s1, s[0:1], 0x0
-; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: s_load_dword s2, s[0:1], 0x0
+; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_and_b32 s4, s1, 0xff00
-; SI-NEXT: s_add_i32 s1, s1, 12
-; SI-NEXT: s_or_b32 s1, s1, 4
-; SI-NEXT: s_and_b32 s1, s1, 0xff
-; SI-NEXT: s_or_b32 s1, s4, s1
-; SI-NEXT: s_addk_i32 s1, 0x2c00
-; SI-NEXT: s_or_b32 s4, s1, 0x300
-; SI-NEXT: s_mov_b32 s1, s0
+; SI-NEXT: s_and_b32 s4, s2, 0xff00
+; SI-NEXT: s_add_i32 s2, s2, 12
+; SI-NEXT: s_or_b32 s2, s2, 4
+; SI-NEXT: s_and_b32 s2, s2, 0xff
+; SI-NEXT: s_or_b32 s2, s4, s2
+; SI-NEXT: s_addk_i32 s2, 0x2c00
+; SI-NEXT: s_or_b32 s4, s2, 0x300
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
@@ -361,14 +352,12 @@ define amdgpu_kernel void @no_widen_i16_constant_divergent_load(ptr addrspace(4)
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s4, s2
-; SI-NEXT: s_mov_b32 s5, s2
-; SI-NEXT: s_mov_b32 s7, s3
+; SI-NEXT: s_mov_b64 s[0:1], 0
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x3e7, v0
; SI-NEXT: v_or_b32_e32 v0, 4, v0
-; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: no_widen_i16_constant_divergent_load:
@@ -419,13 +408,12 @@ define amdgpu_kernel void @widen_i1_constant_load(ptr addrspace(4) %arg) {
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_load_dword s1, s[0:1], 0x0
-; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: s_load_dword s2, s[0:1], 0x0
+; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_and_b32 s4, s1, 1
-; SI-NEXT: s_mov_b32 s1, s0
+; SI-NEXT: s_and_b32 s4, s2, 1
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
@@ -466,15 +454,14 @@ define amdgpu_kernel void @widen_i16_zextload_i64_constant_load(ptr addrspace(4)
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_load_dword s1, s[0:1], 0x0
-; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: s_load_dword s2, s[0:1], 0x0
+; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_and_b32 s1, s1, 0xffff
-; SI-NEXT: s_addk_i32 s1, 0x3e7
-; SI-NEXT: s_or_b32 s4, s1, 4
-; SI-NEXT: s_mov_b32 s1, s0
+; SI-NEXT: s_and_b32 s2, s2, 0xffff
+; SI-NEXT: s_addk_i32 s2, 0x3e7
+; SI-NEXT: s_or_b32 s4, s2, 4
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
@@ -522,16 +509,15 @@ define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(ptr addrspace(4) %
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_load_dword s1, s[0:1], 0x0
-; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: s_load_dword s2, s[0:1], 0x0
+; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_and_b32 s1, s1, 1
-; SI-NEXT: s_add_u32 s4, s1, 0x3e7
+; SI-NEXT: s_and_b32 s2, s2, 1
+; SI-NEXT: s_add_u32 s4, s2, 0x3e7
; SI-NEXT: s_addc_u32 s5, 0, 0
; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: s_mov_b32 s1, s0
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
@@ -579,13 +565,13 @@ define amdgpu_kernel void @widen_i16_constant32_load(ptr addrspace(6) %arg) {
; SI-NEXT: s_load_dword s0, s[4:5], 0x9
; SI-NEXT: s_mov_b32 s1, 0
; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_load_dword s0, s[0:1], 0x0
+; SI-NEXT: s_load_dword s2, s[0:1], 0x0
+; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_addk_i32 s0, 0x3e7
-; SI-NEXT: s_or_b32 s4, s0, 4
-; SI-NEXT: s_mov_b32 s0, s1
+; SI-NEXT: s_addk_i32 s2, 0x3e7
+; SI-NEXT: s_or_b32 s4, s2, 4
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
@@ -631,14 +617,13 @@ define amdgpu_kernel void @widen_i16_global_invariant_load(ptr addrspace(1) %arg
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_load_dword s1, s[0:1], 0x0
-; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: s_load_dword s2, s[0:1], 0x0
+; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_addk_i32 s1, 0x3e7
-; SI-NEXT: s_or_b32 s4, s1, 1
-; SI-NEXT: s_mov_b32 s1, s0
+; SI-NEXT: s_addk_i32 s2, 0x3e7
+; SI-NEXT: s_or_b32 s4, s2, 1
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll
index 2f09c98891d03..16dc5a81e782b 100644
--- a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll
@@ -43,21 +43,21 @@ define arm_aapcs_vfpcc <4 x float> @add_mul(<4 x float> %a, <4 x float> %b, <4 x
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vsub.f32 q3, q1, q2
; CHECK-NEXT: vsub.f32 q0, q1, q0
-; CHECK-NEXT: vmov.f32 s16, s9
+; CHECK-NEXT: vmov.f32 s4, s9
; CHECK-NEXT: vmov.f32 s13, s14
-; CHECK-NEXT: vmov.f32 s17, s11
+; CHECK-NEXT: vmov.f32 s5, s11
; CHECK-NEXT: vmov.f32 s0, s1
-; CHECK-NEXT: vmul.f32 q1, q3, q4
+; CHECK-NEXT: vmul.f32 q4, q3, q1
; CHECK-NEXT: vmov.f32 s1, s3
; CHECK-NEXT: vmov.f32 s9, s10
-; CHECK-NEXT: vfma.f32 q1, q2, q0
-; CHECK-NEXT: vmul.f32 q0, q4, q0
-; CHECK-NEXT: vneg.f32 q4, q0
-; CHECK-NEXT: vmov.f32 s1, s4
-; CHECK-NEXT: vfma.f32 q4, q2, q3
-; CHECK-NEXT: vmov.f32 s3, s5
-; CHECK-NEXT: vmov.f32 s0, s16
-; CHECK-NEXT: vmov.f32 s2, s17
+; CHECK-NEXT: vfma.f32 q4, q2, q0
+; CHECK-NEXT: vmul.f32 q0, q1, q0
+; CHECK-NEXT: vneg.f32 q1, q0
+; CHECK-NEXT: vmov.f32 s1, s16
+; CHECK-NEXT: vfma.f32 q1, q2, q3
+; CHECK-NEXT: vmov.f32 s3, s17
+; CHECK-NEXT: vmov.f32 s0, s4
+; CHECK-NEXT: vmov.f32 s2, s5
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
entry:
@@ -81,41 +81,35 @@ entry:
define arm_aapcs_vfpcc <4 x float> @mul_mul270_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; CHECK-LABEL: mul_mul270_mul:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d12}
-; CHECK-NEXT: vpush {d12}
-; CHECK-NEXT: .vsave {d10}
-; CHECK-NEXT: vpush {d10}
-; CHECK-NEXT: .vsave {d8}
-; CHECK-NEXT: vpush {d8}
-; CHECK-NEXT: vmov.f32 s20, s4
+; CHECK-NEXT: .vsave {d8, d9, d10}
+; CHECK-NEXT: vpush {d8, d9, d10}
; CHECK-NEXT: vmov.f32 s16, s8
+; CHECK-NEXT: vmov.f32 s20, s4
; CHECK-NEXT: vmov.f32 s17, s10
; CHECK-NEXT: vmov.f32 s21, s6
; CHECK-NEXT: vmul.f32 q3, q5, q4
; CHECK-NEXT: vmov.f32 s4, s5
-; CHECK-NEXT: vneg.f32 q3, q3
-; CHECK-NEXT: vmov.f32 s24, s9
-; CHECK-NEXT: vmov.f32 s25, s11
; CHECK-NEXT: vmov.f32 s5, s7
-; CHECK-NEXT: vmul.f32 q2, q1, q4
-; CHECK-NEXT: vmov.f32 s16, s0
-; CHECK-NEXT: vfma.f32 q3, q1, q6
-; CHECK-NEXT: vmov.f32 s17, s2
+; CHECK-NEXT: vneg.f32 q3, q3
+; CHECK-NEXT: vmov.f32 s8, s9
+; CHECK-NEXT: vmul.f32 q4, q1, q4
+; CHECK-NEXT: vmov.f32 s9, s11
+; CHECK-NEXT: vfma.f32 q3, q1, q2
+; CHECK-NEXT: vfma.f32 q4, q5, q2
+; CHECK-NEXT: vmov.f32 s8, s0
+; CHECK-NEXT: vmov.f32 s9, s2
; CHECK-NEXT: vmov.f32 s0, s1
-; CHECK-NEXT: vfma.f32 q2, q5, q6
-; CHECK-NEXT: vmul.f32 q1, q3, q4
+; CHECK-NEXT: vmul.f32 q1, q3, q2
; CHECK-NEXT: vmov.f32 s1, s3
-; CHECK-NEXT: vfma.f32 q1, q2, q0
+; CHECK-NEXT: vfma.f32 q1, q4, q0
; CHECK-NEXT: vmul.f32 q0, q3, q0
; CHECK-NEXT: vneg.f32 q3, q0
; CHECK-NEXT: vmov.f32 s1, s4
-; CHECK-NEXT: vfma.f32 q3, q2, q4
+; CHECK-NEXT: vfma.f32 q3, q4, q2
; CHECK-NEXT: vmov.f32 s3, s5
; CHECK-NEXT: vmov.f32 s0, s12
; CHECK-NEXT: vmov.f32 s2, s13
-; CHECK-NEXT: vpop {d8}
-; CHECK-NEXT: vpop {d10}
-; CHECK-NEXT: vpop {d12}
+; CHECK-NEXT: vpop {d8, d9, d10}
; CHECK-NEXT: bx lr
entry:
%strided.vec = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll
index e86c368e0fe8a..7be08b04c5957 100644
--- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll
@@ -12,9 +12,9 @@ define arm_aapcs_vfpcc <4 x i32> @loads_i32(ptr %A, ptr %B, ptr %C) {
; CHECK-NEXT: vmov.f32 s2, s11
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vmov.f32 s10, s9
-; CHECK-NEXT: vmov r3, r4, d0
+; CHECK-NEXT: vmov r1, r3, d0
; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r5, r1, d1
+; CHECK-NEXT: vmov r4, r5, d1
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vmov lr, r12, d5
@@ -24,15 +24,15 @@ define arm_aapcs_vfpcc <4 x i32> @loads_i32(ptr %A, ptr %B, ptr %C) {
; CHECK-NEXT: vmov.f32 s12, s6
; CHECK-NEXT: vmov.f32 s6, s7
; CHECK-NEXT: asrs r2, r0, #31
-; CHECK-NEXT: adds r0, r0, r3
-; CHECK-NEXT: adc.w r3, r2, r4
+; CHECK-NEXT: adds r0, r0, r1
+; CHECK-NEXT: adc.w r1, r2, r3
; CHECK-NEXT: vmov r2, s12
-; CHECK-NEXT: asrl r0, r3, r2
-; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: asrl r0, r1, r2
+; CHECK-NEXT: vmov r1, s2
; CHECK-NEXT: vmov.f32 s2, s1
-; CHECK-NEXT: asrs r3, r2, #31
-; CHECK-NEXT: adds r2, r2, r5
-; CHECK-NEXT: adcs r1, r3
+; CHECK-NEXT: adds r2, r1, r4
+; CHECK-NEXT: asr.w r3, r1, #31
+; CHECK-NEXT: adc.w r1, r3, r5
; CHECK-NEXT: vmov r3, s6
; CHECK-NEXT: asrl r2, r1, r3
; CHECK-NEXT: vmov r4, r5, d4
@@ -136,39 +136,39 @@ define arm_aapcs_vfpcc void @load_store_i32(ptr %A, ptr %B, ptr %C, ptr %D) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT: .vsave {d8}
-; CHECK-NEXT: vpush {d8}
-; CHECK-NEXT: vldrw.u32 q2, [r1]
+; CHECK-NEXT: .vsave {d8, d9}
+; CHECK-NEXT: vpush {d8, d9}
+; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vmov.i64 q0, #0xffffffff
-; CHECK-NEXT: vmov.f32 s4, s10
-; CHECK-NEXT: vmov.f32 s6, s11
-; CHECK-NEXT: vmov.f32 s10, s9
-; CHECK-NEXT: vand q1, q1, q0
-; CHECK-NEXT: vand q2, q2, q0
+; CHECK-NEXT: vmov.f32 s8, s6
+; CHECK-NEXT: vmov.f32 s10, s7
+; CHECK-NEXT: vmov.f32 s6, s5
+; CHECK-NEXT: vand q4, q2, q0
+; CHECK-NEXT: vand q2, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: vmov r6, r4, d3
+; CHECK-NEXT: vmov r4, r5, d9
+; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vmov.f32 s12, s2
; CHECK-NEXT: vmov.f32 s2, s3
-; CHECK-NEXT: vmov lr, r12, d2
-; CHECK-NEXT: vldrw.u32 q1, [r2]
-; CHECK-NEXT: vmov r5, r1, d5
+; CHECK-NEXT: vmov lr, r12, d8
; CHECK-NEXT: vmov.f32 s16, s6
; CHECK-NEXT: vmov.f32 s6, s7
+; CHECK-NEXT: vmov r6, r1, d5
; CHECK-NEXT: vmov.f32 s10, s1
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: vmov.f32 s2, s5
-; CHECK-NEXT: adds.w r8, r0, r6
+; CHECK-NEXT: adds.w r8, r0, r4
; CHECK-NEXT: asr.w r2, r0, #31
-; CHECK-NEXT: adc.w r7, r2, r4
+; CHECK-NEXT: adcs r5, r2
; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: asrl r8, r7, r2
+; CHECK-NEXT: asrl r8, r5, r2
; CHECK-NEXT: vmov r2, s10
+; CHECK-NEXT: vmov r5, r7, d4
; CHECK-NEXT: asrs r4, r2, #31
-; CHECK-NEXT: adds r2, r2, r5
+; CHECK-NEXT: adds r2, r2, r6
; CHECK-NEXT: adcs r1, r4
; CHECK-NEXT: vmov r4, s2
; CHECK-NEXT: asrl r2, r1, r4
-; CHECK-NEXT: vmov r5, r7, d4
; CHECK-NEXT: vmov r1, s12
; CHECK-NEXT: adds.w r6, r1, lr
; CHECK-NEXT: asr.w r4, r1, #31
@@ -184,7 +184,7 @@ define arm_aapcs_vfpcc void @load_store_i32(ptr %A, ptr %B, ptr %C, ptr %D) {
; CHECK-NEXT: vmov q0[2], q0[0], r0, r6
; CHECK-NEXT: vmov q0[3], q0[1], r2, r8
; CHECK-NEXT: vstrw.32 q0, [r3]
-; CHECK-NEXT: vpop {d8}
+; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
entry:
%a = load <4 x i32>, ptr %A, align 4
diff --git a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
index 33816fec69bac..feda774ab0e65 100644
--- a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
@@ -355,30 +355,30 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle3step_i16(<32 x i16> %src) {
; CHECK-NEXT: vins.f16 s16, s2
; CHECK-NEXT: vmovx.f16 s2, s5
; CHECK-NEXT: vmov.f32 s17, s4
-; CHECK-NEXT: vmovx.f16 s13, s3
+; CHECK-NEXT: vmovx.f16 s14, s6
; CHECK-NEXT: vins.f16 s17, s2
; CHECK-NEXT: vmovx.f16 s2, s8
; CHECK-NEXT: vmov.f32 s18, s7
-; CHECK-NEXT: vmovx.f16 s14, s6
+; CHECK-NEXT: vins.f16 s14, s8
; CHECK-NEXT: vins.f16 s18, s2
; CHECK-NEXT: vmovx.f16 s2, s11
-; CHECK-NEXT: vmov.f32 s19, s10
-; CHECK-NEXT: vmovx.f16 s15, s9
-; CHECK-NEXT: vins.f16 s19, s2
+; CHECK-NEXT: vmovx.f16 s8, s10
+; CHECK-NEXT: vins.f16 s10, s2
; CHECK-NEXT: vmovx.f16 s2, s1
+; CHECK-NEXT: vmovx.f16 s13, s3
; CHECK-NEXT: vins.f16 s0, s2
; CHECK-NEXT: vmovx.f16 s2, s4
; CHECK-NEXT: vins.f16 s3, s2
; CHECK-NEXT: vmovx.f16 s2, s7
+; CHECK-NEXT: vmovx.f16 s15, s9
; CHECK-NEXT: vins.f16 s6, s2
-; CHECK-NEXT: vmovx.f16 s2, s10
-; CHECK-NEXT: vins.f16 s9, s2
+; CHECK-NEXT: vins.f16 s9, s8
; CHECK-NEXT: vmov.f32 s1, s3
-; CHECK-NEXT: vins.f16 s14, s8
; CHECK-NEXT: vins.f16 s15, s11
; CHECK-NEXT: vins.f16 s13, s5
; CHECK-NEXT: vmov.f32 s2, s6
; CHECK-NEXT: vmov.f32 s3, s9
+; CHECK-NEXT: vmov.f32 s19, s10
; CHECK-NEXT: vadd.i16 q0, q0, q3
; CHECK-NEXT: vadd.i16 q0, q0, q4
; CHECK-NEXT: vpop {d8, d9}
@@ -1467,47 +1467,27 @@ entry:
ret <2 x double> %out
}
define arm_aapcs_vfpcc <8 x double> @shuffle9_f64(<4 x double> %src1, <4 x double> %src2) {
-; CHECK-LV-LABEL: shuffle9_f64:
-; CHECK-LV: @ %bb.0: @ %entry
-; CHECK-LV-NEXT: .vsave {d8, d9, d10, d11}
-; CHECK-LV-NEXT: vpush {d8, d9, d10, d11}
-; CHECK-LV-NEXT: vmov q5, q2
-; CHECK-LV-NEXT: vmov.f32 s16, s0
-; CHECK-LV-NEXT: vmov.f32 s18, s20
-; CHECK-LV-NEXT: vmov.f32 s20, s2
-; CHECK-LV-NEXT: vmov.f32 s10, s12
-; CHECK-LV-NEXT: vmov.f32 s19, s21
-; CHECK-LV-NEXT: vmov.f32 s8, s4
-; CHECK-LV-NEXT: vmov.f32 s17, s1
-; CHECK-LV-NEXT: vmov.f32 s21, s3
-; CHECK-LV-NEXT: vmov q0, q4
-; CHECK-LV-NEXT: vmov.f32 s12, s6
-; CHECK-LV-NEXT: vmov.f32 s11, s13
-; CHECK-LV-NEXT: vmov.f32 s9, s5
-; CHECK-LV-NEXT: vmov.f32 s13, s7
-; CHECK-LV-NEXT: vmov q1, q5
-; CHECK-LV-NEXT: vpop {d8, d9, d10, d11}
-; CHECK-LV-NEXT: bx lr
-;
-; CHECK-LIS-LABEL: shuffle9_f64:
-; CHECK-LIS: @ %bb.0: @ %entry
-; CHECK-LIS-NEXT: .vsave {d8, d9, d10, d11}
-; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11}
-; CHECK-LIS-NEXT: vmov q5, q2
-; CHECK-LIS-NEXT: vmov q4, q0
-; CHECK-LIS-NEXT: vmov.f32 s2, s20
-; CHECK-LIS-NEXT: vmov.f32 s20, s18
-; CHECK-LIS-NEXT: vmov.f32 s10, s12
-; CHECK-LIS-NEXT: vmov.f32 s3, s21
-; CHECK-LIS-NEXT: vmov.f32 s8, s4
-; CHECK-LIS-NEXT: vmov.f32 s21, s19
-; CHECK-LIS-NEXT: vmov.f32 s12, s6
-; CHECK-LIS-NEXT: vmov.f32 s11, s13
-; CHECK-LIS-NEXT: vmov.f32 s9, s5
-; CHECK-LIS-NEXT: vmov.f32 s13, s7
-; CHECK-LIS-NEXT: vmov q1, q5
-; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11}
-; CHECK-LIS-NEXT: bx lr
+; CHECK-LABEL: shuffle9_f64:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .vsave {d8, d9, d10, d11}
+; CHECK-NEXT: vpush {d8, d9, d10, d11}
+; CHECK-NEXT: vmov q5, q2
+; CHECK-NEXT: vmov.f32 s16, s0
+; CHECK-NEXT: vmov.f32 s18, s20
+; CHECK-NEXT: vmov.f32 s20, s2
+; CHECK-NEXT: vmov.f32 s10, s12
+; CHECK-NEXT: vmov.f32 s19, s21
+; CHECK-NEXT: vmov.f32 s8, s4
+; CHECK-NEXT: vmov.f32 s17, s1
+; CHECK-NEXT: vmov.f32 s21, s3
+; CHECK-NEXT: vmov q0, q4
+; CHECK-NEXT: vmov.f32 s12, s6
+; CHECK-NEXT: vmov.f32 s11, s13
+; CHECK-NEXT: vmov.f32 s9, s5
+; CHECK-NEXT: vmov.f32 s13, s7
+; CHECK-NEXT: vmov q1, q5
+; CHECK-NEXT: vpop {d8, d9, d10, d11}
+; CHECK-NEXT: bx lr
entry:
%out = shufflevector <4 x double> %src1, <4 x double> %src2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
ret <8 x double> %out
@@ -1580,47 +1560,27 @@ entry:
ret <2 x i64> %out
}
define arm_aapcs_vfpcc <8 x i64> @shuffle9_i64(<4 x i64> %src1, <4 x i64> %src2) {
-; CHECK-LV-LABEL: shuffle9_i64:
-; CHECK-LV: @ %bb.0: @ %entry
-; CHECK-LV-NEXT: .vsave {d8, d9, d10, d11}
-; CHECK-LV-NEXT: vpush {d8, d9, d10, d11}
-; CHECK-LV-NEXT: vmov q5, q2
-; CHECK-LV-NEXT: vmov.f32 s16, s0
-; CHECK-LV-NEXT: vmov.f32 s18, s20
-; CHECK-LV-NEXT: vmov.f32 s20, s2
-; CHECK-LV-NEXT: vmov.f32 s10, s12
-; CHECK-LV-NEXT: vmov.f32 s19, s21
-; CHECK-LV-NEXT: vmov.f32 s8, s4
-; CHECK-LV-NEXT: vmov.f32 s17, s1
-; CHECK-LV-NEXT: vmov.f32 s21, s3
-; CHECK-LV-NEXT: vmov q0, q4
-; CHECK-LV-NEXT: vmov.f32 s12, s6
-; CHECK-LV-NEXT: vmov.f32 s11, s13
-; CHECK-LV-NEXT: vmov.f32 s9, s5
-; CHECK-LV-NEXT: vmov.f32 s13, s7
-; CHECK-LV-NEXT: vmov q1, q5
-; CHECK-LV-NEXT: vpop {d8, d9, d10, d11}
-; CHECK-LV-NEXT: bx lr
-;
-; CHECK-LIS-LABEL: shuffle9_i64:
-; CHECK-LIS: @ %bb.0: @ %entry
-; CHECK-LIS-NEXT: .vsave {d8, d9, d10, d11}
-; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11}
-; CHECK-LIS-NEXT: vmov q5, q2
-; CHECK-LIS-NEXT: vmov q4, q0
-; CHECK-LIS-NEXT: vmov.f32 s2, s20
-; CHECK-LIS-NEXT: vmov.f32 s20, s18
-; CHECK-LIS-NEXT: vmov.f32 s10, s12
-; CHECK-LIS-NEXT: vmov.f32 s3, s21
-; CHECK-LIS-NEXT: vmov.f32 s8, s4
-; CHECK-LIS-NEXT: vmov.f32 s21, s19
-; CHECK-LIS-NEXT: vmov.f32 s12, s6
-; CHECK-LIS-NEXT: vmov.f32 s11, s13
-; CHECK-LIS-NEXT: vmov.f32 s9, s5
-; CHECK-LIS-NEXT: vmov.f32 s13, s7
-; CHECK-LIS-NEXT: vmov q1, q5
-; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11}
-; CHECK-LIS-NEXT: bx lr
+; CHECK-LABEL: shuffle9_i64:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .vsave {d8, d9, d10, d11}
+; CHECK-NEXT: vpush {d8, d9, d10, d11}
+; CHECK-NEXT: vmov q5, q2
+; CHECK-NEXT: vmov.f32 s16, s0
+; CHECK-NEXT: vmov.f32 s18, s20
+; CHECK-NEXT: vmov.f32 s20, s2
+; CHECK-NEXT: vmov.f32 s10, s12
+; CHECK-NEXT: vmov.f32 s19, s21
+; CHECK-NEXT: vmov.f32 s8, s4
+; CHECK-NEXT: vmov.f32 s17, s1
+; CHECK-NEXT: vmov.f32 s21, s3
+; CHECK-NEXT: vmov q0, q4
+; CHECK-NEXT: vmov.f32 s12, s6
+; CHECK-NEXT: vmov.f32 s11, s13
+; CHECK-NEXT: vmov.f32 s9, s5
+; CHECK-NEXT: vmov.f32 s13, s7
+; CHECK-NEXT: vmov q1, q5
+; CHECK-NEXT: vpop {d8, d9, d10, d11}
+; CHECK-NEXT: bx lr
entry:
%out = shufflevector <4 x i64> %src1, <4 x i64> %src2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
ret <8 x i64> %out
@@ -1860,3 +1820,6 @@ entry:
ret double %res
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-LIS: {{.*}}
+; CHECK-LV: {{.*}}
diff --git a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
index 1279714b5a78c..042a6ea18412a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
@@ -374,48 +374,48 @@ define void @vabd_loop_s32(ptr nocapture readonly %x, ptr nocapture readonly %y,
; CHECK-NEXT: .LBB17_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
+; CHECK-NEXT: vldrw.u32 q3, [r1], #16
; CHECK-NEXT: vmov.f32 s8, s6
+; CHECK-NEXT: vmov.f32 s6, s5
; CHECK-NEXT: vmov r7, s4
-; CHECK-NEXT: vmov.f32 s6, s7
+; CHECK-NEXT: vmov.f32 s10, s7
; CHECK-NEXT: vmov r3, s8
-; CHECK-NEXT: vldrw.u32 q2, [r1], #16
-; CHECK-NEXT: vmov.f32 s12, s10
-; CHECK-NEXT: vmov.f32 s10, s5
-; CHECK-NEXT: vmov.f32 s14, s11
-; CHECK-NEXT: vmov r4, s12
+; CHECK-NEXT: vmov.f32 s8, s14
+; CHECK-NEXT: vmov.f32 s14, s15
+; CHECK-NEXT: vmov r4, s8
; CHECK-NEXT: asr.w r12, r3, #31
; CHECK-NEXT: subs.w r8, r3, r4
; CHECK-NEXT: sbc.w r12, r12, r4, asr #31
+; CHECK-NEXT: vmov r4, s6
+; CHECK-NEXT: vmov.f32 s6, s13
+; CHECK-NEXT: vmov r6, s6
+; CHECK-NEXT: asrs r5, r4, #31
+; CHECK-NEXT: subs.w r9, r4, r6
; CHECK-NEXT: vmov r4, s10
-; CHECK-NEXT: vmov.f32 s10, s9
-; CHECK-NEXT: vmov r6, s10
-; CHECK-NEXT: asrs r3, r4, #31
-; CHECK-NEXT: subs r4, r4, r6
-; CHECK-NEXT: sbc.w r9, r3, r6, asr #31
-; CHECK-NEXT: vmov r6, s8
-; CHECK-NEXT: vmov r3, s6
-; CHECK-NEXT: subs r5, r7, r6
+; CHECK-NEXT: sbc.w r5, r5, r6, asr #31
+; CHECK-NEXT: vmov r6, s12
+; CHECK-NEXT: asrs r5, r5, #31
+; CHECK-NEXT: subs r3, r7, r6
; CHECK-NEXT: asr.w r7, r7, #31
-; CHECK-NEXT: vmov q2[2], q2[0], r5, r8
-; CHECK-NEXT: vmov r5, s14
+; CHECK-NEXT: vmov q1[2], q1[0], r3, r8
+; CHECK-NEXT: vmov r3, s14
; CHECK-NEXT: sbc.w r6, r7, r6, asr #31
; CHECK-NEXT: asrs r6, r6, #31
-; CHECK-NEXT: subs r7, r3, r5
-; CHECK-NEXT: asr.w r3, r3, #31
-; CHECK-NEXT: vmov q2[3], q2[1], r4, r7
+; CHECK-NEXT: subs r7, r4, r3
+; CHECK-NEXT: vmov q1[3], q1[1], r9, r7
; CHECK-NEXT: mov.w r7, #0
-; CHECK-NEXT: sbc.w r3, r3, r5, asr #31
; CHECK-NEXT: bfi r7, r6, #0, #4
-; CHECK-NEXT: asr.w r4, r9, #31
; CHECK-NEXT: asr.w r6, r12, #31
-; CHECK-NEXT: bfi r7, r4, #4, #4
-; CHECK-NEXT: asrs r3, r3, #31
+; CHECK-NEXT: bfi r7, r5, #4, #4
; CHECK-NEXT: bfi r7, r6, #8, #4
+; CHECK-NEXT: asr.w r6, r4, #31
+; CHECK-NEXT: sbc.w r3, r6, r3, asr #31
+; CHECK-NEXT: asrs r3, r3, #31
; CHECK-NEXT: bfi r7, r3, #12, #4
; CHECK-NEXT: vmsr p0, r7
; CHECK-NEXT: vpst
-; CHECK-NEXT: vsubt.i32 q2, q0, q2
-; CHECK-NEXT: vstrb.8 q2, [r2], #16
+; CHECK-NEXT: vsubt.i32 q1, q0, q1
+; CHECK-NEXT: vstrb.8 q1, [r2], #16
; CHECK-NEXT: le lr, .LBB17_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
diff --git a/llvm/test/CodeGen/Thumb2/mve-vld2.ll b/llvm/test/CodeGen/Thumb2/mve-vld2.ll
index 36a035d56bcfa..dca4fb3d6cfa3 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld2.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld2.ll
@@ -346,23 +346,23 @@ define void @vld2_v4i64(ptr %src, ptr %dst) {
; CHECK-NEXT: vpush {d8}
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT: vldrw.u32 q3, [r0, #48]
+; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
; CHECK-NEXT: vmov.f32 s4, s2
; CHECK-NEXT: vmov.f32 s5, s3
-; CHECK-NEXT: vmov lr, r12, d5
; CHECK-NEXT: vmov.f32 s2, s8
; CHECK-NEXT: vmov.f32 s3, s9
-; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
-; CHECK-NEXT: vmov.f32 s16, s10
-; CHECK-NEXT: vmov.f32 s17, s11
-; CHECK-NEXT: vmov r5, r6, d4
+; CHECK-NEXT: vmov.f32 s16, s14
+; CHECK-NEXT: vmov.f32 s17, s15
+; CHECK-NEXT: vmov lr, r12, d5
+; CHECK-NEXT: vldrw.u32 q2, [r0, #48]
+; CHECK-NEXT: vmov r5, r6, d6
; CHECK-NEXT: vmov r2, r3, d1
-; CHECK-NEXT: vmov.f32 s2, s12
-; CHECK-NEXT: vmov.f32 s3, s13
+; CHECK-NEXT: vmov.f32 s2, s8
; CHECK-NEXT: vmov r0, r7, d8
+; CHECK-NEXT: vmov.f32 s3, s9
; CHECK-NEXT: adds.w lr, lr, r2
; CHECK-NEXT: adc.w r12, r12, r3
-; CHECK-NEXT: vmov r3, r4, d7
+; CHECK-NEXT: vmov r3, r4, d5
; CHECK-NEXT: adds r0, r0, r5
; CHECK-NEXT: adc.w r8, r6, r7
; CHECK-NEXT: vmov r6, r5, d1
diff --git a/llvm/test/CodeGen/Thumb2/mve-vld3.ll b/llvm/test/CodeGen/Thumb2/mve-vld3.ll
index b207ce7bdefd1..0c58abaa1c86e 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld3.ll
@@ -388,34 +388,34 @@ define void @vld3_v16i16(ptr %src, ptr %dst) {
; CHECK-LV-NEXT: vmov.f32 s22, s11
; CHECK-LV-NEXT: vins.f16 s14, s4
; CHECK-LV-NEXT: vmovx.f16 s4, s4
+; CHECK-LV-NEXT: vmovx.f16 s15, s5
; CHECK-LV-NEXT: vins.f16 s22, s4
; CHECK-LV-NEXT: vmovx.f16 s4, s7
-; CHECK-LV-NEXT: vmov.f32 s23, s6
-; CHECK-LV-NEXT: vmovx.f16 s12, s16
-; CHECK-LV-NEXT: vins.f16 s23, s4
+; CHECK-LV-NEXT: vins.f16 s15, s7
+; CHECK-LV-NEXT: vmovx.f16 s7, s6
+; CHECK-LV-NEXT: vins.f16 s6, s4
; CHECK-LV-NEXT: vmovx.f16 s4, s17
+; CHECK-LV-NEXT: vmovx.f16 s12, s16
; CHECK-LV-NEXT: vins.f16 s16, s4
; CHECK-LV-NEXT: vmovx.f16 s4, s8
; CHECK-LV-NEXT: vmovx.f16 s13, s19
; CHECK-LV-NEXT: vins.f16 s19, s4
; CHECK-LV-NEXT: vmovx.f16 s4, s11
-; CHECK-LV-NEXT: vmovx.f16 s15, s5
-; CHECK-LV-NEXT: vins.f16 s10, s4
-; CHECK-LV-NEXT: vmovx.f16 s4, s6
; CHECK-LV-NEXT: vins.f16 s12, s18
; CHECK-LV-NEXT: vmov.f32 s20, s17
; CHECK-LV-NEXT: vmovx.f16 s18, s18
-; CHECK-LV-NEXT: vins.f16 s5, s4
+; CHECK-LV-NEXT: vins.f16 s10, s4
+; CHECK-LV-NEXT: vins.f16 s5, s7
; CHECK-LV-NEXT: vins.f16 s13, s9
; CHECK-LV-NEXT: vins.f16 s20, s18
; CHECK-LV-NEXT: vmov.f32 s17, s19
-; CHECK-LV-NEXT: vins.f16 s15, s7
; CHECK-LV-NEXT: vmovx.f16 s9, s9
; CHECK-LV-NEXT: vmov.f32 s21, s8
+; CHECK-LV-NEXT: vstrw.32 q0, [r1, #16]
; CHECK-LV-NEXT: vmov.f32 s18, s10
; CHECK-LV-NEXT: vins.f16 s21, s9
; CHECK-LV-NEXT: vmov.f32 s19, s5
-; CHECK-LV-NEXT: vstrw.32 q0, [r1, #16]
+; CHECK-LV-NEXT: vmov.f32 s23, s6
; CHECK-LV-NEXT: vadd.i16 q1, q4, q3
; CHECK-LV-NEXT: vadd.i16 q1, q1, q5
; CHECK-LV-NEXT: vstrw.32 q1, [r1]
@@ -469,34 +469,34 @@ define void @vld3_v16i16(ptr %src, ptr %dst) {
; CHECK-LIS-NEXT: vmov.f32 s22, s15
; CHECK-LIS-NEXT: vins.f16 s10, s4
; CHECK-LIS-NEXT: vmovx.f16 s4, s4
+; CHECK-LIS-NEXT: vmovx.f16 s11, s5
; CHECK-LIS-NEXT: vins.f16 s22, s4
; CHECK-LIS-NEXT: vmovx.f16 s4, s7
-; CHECK-LIS-NEXT: vmov.f32 s23, s6
-; CHECK-LIS-NEXT: vmovx.f16 s8, s16
-; CHECK-LIS-NEXT: vins.f16 s23, s4
+; CHECK-LIS-NEXT: vins.f16 s11, s7
+; CHECK-LIS-NEXT: vmovx.f16 s7, s6
+; CHECK-LIS-NEXT: vins.f16 s6, s4
; CHECK-LIS-NEXT: vmovx.f16 s4, s17
+; CHECK-LIS-NEXT: vmovx.f16 s8, s16
; CHECK-LIS-NEXT: vins.f16 s16, s4
; CHECK-LIS-NEXT: vmovx.f16 s4, s12
; CHECK-LIS-NEXT: vmovx.f16 s9, s19
; CHECK-LIS-NEXT: vins.f16 s19, s4
; CHECK-LIS-NEXT: vmovx.f16 s4, s15
-; CHECK-LIS-NEXT: vmovx.f16 s11, s5
-; CHECK-LIS-NEXT: vins.f16 s14, s4
-; CHECK-LIS-NEXT: vmovx.f16 s4, s6
; CHECK-LIS-NEXT: vins.f16 s8, s18
; CHECK-LIS-NEXT: vmov.f32 s20, s17
; CHECK-LIS-NEXT: vmovx.f16 s18, s18
-; CHECK-LIS-NEXT: vins.f16 s5, s4
+; CHECK-LIS-NEXT: vins.f16 s14, s4
+; CHECK-LIS-NEXT: vins.f16 s5, s7
; CHECK-LIS-NEXT: vins.f16 s9, s13
; CHECK-LIS-NEXT: vins.f16 s20, s18
; CHECK-LIS-NEXT: vmov.f32 s17, s19
-; CHECK-LIS-NEXT: vins.f16 s11, s7
; CHECK-LIS-NEXT: vmovx.f16 s13, s13
; CHECK-LIS-NEXT: vmov.f32 s21, s12
+; CHECK-LIS-NEXT: vstrw.32 q0, [r1, #16]
; CHECK-LIS-NEXT: vmov.f32 s18, s14
; CHECK-LIS-NEXT: vins.f16 s21, s13
; CHECK-LIS-NEXT: vmov.f32 s19, s5
-; CHECK-LIS-NEXT: vstrw.32 q0, [r1, #16]
+; CHECK-LIS-NEXT: vmov.f32 s23, s6
; CHECK-LIS-NEXT: vadd.i16 q1, q4, q2
; CHECK-LIS-NEXT: vadd.i16 q1, q1, q5
; CHECK-LIS-NEXT: vstrw.32 q1, [r1]
@@ -842,49 +842,51 @@ define void @vld3_v4i64(ptr %src, ptr %dst) {
; CHECK-LV-NEXT: vldrw.u32 q0, [r0]
; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #32]
; CHECK-LV-NEXT: vldrw.u32 q2, [r0, #16]
-; CHECK-LV-NEXT: vldrw.u32 q5, [r0, #48]
+; CHECK-LV-NEXT: vldrw.u32 q4, [r0, #48]
; CHECK-LV-NEXT: vmov.f32 s4, s2
-; CHECK-LV-NEXT: vldrw.u32 q4, [r0, #64]
+; CHECK-LV-NEXT: vldrw.u32 q5, [r0, #64]
; CHECK-LV-NEXT: vmov.f32 s5, s3
; CHECK-LV-NEXT: vmov.f32 s2, s12
; CHECK-LV-NEXT: vmov.f32 s3, s13
-; CHECK-LV-NEXT: vmov r5, r4, d5
-; CHECK-LV-NEXT: vmov r3, r8, d7
+; CHECK-LV-NEXT: vmov r2, r3, d5
+; CHECK-LV-NEXT: vmov r4, r8, d7
; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #80]
-; CHECK-LV-NEXT: vmov.f32 s24, s22
-; CHECK-LV-NEXT: vmov.f32 s25, s23
+; CHECK-LV-NEXT: vmov.f32 s24, s18
+; CHECK-LV-NEXT: vmov.f32 s25, s19
+; CHECK-LV-NEXT: vmov.f32 s6, s22
+; CHECK-LV-NEXT: vmov.f32 s7, s23
; CHECK-LV-NEXT: vmov lr, r12, d1
; CHECK-LV-NEXT: vmov.f32 s2, s12
; CHECK-LV-NEXT: vmov.f32 s3, s13
; CHECK-LV-NEXT: vmov r6, r7, d12
-; CHECK-LV-NEXT: adds.w r0, r5, lr
-; CHECK-LV-NEXT: adc.w r5, r4, r12
-; CHECK-LV-NEXT: adds.w lr, r0, r3
-; CHECK-LV-NEXT: vmov r4, r2, d10
-; CHECK-LV-NEXT: adc.w r12, r5, r8
-; CHECK-LV-NEXT: vmov r5, r0, d8
-; CHECK-LV-NEXT: adds r6, r6, r4
-; CHECK-LV-NEXT: adcs r2, r7
-; CHECK-LV-NEXT: adds r6, r6, r5
-; CHECK-LV-NEXT: adc.w r8, r2, r0
-; CHECK-LV-NEXT: vmov r7, r4, d1
-; CHECK-LV-NEXT: vmov r2, r5, d9
-; CHECK-LV-NEXT: vmov r3, r0, d0
-; CHECK-LV-NEXT: adds r2, r2, r7
-; CHECK-LV-NEXT: adc.w r7, r5, r4
-; CHECK-LV-NEXT: vmov r5, r4, d7
-; CHECK-LV-NEXT: adds r2, r2, r5
-; CHECK-LV-NEXT: adcs r7, r4
-; CHECK-LV-NEXT: vmov r5, r4, d2
-; CHECK-LV-NEXT: vmov q1[2], q1[0], r6, r2
+; CHECK-LV-NEXT: adds.w r0, r2, lr
+; CHECK-LV-NEXT: adc.w r2, r3, r12
+; CHECK-LV-NEXT: adds.w lr, r0, r4
+; CHECK-LV-NEXT: vmov r3, r5, d8
+; CHECK-LV-NEXT: adc.w r12, r2, r8
+; CHECK-LV-NEXT: vmov r2, r0, d10
+; CHECK-LV-NEXT: adds r3, r3, r6
+; CHECK-LV-NEXT: adcs r7, r5
+; CHECK-LV-NEXT: adds r2, r2, r3
+; CHECK-LV-NEXT: adc.w r8, r7, r0
+; CHECK-LV-NEXT: vmov r6, r5, d1
+; CHECK-LV-NEXT: vmov r3, r7, d3
+; CHECK-LV-NEXT: vmov r4, r0, d0
+; CHECK-LV-NEXT: adds r3, r3, r6
+; CHECK-LV-NEXT: adcs r7, r5
+; CHECK-LV-NEXT: vmov r6, r5, d7
+; CHECK-LV-NEXT: adds r3, r3, r6
+; CHECK-LV-NEXT: adcs r7, r5
+; CHECK-LV-NEXT: vmov r6, r5, d2
+; CHECK-LV-NEXT: vmov q1[2], q1[0], r2, r3
; CHECK-LV-NEXT: vmov q1[3], q1[1], r8, r7
; CHECK-LV-NEXT: vstrw.32 q1, [r1, #16]
-; CHECK-LV-NEXT: adds r3, r3, r5
-; CHECK-LV-NEXT: adcs r0, r4
-; CHECK-LV-NEXT: vmov r4, r5, d4
-; CHECK-LV-NEXT: adds r3, r3, r4
-; CHECK-LV-NEXT: vmov q0[2], q0[0], r3, lr
+; CHECK-LV-NEXT: adds r4, r4, r6
; CHECK-LV-NEXT: adcs r0, r5
+; CHECK-LV-NEXT: vmov r5, r6, d4
+; CHECK-LV-NEXT: adds r4, r4, r5
+; CHECK-LV-NEXT: vmov q0[2], q0[0], r4, lr
+; CHECK-LV-NEXT: adcs r0, r6
; CHECK-LV-NEXT: vmov q0[3], q0[1], r0, r12
; CHECK-LV-NEXT: vstrw.32 q0, [r1]
; CHECK-LV-NEXT: vpop {d8, d9, d10, d11, d12}
@@ -899,42 +901,44 @@ define void @vld3_v4i64(ptr %src, ptr %dst) {
; CHECK-LIS-NEXT: vldrw.u32 q0, [r0]
; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #32]
; CHECK-LIS-NEXT: vldrw.u32 q2, [r0, #16]
-; CHECK-LIS-NEXT: vldrw.u32 q5, [r0, #48]
+; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #48]
; CHECK-LIS-NEXT: vmov.f32 s4, s2
-; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #64]
+; CHECK-LIS-NEXT: vldrw.u32 q5, [r0, #64]
; CHECK-LIS-NEXT: vmov.f32 s5, s3
; CHECK-LIS-NEXT: vmov.f32 s2, s12
; CHECK-LIS-NEXT: vmov.f32 s3, s13
; CHECK-LIS-NEXT: vmov r5, r4, d5
; CHECK-LIS-NEXT: vmov r3, r8, d7
; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #80]
-; CHECK-LIS-NEXT: vmov.f32 s24, s22
-; CHECK-LIS-NEXT: vmov.f32 s25, s23
+; CHECK-LIS-NEXT: vmov.f32 s24, s18
+; CHECK-LIS-NEXT: vmov.f32 s25, s19
+; CHECK-LIS-NEXT: vmov.f32 s6, s22
+; CHECK-LIS-NEXT: vmov.f32 s7, s23
; CHECK-LIS-NEXT: vmov lr, r12, d1
; CHECK-LIS-NEXT: vmov.f32 s2, s12
; CHECK-LIS-NEXT: vmov.f32 s3, s13
-; CHECK-LIS-NEXT: vmov r7, r6, d12
+; CHECK-LIS-NEXT: vmov r6, r7, d12
; CHECK-LIS-NEXT: adds.w r0, r5, lr
; CHECK-LIS-NEXT: adc.w r5, r4, r12
; CHECK-LIS-NEXT: adds.w lr, r0, r3
-; CHECK-LIS-NEXT: vmov r4, r2, d10
+; CHECK-LIS-NEXT: vmov r4, r2, d8
; CHECK-LIS-NEXT: adc.w r12, r5, r8
-; CHECK-LIS-NEXT: vmov r5, r0, d8
-; CHECK-LIS-NEXT: adds r7, r7, r4
-; CHECK-LIS-NEXT: adcs r2, r6
-; CHECK-LIS-NEXT: adds r7, r7, r5
+; CHECK-LIS-NEXT: vmov r5, r0, d10
+; CHECK-LIS-NEXT: adds r6, r6, r4
+; CHECK-LIS-NEXT: adcs r2, r7
+; CHECK-LIS-NEXT: adds r6, r6, r5
; CHECK-LIS-NEXT: adc.w r8, r2, r0
-; CHECK-LIS-NEXT: vmov r6, r4, d1
-; CHECK-LIS-NEXT: vmov r2, r5, d9
+; CHECK-LIS-NEXT: vmov r7, r4, d1
+; CHECK-LIS-NEXT: vmov r2, r5, d3
; CHECK-LIS-NEXT: vmov r3, r0, d0
-; CHECK-LIS-NEXT: adds r2, r2, r6
-; CHECK-LIS-NEXT: adc.w r6, r5, r4
+; CHECK-LIS-NEXT: adds r2, r2, r7
+; CHECK-LIS-NEXT: adc.w r7, r5, r4
; CHECK-LIS-NEXT: vmov r5, r4, d7
; CHECK-LIS-NEXT: adds r2, r2, r5
-; CHECK-LIS-NEXT: adcs r6, r4
+; CHECK-LIS-NEXT: adcs r7, r4
; CHECK-LIS-NEXT: vmov r5, r4, d2
-; CHECK-LIS-NEXT: vmov q1[2], q1[0], r7, r2
-; CHECK-LIS-NEXT: vmov q1[3], q1[1], r8, r6
+; CHECK-LIS-NEXT: vmov q1[2], q1[0], r6, r2
+; CHECK-LIS-NEXT: vmov q1[3], q1[1], r8, r7
; CHECK-LIS-NEXT: vstrw.32 q1, [r1, #16]
; CHECK-LIS-NEXT: adds r3, r3, r5
; CHECK-LIS-NEXT: adcs r0, r4
diff --git a/llvm/test/CodeGen/Thumb2/mve-vld4.ll b/llvm/test/CodeGen/Thumb2/mve-vld4.ll
index bf16c5b7d10ff..ab41069bfa258 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld4.ll
@@ -661,9 +661,9 @@ define void @vld4_v4i64(ptr %src, ptr %dst) {
; CHECK-NEXT: vmov.f32 s3, s21
; CHECK-NEXT: vmov r3, r2, d11
; CHECK-NEXT: vldrw.u32 q5, [r0, #96]
+; CHECK-NEXT: vmov lr, r12, d9
; CHECK-NEXT: vmov.f32 s0, s26
; CHECK-NEXT: vmov.f32 s1, s27
-; CHECK-NEXT: vmov lr, r12, d9
; CHECK-NEXT: vmov.f32 s12, s6
; CHECK-NEXT: vmov.f32 s13, s7
; CHECK-NEXT: vmov r4, r5, d1
@@ -674,50 +674,50 @@ define void @vld4_v4i64(ptr %src, ptr %dst) {
; CHECK-NEXT: vmov.f32 s7, s29
; CHECK-NEXT: vmov.f32 s10, s20
; CHECK-NEXT: vmov.f32 s11, s21
-; CHECK-NEXT: vmov r0, r6, d1
-; CHECK-NEXT: adds r7, r4, r3
-; CHECK-NEXT: vmov r4, r8, d0
-; CHECK-NEXT: adcs r5, r2
-; CHECK-NEXT: vmov r2, r3, d12
+; CHECK-NEXT: vmov r6, r7, d1
+; CHECK-NEXT: adds r3, r3, r4
+; CHECK-NEXT: adc.w r4, r5, r2
+; CHECK-NEXT: vmov r5, r8, d0
; CHECK-NEXT: vmov.f32 s0, s18
; CHECK-NEXT: vmov.f32 s1, s19
-; CHECK-NEXT: adds.w r0, r0, lr
-; CHECK-NEXT: adc.w r6, r6, r12
-; CHECK-NEXT: adds.w lr, r0, r7
-; CHECK-NEXT: adc.w r12, r6, r5
-; CHECK-NEXT: vmov r6, r5, d0
+; CHECK-NEXT: adds.w r2, r6, lr
+; CHECK-NEXT: vmov r6, r0, d12
+; CHECK-NEXT: adc.w r7, r7, r12
+; CHECK-NEXT: adds.w lr, r2, r3
+; CHECK-NEXT: adc.w r12, r7, r4
+; CHECK-NEXT: vmov r7, r4, d0
; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
-; CHECK-NEXT: adds r2, r2, r4
-; CHECK-NEXT: vmov r4, r0, d8
-; CHECK-NEXT: adc.w r3, r3, r8
-; CHECK-NEXT: adds r6, r6, r4
-; CHECK-NEXT: adcs r0, r5
-; CHECK-NEXT: adds.w r9, r6, r2
-; CHECK-NEXT: adc.w r8, r0, r3
+; CHECK-NEXT: adds r6, r6, r5
+; CHECK-NEXT: vmov r5, r3, d8
+; CHECK-NEXT: adc.w r0, r0, r8
+; CHECK-NEXT: adds r7, r7, r5
+; CHECK-NEXT: adcs r3, r4
+; CHECK-NEXT: adds.w r9, r7, r6
+; CHECK-NEXT: adc.w r8, r3, r0
; CHECK-NEXT: vmov r5, r4, d15
; CHECK-NEXT: vmov r3, r6, d3
-; CHECK-NEXT: vmov r7, r0, d5
+; CHECK-NEXT: vmov r2, r0, d5
; CHECK-NEXT: adds r3, r3, r5
; CHECK-NEXT: adcs r6, r4
; CHECK-NEXT: vmov r5, r4, d11
-; CHECK-NEXT: adds r5, r5, r7
+; CHECK-NEXT: adds r2, r2, r5
; CHECK-NEXT: adcs r0, r4
-; CHECK-NEXT: adds r3, r3, r5
+; CHECK-NEXT: adds r2, r2, r3
; CHECK-NEXT: adc.w r10, r0, r6
-; CHECK-NEXT: vmov r4, r5, d4
-; CHECK-NEXT: vmov r6, r7, d0
-; CHECK-NEXT: vmov r2, r0, d2
-; CHECK-NEXT: vmov q1[2], q1[0], r9, r3
+; CHECK-NEXT: vmov r3, r4, d4
+; CHECK-NEXT: vmov r5, r6, d0
+; CHECK-NEXT: vmov r7, r0, d2
+; CHECK-NEXT: vmov q1[2], q1[0], r9, r2
; CHECK-NEXT: vmov q1[3], q1[1], r8, r10
; CHECK-NEXT: vstrw.32 q1, [r1, #16]
-; CHECK-NEXT: adds r4, r4, r6
-; CHECK-NEXT: adcs r5, r7
-; CHECK-NEXT: vmov r6, r7, d6
-; CHECK-NEXT: adds r2, r2, r6
-; CHECK-NEXT: adcs r0, r7
-; CHECK-NEXT: adds r2, r2, r4
-; CHECK-NEXT: vmov q0[2], q0[0], r2, lr
-; CHECK-NEXT: adcs r0, r5
+; CHECK-NEXT: adds r3, r3, r5
+; CHECK-NEXT: adcs r4, r6
+; CHECK-NEXT: vmov r5, r6, d6
+; CHECK-NEXT: adds r5, r5, r7
+; CHECK-NEXT: adcs r0, r6
+; CHECK-NEXT: adds r3, r3, r5
+; CHECK-NEXT: vmov q0[2], q0[0], r3, lr
+; CHECK-NEXT: adcs r0, r4
; CHECK-NEXT: vmov q0[3], q0[1], r0, r12
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: add sp, #16
diff --git a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll
index c82bc582fa87c..87c37953082c6 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll
@@ -20,97 +20,97 @@ define void @vldst4(ptr nocapture readonly %pIn, ptr nocapture %pOut, i32 %numRo
; CHECK-NEXT: add.w lr, r3, r12, lsr #3
; CHECK-NEXT: .LBB0_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrh.u16 q3, [r0, #32]
-; CHECK-NEXT: vldrh.u16 q0, [r0, #48]
+; CHECK-NEXT: vldrh.u16 q0, [r0, #32]
+; CHECK-NEXT: vldrh.u16 q3, [r0, #48]
; CHECK-NEXT: vldrh.u16 q4, [r0], #64
-; CHECK-NEXT: vmovx.f16 s8, s15
-; CHECK-NEXT: vmovx.f16 s10, s13
-; CHECK-NEXT: vldrh.u16 q5, [r0, #-48]
-; CHECK-NEXT: vins.f16 s10, s8
-; CHECK-NEXT: vmovx.f16 s8, s3
-; CHECK-NEXT: vmovx.f16 s11, s1
-; CHECK-NEXT: vmov.f32 s7, s1
-; CHECK-NEXT: vins.f16 s11, s8
-; CHECK-NEXT: vmovx.f16 s1, s19
-; CHECK-NEXT: vmovx.f16 s8, s17
-; CHECK-NEXT: vmov.f32 s6, s13
-; CHECK-NEXT: vmovx.f16 s9, s21
-; CHECK-NEXT: vmov.f32 s4, s17
-; CHECK-NEXT: vins.f16 s21, s23
-; CHECK-NEXT: vins.f16 s8, s1
-; CHECK-NEXT: vmovx.f16 s1, s23
-; CHECK-NEXT: vins.f16 s9, s1
-; CHECK-NEXT: vins.f16 s6, s15
-; CHECK-NEXT: vins.f16 s7, s3
-; CHECK-NEXT: vins.f16 s4, s19
-; CHECK-NEXT: vmov.f32 s5, s21
-; CHECK-NEXT: vmovx.f16 s27, s0
-; CHECK-NEXT: vmul.f16 q1, q1, r2
+; CHECK-NEXT: vmovx.f16 s26, s0
; CHECK-NEXT: vins.f16 s0, s2
; CHECK-NEXT: vmovx.f16 s2, s2
-; CHECK-NEXT: vmul.f16 q2, q2, r2
-; CHECK-NEXT: vmovx.f16 s3, s4
-; CHECK-NEXT: vins.f16 s4, s8
-; CHECK-NEXT: vmovx.f16 s8, s8
+; CHECK-NEXT: vldrh.u16 q5, [r0, #-48]
+; CHECK-NEXT: vmovx.f16 s27, s12
+; CHECK-NEXT: vins.f16 s26, s2
+; CHECK-NEXT: vmovx.f16 s2, s14
+; CHECK-NEXT: vmovx.f16 s8, s3
+; CHECK-NEXT: vmovx.f16 s10, s1
; CHECK-NEXT: vmovx.f16 s24, s16
+; CHECK-NEXT: vins.f16 s10, s8
; CHECK-NEXT: vins.f16 s27, s2
; CHECK-NEXT: vmovx.f16 s2, s18
-; CHECK-NEXT: vins.f16 s3, s8
-; CHECK-NEXT: vmovx.f16 s26, s12
+; CHECK-NEXT: vmovx.f16 s8, s15
+; CHECK-NEXT: vmovx.f16 s11, s13
+; CHECK-NEXT: vmov.f32 s6, s1
+; CHECK-NEXT: vmov.f32 s4, s17
+; CHECK-NEXT: vins.f16 s11, s8
; CHECK-NEXT: vmovx.f16 s25, s20
-; CHECK-NEXT: vins.f16 s20, s22
-; CHECK-NEXT: vmovx.f16 s8, s14
; CHECK-NEXT: vins.f16 s24, s2
; CHECK-NEXT: vmovx.f16 s2, s22
+; CHECK-NEXT: vmovx.f16 s1, s19
+; CHECK-NEXT: vmovx.f16 s8, s17
+; CHECK-NEXT: vins.f16 s20, s22
; CHECK-NEXT: vins.f16 s12, s14
-; CHECK-NEXT: vins.f16 s16, s18
-; CHECK-NEXT: vins.f16 s26, s8
+; CHECK-NEXT: vmov.f32 s7, s13
; CHECK-NEXT: vins.f16 s25, s2
-; CHECK-NEXT: vmov.f32 s18, s12
+; CHECK-NEXT: vins.f16 s4, s19
+; CHECK-NEXT: vmovx.f16 s9, s21
+; CHECK-NEXT: vins.f16 s21, s23
+; CHECK-NEXT: vins.f16 s8, s1
+; CHECK-NEXT: vmovx.f16 s1, s23
+; CHECK-NEXT: vins.f16 s16, s18
+; CHECK-NEXT: vmov.f32 s18, s0
; CHECK-NEXT: vmov.f32 s17, s20
+; CHECK-NEXT: vins.f16 s9, s1
+; CHECK-NEXT: vmov.f32 s19, s12
+; CHECK-NEXT: vins.f16 s7, s15
+; CHECK-NEXT: vmul.f16 q3, q4, r2
; CHECK-NEXT: vmul.f16 q6, q6, r2
-; CHECK-NEXT: vmov.f32 s19, s0
+; CHECK-NEXT: vins.f16 s6, s3
+; CHECK-NEXT: vmov.f32 s5, s21
+; CHECK-NEXT: vmul.f16 q1, q1, r2
+; CHECK-NEXT: vmovx.f16 s2, s12
; CHECK-NEXT: vmovx.f16 s0, s24
-; CHECK-NEXT: vmul.f16 q4, q4, r2
-; CHECK-NEXT: vmovx.f16 s15, s5
-; CHECK-NEXT: vmovx.f16 s2, s16
-; CHECK-NEXT: vmovx.f16 s14, s17
+; CHECK-NEXT: vmul.f16 q2, q2, r2
; CHECK-NEXT: vins.f16 s2, s0
+; CHECK-NEXT: vmovx.f16 s19, s5
; CHECK-NEXT: vmovx.f16 s0, s9
-; CHECK-NEXT: vins.f16 s15, s0
+; CHECK-NEXT: vmovx.f16 s18, s13
+; CHECK-NEXT: vins.f16 s19, s0
; CHECK-NEXT: vmovx.f16 s0, s25
-; CHECK-NEXT: vins.f16 s14, s0
+; CHECK-NEXT: vins.f16 s18, s0
; CHECK-NEXT: vmovx.f16 s23, s6
; CHECK-NEXT: vmovx.f16 s0, s10
-; CHECK-NEXT: vmovx.f16 s22, s18
+; CHECK-NEXT: vmovx.f16 s22, s14
; CHECK-NEXT: vins.f16 s23, s0
; CHECK-NEXT: vmovx.f16 s0, s26
; CHECK-NEXT: vins.f16 s22, s0
; CHECK-NEXT: vmovx.f16 s31, s7
; CHECK-NEXT: vmovx.f16 s0, s11
+; CHECK-NEXT: vmovx.f16 s3, s4
+; CHECK-NEXT: vins.f16 s4, s8
+; CHECK-NEXT: vmovx.f16 s8, s8
; CHECK-NEXT: vins.f16 s6, s10
-; CHECK-NEXT: vins.f16 s18, s26
+; CHECK-NEXT: vins.f16 s14, s26
; CHECK-NEXT: vins.f16 s7, s11
; CHECK-NEXT: vins.f16 s31, s0
-; CHECK-NEXT: vmovx.f16 s30, s19
-; CHECK-NEXT: vins.f16 s19, s27
+; CHECK-NEXT: vmovx.f16 s30, s15
+; CHECK-NEXT: vins.f16 s15, s27
; CHECK-NEXT: vmovx.f16 s0, s27
-; CHECK-NEXT: vins.f16 s16, s24
+; CHECK-NEXT: vins.f16 s12, s24
; CHECK-NEXT: vins.f16 s5, s9
-; CHECK-NEXT: vins.f16 s17, s25
+; CHECK-NEXT: vins.f16 s13, s25
; CHECK-NEXT: vins.f16 s30, s0
+; CHECK-NEXT: vins.f16 s3, s8
; CHECK-NEXT: vmov.f32 s1, s4
-; CHECK-NEXT: vmov.f32 s0, s16
+; CHECK-NEXT: vmov.f32 s0, s12
; CHECK-NEXT: vmov.f32 s21, s6
-; CHECK-NEXT: vmov.f32 s20, s18
+; CHECK-NEXT: vmov.f32 s20, s14
; CHECK-NEXT: vmov.f32 s29, s7
; CHECK-NEXT: vstrh.16 q5, [r1, #32]
-; CHECK-NEXT: vmov.f32 s28, s19
+; CHECK-NEXT: vmov.f32 s28, s15
; CHECK-NEXT: vstrh.16 q7, [r1, #48]
; CHECK-NEXT: vstrh.16 q0, [r1], #64
-; CHECK-NEXT: vmov.f32 s12, s17
-; CHECK-NEXT: vmov.f32 s13, s5
-; CHECK-NEXT: vstrh.16 q3, [r1, #-48]
+; CHECK-NEXT: vmov.f32 s16, s13
+; CHECK-NEXT: vmov.f32 s17, s5
+; CHECK-NEXT: vstrh.16 q4, [r1, #-48]
; CHECK-NEXT: le lr, .LBB0_2
; CHECK-NEXT: .LBB0_3: @ %while.end
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst2.ll b/llvm/test/CodeGen/Thumb2/mve-vst2.ll
index f5a129fa2dc94..fa7b0e4d6bcc8 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst2.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst2.ll
@@ -334,19 +334,19 @@ define void @vst2_v4i64(ptr %src, ptr %dst) {
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
-; CHECK-NEXT: vldrw.u32 q2, [r0]
-; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vldrw.u32 q2, [r0, #48]
; CHECK-NEXT: vldrw.u32 q3, [r0, #16]
-; CHECK-NEXT: vmov.f64 d8, d4
+; CHECK-NEXT: vmov.f64 d8, d2
; CHECK-NEXT: vmov.f64 d9, d0
-; CHECK-NEXT: vmov.f64 d0, d5
+; CHECK-NEXT: vmov.f64 d0, d3
; CHECK-NEXT: vstrw.32 q4, [r1]
-; CHECK-NEXT: vmov.f64 d5, d2
+; CHECK-NEXT: vmov.f64 d3, d4
; CHECK-NEXT: vstrw.32 q0, [r1, #16]
-; CHECK-NEXT: vmov.f64 d4, d6
-; CHECK-NEXT: vmov.f64 d2, d7
-; CHECK-NEXT: vstrw.32 q2, [r1, #32]
-; CHECK-NEXT: vstrw.32 q1, [r1, #48]
+; CHECK-NEXT: vmov.f64 d2, d6
+; CHECK-NEXT: vmov.f64 d4, d7
+; CHECK-NEXT: vstrw.32 q1, [r1, #32]
+; CHECK-NEXT: vstrw.32 q2, [r1, #48]
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
entry:
@@ -640,19 +640,19 @@ define void @vst2_v4f64(ptr %src, ptr %dst) {
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
-; CHECK-NEXT: vldrw.u32 q2, [r0]
-; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vldrw.u32 q2, [r0, #48]
; CHECK-NEXT: vldrw.u32 q3, [r0, #16]
-; CHECK-NEXT: vmov.f64 d8, d4
+; CHECK-NEXT: vmov.f64 d8, d2
; CHECK-NEXT: vmov.f64 d9, d0
-; CHECK-NEXT: vmov.f64 d0, d5
+; CHECK-NEXT: vmov.f64 d0, d3
; CHECK-NEXT: vstrw.32 q4, [r1]
-; CHECK-NEXT: vmov.f64 d5, d2
+; CHECK-NEXT: vmov.f64 d3, d4
; CHECK-NEXT: vstrw.32 q0, [r1, #16]
-; CHECK-NEXT: vmov.f64 d4, d6
-; CHECK-NEXT: vmov.f64 d2, d7
-; CHECK-NEXT: vstrw.32 q2, [r1, #32]
-; CHECK-NEXT: vstrw.32 q1, [r1, #48]
+; CHECK-NEXT: vmov.f64 d2, d6
+; CHECK-NEXT: vmov.f64 d4, d7
+; CHECK-NEXT: vstrw.32 q1, [r1, #32]
+; CHECK-NEXT: vstrw.32 q2, [r1, #48]
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
entry:
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
index 903069b7e776d..ff416dbe3f1a0 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
@@ -137,105 +137,109 @@ define void @vst3_v16i32(ptr %src, ptr %dst) {
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #160
-; CHECK-NEXT: sub sp, #160
-; CHECK-NEXT: vldrw.u32 q3, [r0, #160]
+; CHECK-NEXT: .pad #176
+; CHECK-NEXT: sub sp, #176
+; CHECK-NEXT: vldrw.u32 q1, [r0, #160]
+; CHECK-NEXT: vldrw.u32 q3, [r0, #96]
; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
; CHECK-NEXT: vldrw.u32 q5, [r0, #128]
-; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: vstrw.32 q3, [sp, #144] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q3, [r0, #144]
-; CHECK-NEXT: vmov r12, r3, d10
-; CHECK-NEXT: vldrw.u32 q7, [r0, #176]
-; CHECK-NEXT: vstrw.32 q3, [sp, #80] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q3, [r0, #96]
-; CHECK-NEXT: vldrw.u32 q6, [r0, #32]
-; CHECK-NEXT: vmov.f32 s8, s1
+; CHECK-NEXT: vstrw.32 q1, [sp, #160] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q1, [r0, #144]
; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q3, [r0, #80]
-; CHECK-NEXT: vmov.f32 s10, s6
-; CHECK-NEXT: vldrw.u32 q4, [r0, #112]
-; CHECK-NEXT: vmov.f32 s11, s2
-; CHECK-NEXT: vstrw.32 q3, [sp, #64] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q2, [r0]
+; CHECK-NEXT: vldrw.u32 q7, [r0, #176]
+; CHECK-NEXT: vstrw.32 q1, [sp, #80] @ 16-byte Spill
+; CHECK-NEXT: vstrw.32 q3, [sp, #96] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q4, [r0, #32]
+; CHECK-NEXT: vldrw.u32 q1, [r0, #112]
; CHECK-NEXT: vldrw.u32 q3, [r0, #48]
-; CHECK-NEXT: vmov.32 q2[1], r3
-; CHECK-NEXT: vstrw.32 q6, [sp] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q6, [r0, #16]
-; CHECK-NEXT: vstrw.32 q2, [r1, #16]
+; CHECK-NEXT: vmov r12, r3, d10
; CHECK-NEXT: vmov.f32 s20, s22
+; CHECK-NEXT: vstrw.32 q4, [sp] @ 16-byte Spill
; CHECK-NEXT: vmov.f32 s22, s3
+; CHECK-NEXT: vldrw.u32 q4, [r0, #16]
+; CHECK-NEXT: vmov.f32 s21, s11
; CHECK-NEXT: vstrw.32 q7, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT: vmov.f32 s9, s0
+; CHECK-NEXT: vstrw.32 q5, [r1, #32]
+; CHECK-NEXT: vmov.f32 s24, s1
+; CHECK-NEXT: vmov.f32 s27, s2
+; CHECK-NEXT: vstrw.32 q4, [sp, #64] @ 16-byte Spill
+; CHECK-NEXT: vmov.f32 s21, s0
; CHECK-NEXT: vmov.f32 s0, s30
-; CHECK-NEXT: vmov.f32 s1, s15
-; CHECK-NEXT: vmov.f32 s2, s19
; CHECK-NEXT: vmov.f32 s3, s31
-; CHECK-NEXT: vstrw.32 q0, [sp, #112] @ 16-byte Spill
-; CHECK-NEXT: vmov.f32 s8, s4
-; CHECK-NEXT: vmov.f32 s11, s5
-; CHECK-NEXT: vmov.f32 s0, s17
-; CHECK-NEXT: vstrw.32 q2, [sp, #128] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s1, s15
+; CHECK-NEXT: vmov.f32 s2, s7
+; CHECK-NEXT: vstrw.32 q0, [sp, #128] @ 16-byte Spill
+; CHECK-NEXT: vmov.f32 s0, s5
; CHECK-NEXT: vmov.f32 s2, s14
+; CHECK-NEXT: vmov.f32 s3, s6
+; CHECK-NEXT: vmov.f32 s26, s10
+; CHECK-NEXT: vstrw.32 q0, [sp, #112] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q0, [sp, #160] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s20, s8
+; CHECK-NEXT: vmov.f32 s23, s9
; CHECK-NEXT: vldrw.u32 q2, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s3, s18
-; CHECK-NEXT: vmov.f32 s21, s7
-; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q0, [sp, #144] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q5, [r1, #32]
-; CHECK-NEXT: vmov.f32 s21, s7
-; CHECK-NEXT: vmov.f32 s20, s2
-; CHECK-NEXT: vmov.f32 s23, s3
-; CHECK-NEXT: vldrw.u32 q0, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s22, s11
-; CHECK-NEXT: vstrw.32 q5, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT: vmov.f32 s21, s16
-; CHECK-NEXT: vmov.f32 s23, s13
-; CHECK-NEXT: vmov.f32 s16, s9
-; CHECK-NEXT: vmov.f32 s19, s10
-; CHECK-NEXT: vmov.f32 s13, s8
-; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s18, s6
-; CHECK-NEXT: vmov.f64 d14, d4
-; CHECK-NEXT: vmov.f32 s15, s5
-; CHECK-NEXT: vmov.f32 s5, s27
-; CHECK-NEXT: vmov.f32 s8, s24
-; CHECK-NEXT: vmov.f32 s6, s3
-; CHECK-NEXT: vmov.f32 s9, s0
-; CHECK-NEXT: vmov.f32 s24, s1
-; CHECK-NEXT: vmov.f32 s27, s2
-; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s16, s2
+; CHECK-NEXT: vmov.32 q6[1], r3
+; CHECK-NEXT: vmov.f32 s19, s3
+; CHECK-NEXT: vstrw.32 q6, [r1, #16]
+; CHECK-NEXT: vmov.f32 s17, s31
+; CHECK-NEXT: vstrw.32 q5, [sp, #144] @ 16-byte Spill
+; CHECK-NEXT: vmov.f32 s18, s11
+; CHECK-NEXT: vldrw.u32 q0, [sp, #80] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q4, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT: vmov.f64 d8, d14
+; CHECK-NEXT: vmov.f32 s25, s4
+; CHECK-NEXT: vldrw.u32 q1, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s20, s9
+; CHECK-NEXT: vmov.f32 s23, s10
+; CHECK-NEXT: vmov.f32 s17, s8
+; CHECK-NEXT: vldrw.u32 q2, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s27, s13
+; CHECK-NEXT: vmov.f32 s13, s11
+; CHECK-NEXT: vmov.f32 s11, s6
+; CHECK-NEXT: vmov.f32 s19, s29
+; CHECK-NEXT: vmov.f64 d14, d0
+; CHECK-NEXT: vmov.f32 s15, s3
+; CHECK-NEXT: vmov.f32 s0, s8
+; CHECK-NEXT: vmov.f32 s1, s4
+; CHECK-NEXT: vmov.f32 s3, s9
; CHECK-NEXT: vmov r0, r3, d14
+; CHECK-NEXT: vmov.f32 s24, s12
+; CHECK-NEXT: vmov.f32 s12, s2
+; CHECK-NEXT: vmov.32 q0[2], r0
+; CHECK-NEXT: vstrw.32 q0, [r1, #48]
+; CHECK-NEXT: vmov.f64 d1, d5
+; CHECK-NEXT: vmov.f32 s0, s5
+; CHECK-NEXT: vmov.32 q0[1], r3
+; CHECK-NEXT: vmov.f32 s18, s30
+; CHECK-NEXT: vstrw.32 q0, [r1, #64]
+; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s22, s30
; CHECK-NEXT: vldrw.u32 q7, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s7, s11
; CHECK-NEXT: vstrw.32 q0, [r1, #128]
-; CHECK-NEXT: vmov.f32 s11, s25
-; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s20, s12
-; CHECK-NEXT: vmov.32 q6[1], r3
-; CHECK-NEXT: vmov.f32 s12, s4
-; CHECK-NEXT: vstrw.32 q6, [r1, #64]
-; CHECK-NEXT: vmov.f32 s4, s10
-; CHECK-NEXT: vmov.32 q2[2], r0
+; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload
; CHECK-NEXT: vmov r0, lr, d14
-; CHECK-NEXT: vldrw.u32 q7, [sp, #144] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q7, [sp, #160] @ 16-byte Reload
; CHECK-NEXT: vmov.32 q0[1], lr
-; CHECK-NEXT: vmov.32 q5[2], r0
+; CHECK-NEXT: vmov.f32 s14, s7
; CHECK-NEXT: vstrw.32 q0, [r1, #160]
-; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload
; CHECK-NEXT: vmov r2, r4, d14
-; CHECK-NEXT: vstrw.32 q2, [r1, #48]
+; CHECK-NEXT: vmov.32 q6[2], r0
; CHECK-NEXT: vstrw.32 q0, [r1, #176]
-; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload
-; CHECK-NEXT: vmov.32 q3[2], r2
-; CHECK-NEXT: vmov.32 q4[1], r4
+; CHECK-NEXT: vldrw.u32 q0, [sp, #144] @ 16-byte Reload
+; CHECK-NEXT: vmov.32 q4[2], r2
+; CHECK-NEXT: vmov.32 q5[1], r4
; CHECK-NEXT: vmov.32 q0[2], r12
-; CHECK-NEXT: vstrw.32 q1, [r1, #80]
-; CHECK-NEXT: vstrw.32 q3, [r1, #96]
-; CHECK-NEXT: vstrw.32 q4, [r1, #112]
-; CHECK-NEXT: vstrw.32 q5, [r1, #144]
+; CHECK-NEXT: vstrw.32 q3, [r1, #80]
+; CHECK-NEXT: vstrw.32 q4, [r1, #96]
+; CHECK-NEXT: vstrw.32 q5, [r1, #112]
+; CHECK-NEXT: vstrw.32 q6, [r1, #144]
; CHECK-NEXT: vstrw.32 q0, [r1]
-; CHECK-NEXT: add sp, #160
+; CHECK-NEXT: add sp, #176
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: pop {r4, pc}
entry:
@@ -402,10 +406,10 @@ define void @vst3_v16i16(ptr %src, ptr %dst) {
; CHECK-NEXT: .pad #48
; CHECK-NEXT: sub sp, #48
; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
-; CHECK-NEXT: vldrw.u32 q7, [r0]
+; CHECK-NEXT: vldrw.u32 q6, [r0]
; CHECK-NEXT: vldrw.u32 q3, [r0, #16]
-; CHECK-NEXT: vldrw.u32 q6, [r0, #80]
-; CHECK-NEXT: vmov.f32 s0, s31
+; CHECK-NEXT: vldrw.u32 q7, [r0, #80]
+; CHECK-NEXT: vmov.f32 s0, s27
; CHECK-NEXT: vmov.u16 r2, q2[5]
; CHECK-NEXT: vmov.16 q1[0], r2
; CHECK-NEXT: vins.f16 s0, s11
@@ -418,7 +422,7 @@ define void @vst3_v16i16(ptr %src, ptr %dst) {
; CHECK-NEXT: vmovx.f16 s0, s10
; CHECK-NEXT: vmov.f32 s6, s11
; CHECK-NEXT: vins.f16 s4, s0
-; CHECK-NEXT: vmovx.f16 s0, s31
+; CHECK-NEXT: vmovx.f16 s0, s27
; CHECK-NEXT: vins.f16 s6, s0
; CHECK-NEXT: vmovx.f16 s11, s11
; CHECK-NEXT: vmov q0, q1
@@ -430,23 +434,23 @@ define void @vst3_v16i16(ptr %src, ptr %dst) {
; CHECK-NEXT: vmov.16 q4[0], r2
; CHECK-NEXT: vmov.u16 r2, q1[7]
; CHECK-NEXT: vmov.f32 s17, s20
-; CHECK-NEXT: vmovx.f16 s20, s27
+; CHECK-NEXT: vmovx.f16 s20, s31
; CHECK-NEXT: vmov.16 q4[6], r2
; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
; CHECK-NEXT: vins.f16 s19, s20
-; CHECK-NEXT: vmov.f64 d10, d14
+; CHECK-NEXT: vmov.f64 d10, d12
; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT: vins.f16 s27, s15
-; CHECK-NEXT: vmovx.f16 s18, s26
-; CHECK-NEXT: vins.f16 s28, s0
+; CHECK-NEXT: vins.f16 s31, s15
+; CHECK-NEXT: vmovx.f16 s18, s30
+; CHECK-NEXT: vins.f16 s24, s0
; CHECK-NEXT: vmov.u16 r0, q0[1]
-; CHECK-NEXT: vmov.f32 s22, s30
-; CHECK-NEXT: vmov.16 q7[4], r0
+; CHECK-NEXT: vmov.f32 s22, s26
+; CHECK-NEXT: vmov.16 q6[4], r0
; CHECK-NEXT: vmovx.f16 s15, s20
; CHECK-NEXT: vstrw.32 q5, [sp] @ 16-byte Spill
; CHECK-NEXT: vmovx.f16 s20, s8
; CHECK-NEXT: vmov.u16 r0, q1[1]
-; CHECK-NEXT: vins.f16 s30, s20
+; CHECK-NEXT: vins.f16 s26, s20
; CHECK-NEXT: vmov.f32 s20, s12
; CHECK-NEXT: vins.f16 s20, s4
; CHECK-NEXT: vmov.f32 s11, s21
@@ -456,24 +460,24 @@ define void @vst3_v16i16(ptr %src, ptr %dst) {
; CHECK-NEXT: vmov.16 q0[2], r0
; CHECK-NEXT: vmovx.f16 s12, s12
; CHECK-NEXT: vins.f16 s8, s15
-; CHECK-NEXT: vmov.f32 s31, s11
-; CHECK-NEXT: vmovx.f16 s11, s24
-; CHECK-NEXT: vins.f16 s24, s12
+; CHECK-NEXT: vmov.f32 s27, s11
+; CHECK-NEXT: vmovx.f16 s11, s28
+; CHECK-NEXT: vins.f16 s28, s12
; CHECK-NEXT: vmov.f32 s12, s14
-; CHECK-NEXT: vmovx.f16 s2, s25
-; CHECK-NEXT: vmov.f32 s29, s8
+; CHECK-NEXT: vmovx.f16 s2, s29
+; CHECK-NEXT: vmov.f32 s25, s8
; CHECK-NEXT: vmov.f32 s8, s13
; CHECK-NEXT: vins.f16 s1, s2
; CHECK-NEXT: vmovx.f16 s2, s14
; CHECK-NEXT: vins.f16 s12, s6
; CHECK-NEXT: vins.f16 s8, s5
-; CHECK-NEXT: vins.f16 s26, s2
+; CHECK-NEXT: vins.f16 s30, s2
; CHECK-NEXT: vmovx.f16 s0, s13
; CHECK-NEXT: vmov.f32 s2, s12
; CHECK-NEXT: vldrw.u32 q3, [sp, #32] @ 16-byte Reload
; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload
; CHECK-NEXT: vmov.f32 s23, s8
-; CHECK-NEXT: vins.f16 s25, s0
+; CHECK-NEXT: vins.f16 s29, s0
; CHECK-NEXT: vmov.f32 s8, s6
; CHECK-NEXT: vmov.u16 r0, q3[3]
; CHECK-NEXT: vins.f16 s8, s14
@@ -486,18 +490,18 @@ define void @vst3_v16i16(ptr %src, ptr %dst) {
; CHECK-NEXT: vins.f16 s10, s14
; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload
; CHECK-NEXT: vins.f16 s16, s18
-; CHECK-NEXT: vmov.f32 s18, s27
+; CHECK-NEXT: vmov.f32 s18, s31
; CHECK-NEXT: vins.f16 s22, s11
-; CHECK-NEXT: vmov.f32 s21, s24
+; CHECK-NEXT: vmov.f32 s21, s28
; CHECK-NEXT: vstrw.32 q1, [r1, #32]
-; CHECK-NEXT: vmov.f32 s0, s25
+; CHECK-NEXT: vmov.f32 s0, s29
; CHECK-NEXT: vstrw.32 q5, [r1, #48]
-; CHECK-NEXT: vmov.f32 s3, s26
+; CHECK-NEXT: vmov.f32 s3, s30
; CHECK-NEXT: vstrw.32 q4, [r1, #80]
; CHECK-NEXT: vmov.f32 s12, s9
; CHECK-NEXT: vstrw.32 q0, [r1, #64]
; CHECK-NEXT: vmov.f32 s14, s8
-; CHECK-NEXT: vstrw.32 q7, [r1]
+; CHECK-NEXT: vstrw.32 q6, [r1]
; CHECK-NEXT: vmov.f32 s15, s10
; CHECK-NEXT: vstrw.32 q3, [r1, #16]
; CHECK-NEXT: add sp, #48
@@ -894,28 +898,28 @@ define void @vst3_v4i64(ptr %src, ptr %dst) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: vldrw.u32 q7, [r0, #48]
-; CHECK-NEXT: vldrw.u32 q6, [r0, #32]
+; CHECK-NEXT: vldrw.u32 q6, [r0, #48]
+; CHECK-NEXT: vldrw.u32 q7, [r0, #32]
; CHECK-NEXT: vldrw.u32 q0, [r0, #80]
-; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: vmov.f64 d6, d15
-; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT: vldrw.u32 q2, [r0]
+; CHECK-NEXT: vmov.f64 d2, d13
+; CHECK-NEXT: vldrw.u32 q3, [r0, #16]
; CHECK-NEXT: vldrw.u32 q4, [r0, #64]
-; CHECK-NEXT: vmov.f64 d15, d13
-; CHECK-NEXT: vmov.f64 d7, d1
-; CHECK-NEXT: vmov.f64 d10, d2
-; CHECK-NEXT: vstrw.32 q3, [r1, #80]
-; CHECK-NEXT: vmov.f64 d11, d12
-; CHECK-NEXT: vmov.f64 d2, d8
+; CHECK-NEXT: vmov.f64 d13, d15
+; CHECK-NEXT: vmov.f64 d3, d1
+; CHECK-NEXT: vmov.f64 d10, d4
+; CHECK-NEXT: vstrw.32 q1, [r1, #80]
+; CHECK-NEXT: vmov.f64 d11, d14
+; CHECK-NEXT: vmov.f64 d4, d8
; CHECK-NEXT: vstrw.32 q5, [r1]
-; CHECK-NEXT: vmov.f64 d1, d5
-; CHECK-NEXT: vstrw.32 q1, [r1, #16]
-; CHECK-NEXT: vmov.f64 d8, d15
+; CHECK-NEXT: vmov.f64 d1, d7
+; CHECK-NEXT: vstrw.32 q2, [r1, #16]
+; CHECK-NEXT: vmov.f64 d8, d13
; CHECK-NEXT: vstrw.32 q0, [r1, #64]
-; CHECK-NEXT: vmov.f64 d12, d4
+; CHECK-NEXT: vmov.f64 d14, d6
; CHECK-NEXT: vstrw.32 q4, [r1, #32]
-; CHECK-NEXT: vmov.f64 d13, d14
-; CHECK-NEXT: vstrw.32 q6, [r1, #48]
+; CHECK-NEXT: vmov.f64 d15, d12
+; CHECK-NEXT: vstrw.32 q7, [r1, #48]
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: bx lr
entry:
@@ -1003,39 +1007,39 @@ define void @vst3_v8f32(ptr %src, ptr %dst) {
; CHECK-NEXT: .pad #32
; CHECK-NEXT: sub sp, #32
; CHECK-NEXT: vldrw.u32 q0, [r0, #80]
-; CHECK-NEXT: vldrw.u32 q2, [r0, #48]
-; CHECK-NEXT: vldrw.u32 q3, [r0, #16]
+; CHECK-NEXT: vldrw.u32 q3, [r0, #48]
+; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
; CHECK-NEXT: vmov.f32 s0, s2
; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s1, s15
-; CHECK-NEXT: vmov.f32 s2, s11
+; CHECK-NEXT: vmov.f32 s1, s11
+; CHECK-NEXT: vmov.f32 s2, s15
; CHECK-NEXT: vldrw.u32 q7, [r0, #64]
; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q4, [r0, #32]
-; CHECK-NEXT: vmov.f32 s0, s12
-; CHECK-NEXT: vmov.f32 s1, s8
-; CHECK-NEXT: vmov.f32 s3, s13
+; CHECK-NEXT: vmov.f32 s0, s8
+; CHECK-NEXT: vmov.f32 s1, s12
+; CHECK-NEXT: vmov.f32 s3, s9
; CHECK-NEXT: vmov.f32 s2, s24
; CHECK-NEXT: vstrw.32 q0, [r1, #48]
; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
; CHECK-NEXT: vmov.f32 s20, s4
; CHECK-NEXT: vmov.f32 s23, s5
; CHECK-NEXT: vstrw.32 q0, [r1, #80]
-; CHECK-NEXT: vmov.f32 s12, s9
-; CHECK-NEXT: vmov.f32 s15, s10
-; CHECK-NEXT: vmov.f32 s13, s25
-; CHECK-NEXT: vmov.f32 s9, s7
-; CHECK-NEXT: vstrw.32 q3, [r1, #64]
+; CHECK-NEXT: vmov.f32 s8, s13
+; CHECK-NEXT: vmov.f32 s11, s14
+; CHECK-NEXT: vmov.f32 s9, s25
+; CHECK-NEXT: vmov.f32 s13, s7
+; CHECK-NEXT: vstrw.32 q2, [r1, #64]
; CHECK-NEXT: vmov.f32 s21, s16
; CHECK-NEXT: vmov.f32 s22, s28
-; CHECK-NEXT: vmov.f32 s8, s30
+; CHECK-NEXT: vmov.f32 s12, s30
; CHECK-NEXT: vstrw.32 q5, [r1]
-; CHECK-NEXT: vmov.f32 s10, s19
-; CHECK-NEXT: vmov.f32 s11, s31
+; CHECK-NEXT: vmov.f32 s14, s19
+; CHECK-NEXT: vmov.f32 s15, s31
; CHECK-NEXT: vmov.f32 s5, s29
-; CHECK-NEXT: vstrw.32 q2, [r1, #32]
+; CHECK-NEXT: vstrw.32 q3, [r1, #32]
; CHECK-NEXT: vmov.f32 s4, s17
; CHECK-NEXT: vmov.f32 s7, s18
; CHECK-NEXT: vstrw.32 q1, [r1, #16]
@@ -1254,58 +1258,58 @@ entry:
define void @vst3_v8f16(ptr %src, ptr %dst) {
; CHECK-LABEL: vst3_v8f16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9, d10, d11}
-; CHECK-NEXT: vpush {d8, d9, d10, d11}
-; CHECK-NEXT: vldrw.u32 q5, [r0, #16]
+; CHECK-NEXT: .vsave {d8, d9}
+; CHECK-NEXT: vpush {d8, d9}
+; CHECK-NEXT: vldrw.u32 q3, [r0, #16]
; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: vmovx.f16 s0, s22
+; CHECK-NEXT: vmovx.f16 s0, s14
; CHECK-NEXT: vmov.f32 s8, s7
; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vins.f16 s8, s23
+; CHECK-NEXT: vins.f16 s8, s15
; CHECK-NEXT: vmov.16 q0[0], r2
; CHECK-NEXT: vmov.f32 s16, s4
; CHECK-NEXT: vmov.f32 s1, s8
-; CHECK-NEXT: vmovx.f16 s8, s23
+; CHECK-NEXT: vmovx.f16 s8, s15
; CHECK-NEXT: vmov r2, s8
; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
; CHECK-NEXT: vmov.16 q0[6], r2
-; CHECK-NEXT: vins.f16 s16, s20
+; CHECK-NEXT: vins.f16 s16, s12
; CHECK-NEXT: vmovx.f16 s2, s10
-; CHECK-NEXT: vmovx.f16 s12, s11
; CHECK-NEXT: vins.f16 s0, s2
; CHECK-NEXT: vmovx.f16 s2, s7
+; CHECK-NEXT: vmovx.f16 s7, s11
; CHECK-NEXT: vins.f16 s11, s2
-; CHECK-NEXT: vmovx.f16 s2, s20
+; CHECK-NEXT: vmovx.f16 s2, s12
+; CHECK-NEXT: vins.f16 s3, s7
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: vmovx.f16 s2, s4
; CHECK-NEXT: vmovx.f16 s4, s8
-; CHECK-NEXT: vins.f16 s8, s2
-; CHECK-NEXT: vmovx.f16 s2, s21
; CHECK-NEXT: vmov.16 q4[4], r0
-; CHECK-NEXT: vmov r0, s2
-; CHECK-NEXT: vmovx.f16 s2, s5
-; CHECK-NEXT: vins.f16 s3, s12
-; CHECK-NEXT: vmov.f32 s12, s5
+; CHECK-NEXT: vins.f16 s8, s2
+; CHECK-NEXT: vmovx.f16 s2, s13
+; CHECK-NEXT: vmov.f32 s7, s5
; CHECK-NEXT: vins.f16 s18, s4
; CHECK-NEXT: vmov.f32 s4, s6
-; CHECK-NEXT: vmovx.f16 s14, s9
+; CHECK-NEXT: vins.f16 s7, s13
+; CHECK-NEXT: vmov r0, s2
+; CHECK-NEXT: vins.f16 s4, s14
+; CHECK-NEXT: vmov.16 q3[2], r0
+; CHECK-NEXT: vmovx.f16 s2, s5
+; CHECK-NEXT: vmovx.f16 s12, s9
; CHECK-NEXT: vins.f16 s9, s2
; CHECK-NEXT: vmovx.f16 s2, s6
-; CHECK-NEXT: vins.f16 s12, s21
-; CHECK-NEXT: vins.f16 s4, s22
-; CHECK-NEXT: vmov.16 q5[2], r0
+; CHECK-NEXT: vins.f16 s13, s12
; CHECK-NEXT: vins.f16 s10, s2
; CHECK-NEXT: vmov.f32 s2, s11
-; CHECK-NEXT: vins.f16 s21, s14
-; CHECK-NEXT: vmov.f32 s20, s9
-; CHECK-NEXT: vmov.f32 s22, s4
+; CHECK-NEXT: vmov.f32 s12, s9
; CHECK-NEXT: vstrw.32 q0, [r1, #32]
-; CHECK-NEXT: vmov.f32 s23, s10
+; CHECK-NEXT: vmov.f32 s14, s4
+; CHECK-NEXT: vmov.f32 s15, s10
; CHECK-NEXT: vmov.f32 s17, s8
-; CHECK-NEXT: vstrw.32 q5, [r1, #16]
-; CHECK-NEXT: vmov.f32 s19, s12
+; CHECK-NEXT: vstrw.32 q3, [r1, #16]
+; CHECK-NEXT: vmov.f32 s19, s7
; CHECK-NEXT: vstrw.32 q4, [r1]
-; CHECK-NEXT: vpop {d8, d9, d10, d11}
+; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
entry:
%l1 = load <8 x half>, ptr %src, align 4
@@ -1329,54 +1333,55 @@ define void @vst3_v16f16(ptr %src, ptr %dst) {
; CHECK-NEXT: sub sp, #72
; CHECK-NEXT: vldrw.u32 q5, [r0, #16]
; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT: vldrw.u32 q4, [r0]
; CHECK-NEXT: vldrw.u32 q2, [r0, #80]
-; CHECK-NEXT: vldrw.u32 q7, [r0, #32]
; CHECK-NEXT: vmov.f32 s0, s20
-; CHECK-NEXT: vldrw.u32 q4, [r0, #64]
+; CHECK-NEXT: vldrw.u32 q6, [r0, #32]
; CHECK-NEXT: vins.f16 s0, s4
; CHECK-NEXT: vmovx.f16 s2, s8
; CHECK-NEXT: vmov.f32 s12, s0
; CHECK-NEXT: vmov.f32 s0, s21
; CHECK-NEXT: vins.f16 s0, s5
+; CHECK-NEXT: vmov.f64 d15, d9
; CHECK-NEXT: vstr s0, [sp, #68] @ 4-byte Spill
; CHECK-NEXT: vmovx.f16 s0, s4
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: vmovx.f16 s0, s20
-; CHECK-NEXT: vmov.16 q3[4], r2
; CHECK-NEXT: vins.f16 s8, s0
+; CHECK-NEXT: vmov.f32 s0, s17
+; CHECK-NEXT: vmov.16 q3[4], r2
+; CHECK-NEXT: vins.f16 s0, s25
; CHECK-NEXT: vins.f16 s14, s2
-; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vstr s0, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: vmovx.f16 s0, s24
; CHECK-NEXT: vstrw.32 q3, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT: vmov.f32 s12, s0
-; CHECK-NEXT: vmov q6, q0
-; CHECK-NEXT: vmovx.f16 s0, s28
-; CHECK-NEXT: vins.f16 s12, s28
+; CHECK-NEXT: vmov.f32 s12, s16
; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: vmovx.f16 s0, s16
+; CHECK-NEXT: vmov.f32 s29, s17
+; CHECK-NEXT: vldrw.u32 q4, [r0, #64]
+; CHECK-NEXT: vins.f16 s12, s24
+; CHECK-NEXT: vmov.f32 s4, s23
; CHECK-NEXT: vmovx.f16 s2, s16
; CHECK-NEXT: vmov.16 q3[4], r2
-; CHECK-NEXT: vmovx.f16 s0, s24
; CHECK-NEXT: vins.f16 s14, s2
-; CHECK-NEXT: vmovx.f16 s2, s30
+; CHECK-NEXT: vmovx.f16 s2, s26
; CHECK-NEXT: vins.f16 s16, s0
-; CHECK-NEXT: vmov.f32 s0, s27
+; CHECK-NEXT: vmov.f32 s0, s31
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: vstrw.32 q3, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT: vins.f16 s0, s31
+; CHECK-NEXT: vins.f16 s0, s27
; CHECK-NEXT: vmov.16 q3[0], r0
; CHECK-NEXT: vmov.f32 s13, s0
-; CHECK-NEXT: vmovx.f16 s0, s31
+; CHECK-NEXT: vmovx.f16 s0, s27
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vmovx.f16 s0, s18
; CHECK-NEXT: vmov.16 q3[6], r0
-; CHECK-NEXT: vmov.f32 s4, s1
-; CHECK-NEXT: vins.f16 s12, s0
-; CHECK-NEXT: vmovx.f16 s0, s27
; CHECK-NEXT: vmovx.f16 s2, s19
+; CHECK-NEXT: vins.f16 s12, s0
+; CHECK-NEXT: vmovx.f16 s0, s31
; CHECK-NEXT: vins.f16 s19, s0
-; CHECK-NEXT: vins.f16 s4, s29
; CHECK-NEXT: vmovx.f16 s0, s6
-; CHECK-NEXT: vstr s4, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT: vmov.f32 s4, s23
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vins.f16 s15, s2
; CHECK-NEXT: vins.f16 s4, s7
@@ -1386,7 +1391,7 @@ define void @vst3_v16f16(ptr %src, ptr %dst) {
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: vmovx.f16 s4, s11
; CHECK-NEXT: vmov.16 q0[6], r0
-; CHECK-NEXT: vstrw.32 q6, [sp, #8] @ 16-byte Spill
+; CHECK-NEXT: vstrw.32 q7, [sp, #8] @ 16-byte Spill
; CHECK-NEXT: vmovx.f16 s2, s10
; CHECK-NEXT: vins.f16 s3, s4
; CHECK-NEXT: vins.f16 s0, s2
@@ -1394,11 +1399,11 @@ define void @vst3_v16f16(ptr %src, ptr %dst) {
; CHECK-NEXT: vins.f16 s11, s2
; CHECK-NEXT: vmov.f32 s2, s22
; CHECK-NEXT: vins.f16 s2, s6
-; CHECK-NEXT: vldrw.u32 q6, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload
; CHECK-NEXT: vstr s2, [sp, #28] @ 4-byte Spill
; CHECK-NEXT: vmovx.f16 s2, s5
; CHECK-NEXT: vmov r0, s2
-; CHECK-NEXT: vldr s27, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: vldr s31, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: vmov.16 q1[2], r0
; CHECK-NEXT: vmov.f32 s2, s11
; CHECK-NEXT: vmovx.f16 s4, s21
@@ -1407,22 +1412,22 @@ define void @vst3_v16f16(ptr %src, ptr %dst) {
; CHECK-NEXT: vmovx.f16 s4, s22
; CHECK-NEXT: vldrw.u32 q5, [sp, #8] @ 16-byte Reload
; CHECK-NEXT: vins.f16 s10, s4
-; CHECK-NEXT: vmovx.f16 s4, s29
+; CHECK-NEXT: vmovx.f16 s4, s25
; CHECK-NEXT: vins.f16 s5, s6
; CHECK-NEXT: vmov.f32 s11, s22
; CHECK-NEXT: vmovx.f16 s6, s17
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: vmovx.f16 s4, s21
-; CHECK-NEXT: vins.f16 s11, s30
-; CHECK-NEXT: vmov.16 q7[2], r0
+; CHECK-NEXT: vins.f16 s11, s26
+; CHECK-NEXT: vmov.16 q6[2], r0
; CHECK-NEXT: vins.f16 s17, s4
; CHECK-NEXT: vmovx.f16 s4, s22
; CHECK-NEXT: vldrw.u32 q5, [sp, #48] @ 16-byte Reload
; CHECK-NEXT: vins.f16 s18, s4
-; CHECK-NEXT: vins.f16 s29, s6
+; CHECK-NEXT: vins.f16 s25, s6
; CHECK-NEXT: vldr s23, [sp, #68] @ 4-byte Reload
; CHECK-NEXT: vldr s6, [sp, #28] @ 4-byte Reload
-; CHECK-NEXT: vmov.f32 s25, s16
+; CHECK-NEXT: vmov.f32 s29, s16
; CHECK-NEXT: vmov.f32 s14, s19
; CHECK-NEXT: vstrw.32 q0, [r1, #80]
; CHECK-NEXT: vmov.f32 s21, s8
@@ -1430,12 +1435,12 @@ define void @vst3_v16f16(ptr %src, ptr %dst) {
; CHECK-NEXT: vmov.f32 s4, s9
; CHECK-NEXT: vstrw.32 q5, [r1, #48]
; CHECK-NEXT: vmov.f32 s7, s10
-; CHECK-NEXT: vstrw.32 q6, [r1]
-; CHECK-NEXT: vmov.f32 s28, s17
+; CHECK-NEXT: vstrw.32 q7, [r1]
+; CHECK-NEXT: vmov.f32 s24, s17
; CHECK-NEXT: vstrw.32 q1, [r1, #64]
-; CHECK-NEXT: vmov.f32 s30, s11
-; CHECK-NEXT: vmov.f32 s31, s18
-; CHECK-NEXT: vstrw.32 q7, [r1, #16]
+; CHECK-NEXT: vmov.f32 s26, s11
+; CHECK-NEXT: vmov.f32 s27, s18
+; CHECK-NEXT: vstrw.32 q6, [r1, #16]
; CHECK-NEXT: add sp, #72
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: bx lr
@@ -1486,28 +1491,28 @@ define void @vst3_v4f64(ptr %src, ptr %dst) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: vldrw.u32 q7, [r0, #48]
-; CHECK-NEXT: vldrw.u32 q6, [r0, #32]
+; CHECK-NEXT: vldrw.u32 q6, [r0, #48]
+; CHECK-NEXT: vldrw.u32 q7, [r0, #32]
; CHECK-NEXT: vldrw.u32 q0, [r0, #80]
-; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: vmov.f64 d6, d15
-; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT: vldrw.u32 q2, [r0]
+; CHECK-NEXT: vmov.f64 d2, d13
+; CHECK-NEXT: vldrw.u32 q3, [r0, #16]
; CHECK-NEXT: vldrw.u32 q4, [r0, #64]
-; CHECK-NEXT: vmov.f64 d15, d13
-; CHECK-NEXT: vmov.f64 d7, d1
-; CHECK-NEXT: vmov.f64 d10, d2
-; CHECK-NEXT: vstrw.32 q3, [r1, #80]
-; CHECK-NEXT: vmov.f64 d11, d12
-; CHECK-NEXT: vmov.f64 d2, d8
+; CHECK-NEXT: vmov.f64 d13, d15
+; CHECK-NEXT: vmov.f64 d3, d1
+; CHECK-NEXT: vmov.f64 d10, d4
+; CHECK-NEXT: vstrw.32 q1, [r1, #80]
+; CHECK-NEXT: vmov.f64 d11, d14
+; CHECK-NEXT: vmov.f64 d4, d8
; CHECK-NEXT: vstrw.32 q5, [r1]
-; CHECK-NEXT: vmov.f64 d1, d5
-; CHECK-NEXT: vstrw.32 q1, [r1, #16]
-; CHECK-NEXT: vmov.f64 d8, d15
+; CHECK-NEXT: vmov.f64 d1, d7
+; CHECK-NEXT: vstrw.32 q2, [r1, #16]
+; CHECK-NEXT: vmov.f64 d8, d13
; CHECK-NEXT: vstrw.32 q0, [r1, #64]
-; CHECK-NEXT: vmov.f64 d12, d4
+; CHECK-NEXT: vmov.f64 d14, d6
; CHECK-NEXT: vstrw.32 q4, [r1, #32]
-; CHECK-NEXT: vmov.f64 d13, d14
-; CHECK-NEXT: vstrw.32 q6, [r1, #48]
+; CHECK-NEXT: vmov.f64 d15, d12
+; CHECK-NEXT: vstrw.32 q7, [r1, #48]
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: bx lr
entry:
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4-post.ll b/llvm/test/CodeGen/Thumb2/mve-vst4-post.ll
index 869c9cb7afce8..fab8c311794d3 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst4-post.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst4-post.ll
@@ -102,19 +102,19 @@ define ptr @vst4_v2i64(ptr %src, ptr %dst) {
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT: vldrw.u32 q3, [r0]
+; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vldrw.u32 q2, [r0, #48]
-; CHECK-NEXT: vldrw.u32 q4, [r0, #32]
-; CHECK-NEXT: vmov.f64 d2, d6
-; CHECK-NEXT: vmov.f64 d3, d0
-; CHECK-NEXT: vmov.f64 d0, d7
-; CHECK-NEXT: vmov.f64 d7, d4
+; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT: vmov.f64 d8, d2
+; CHECK-NEXT: vmov.f64 d9, d0
+; CHECK-NEXT: vmov.f64 d0, d3
+; CHECK-NEXT: vmov.f64 d3, d4
; CHECK-NEXT: vstrw.32 q0, [r1, #32]
-; CHECK-NEXT: vmov.f64 d6, d8
-; CHECK-NEXT: vmov.f64 d4, d9
-; CHECK-NEXT: vstrw.32 q3, [r1, #16]
+; CHECK-NEXT: vmov.f64 d2, d6
+; CHECK-NEXT: vmov.f64 d4, d7
+; CHECK-NEXT: vstrw.32 q1, [r1, #16]
; CHECK-NEXT: vstrw.32 q2, [r1, #48]
-; CHECK-NEXT: vstrw.32 q1, [r1], #64
+; CHECK-NEXT: vstrw.32 q4, [r1], #64
; CHECK-NEXT: mov r0, r1
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
@@ -204,19 +204,19 @@ define ptr @vst4_v2f64(ptr %src, ptr %dst) {
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT: vldrw.u32 q3, [r0]
+; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vldrw.u32 q2, [r0, #48]
-; CHECK-NEXT: vldrw.u32 q4, [r0, #32]
-; CHECK-NEXT: vmov.f64 d2, d6
-; CHECK-NEXT: vmov.f64 d3, d0
-; CHECK-NEXT: vmov.f64 d0, d7
-; CHECK-NEXT: vmov.f64 d7, d4
+; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT: vmov.f64 d8, d2
+; CHECK-NEXT: vmov.f64 d9, d0
+; CHECK-NEXT: vmov.f64 d0, d3
+; CHECK-NEXT: vmov.f64 d3, d4
; CHECK-NEXT: vstrw.32 q0, [r1, #32]
-; CHECK-NEXT: vmov.f64 d6, d8
-; CHECK-NEXT: vmov.f64 d4, d9
-; CHECK-NEXT: vstrw.32 q3, [r1, #16]
+; CHECK-NEXT: vmov.f64 d2, d6
+; CHECK-NEXT: vmov.f64 d4, d7
+; CHECK-NEXT: vstrw.32 q1, [r1, #16]
; CHECK-NEXT: vstrw.32 q2, [r1, #48]
-; CHECK-NEXT: vstrw.32 q1, [r1], #64
+; CHECK-NEXT: vstrw.32 q4, [r1], #64
; CHECK-NEXT: mov r0, r1
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
index 9dba1cc2d6ed7..d96af49060efd 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
@@ -683,19 +683,19 @@ define void @vst4_v2i64(ptr %src, ptr %dst) {
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
-; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
-; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
; CHECK-NEXT: vldrw.u32 q3, [r0]
; CHECK-NEXT: vmov.f64 d9, d0
-; CHECK-NEXT: vmov.f64 d8, d4
-; CHECK-NEXT: vmov.f64 d11, d2
+; CHECK-NEXT: vmov.f64 d8, d2
+; CHECK-NEXT: vmov.f64 d11, d4
; CHECK-NEXT: vstrw.32 q4, [r1, #16]
; CHECK-NEXT: vmov.f64 d10, d6
-; CHECK-NEXT: vmov.f64 d0, d5
+; CHECK-NEXT: vmov.f64 d0, d3
; CHECK-NEXT: vstrw.32 q5, [r1]
-; CHECK-NEXT: vmov.f64 d2, d7
+; CHECK-NEXT: vmov.f64 d4, d7
; CHECK-NEXT: vstrw.32 q0, [r1, #48]
-; CHECK-NEXT: vstrw.32 q1, [r1, #32]
+; CHECK-NEXT: vstrw.32 q2, [r1, #32]
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: bx lr
entry:
@@ -721,39 +721,39 @@ define void @vst4_v4i64(ptr %src, ptr %dst) {
; CHECK-NEXT: .pad #64
; CHECK-NEXT: sub sp, #64
; CHECK-NEXT: vldrw.u32 q7, [r0, #80]
-; CHECK-NEXT: vldrw.u32 q5, [r0, #32]
-; CHECK-NEXT: vldrw.u32 q6, [r0]
+; CHECK-NEXT: vldrw.u32 q4, [r0, #32]
+; CHECK-NEXT: vldrw.u32 q2, [r0]
; CHECK-NEXT: vldrw.u32 q1, [r0, #96]
; CHECK-NEXT: vstrw.32 q7, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT: vmov.f64 d15, d10
-; CHECK-NEXT: vldrw.u32 q2, [r0, #64]
+; CHECK-NEXT: vmov.f64 d15, d8
+; CHECK-NEXT: vldrw.u32 q6, [r0, #64]
; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
; CHECK-NEXT: vldrw.u32 q3, [r0, #48]
-; CHECK-NEXT: vldrw.u32 q4, [r0, #112]
+; CHECK-NEXT: vldrw.u32 q5, [r0, #112]
; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT: vmov.f64 d14, d12
-; CHECK-NEXT: vstrw.32 q7, [sp, #48] @ 16-byte Spill
; CHECK-NEXT: vmov.f64 d14, d4
+; CHECK-NEXT: vstrw.32 q7, [sp, #48] @ 16-byte Spill
; CHECK-NEXT: vmov.f64 d15, d2
+; CHECK-NEXT: vmov.f64 d14, d12
; CHECK-NEXT: vstrw.32 q7, [sp] @ 16-byte Spill
; CHECK-NEXT: vmov.f64 d4, d0
; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload
; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT: vmov.f64 d10, d13
-; CHECK-NEXT: vmov.f64 d2, d5
-; CHECK-NEXT: vstrw.32 q5, [r1, #32]
+; CHECK-NEXT: vmov.f64 d8, d5
+; CHECK-NEXT: vmov.f64 d2, d13
+; CHECK-NEXT: vstrw.32 q4, [r1, #32]
; CHECK-NEXT: vmov.f64 d5, d6
; CHECK-NEXT: vstrw.32 q1, [r1, #48]
-; CHECK-NEXT: vmov.f64 d13, d8
+; CHECK-NEXT: vmov.f64 d13, d10
; CHECK-NEXT: vstrw.32 q2, [r1, #64]
; CHECK-NEXT: vmov.f64 d12, d0
-; CHECK-NEXT: vmov.f64 d8, d1
+; CHECK-NEXT: vmov.f64 d10, d1
; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload
; CHECK-NEXT: vstrw.32 q6, [r1, #80]
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
; CHECK-NEXT: vmov.f64 d6, d15
-; CHECK-NEXT: vstrw.32 q4, [r1, #112]
+; CHECK-NEXT: vstrw.32 q5, [r1, #112]
; CHECK-NEXT: vstrw.32 q0, [r1, #16]
; CHECK-NEXT: vstrw.32 q3, [r1, #96]
; CHECK-NEXT: add sp, #64
@@ -1160,53 +1160,53 @@ define void @vst4_v8f16_align1(ptr %src, ptr %dst) {
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
-; CHECK-NEXT: vldrw.u32 q5, [r0, #48]
+; CHECK-NEXT: vldrw.u32 q4, [r0, #48]
; CHECK-NEXT: vldrw.u32 q2, [r0]
-; CHECK-NEXT: vldrw.u32 q6, [r0, #16]
+; CHECK-NEXT: vldrw.u32 q5, [r0, #16]
; CHECK-NEXT: vmovx.f16 s3, s5
-; CHECK-NEXT: vmovx.f16 s0, s21
+; CHECK-NEXT: vmovx.f16 s0, s17
; CHECK-NEXT: vins.f16 s3, s0
; CHECK-NEXT: vmovx.f16 s2, s9
-; CHECK-NEXT: vmovx.f16 s0, s25
+; CHECK-NEXT: vmovx.f16 s0, s21
; CHECK-NEXT: vmovx.f16 s15, s4
; CHECK-NEXT: vins.f16 s2, s0
-; CHECK-NEXT: vmovx.f16 s0, s20
+; CHECK-NEXT: vmovx.f16 s0, s16
; CHECK-NEXT: vins.f16 s15, s0
; CHECK-NEXT: vmovx.f16 s14, s8
-; CHECK-NEXT: vmovx.f16 s0, s24
-; CHECK-NEXT: vmovx.f16 s19, s7
+; CHECK-NEXT: vmovx.f16 s0, s20
+; CHECK-NEXT: vmovx.f16 s27, s7
; CHECK-NEXT: vins.f16 s14, s0
+; CHECK-NEXT: vmovx.f16 s0, s19
+; CHECK-NEXT: vins.f16 s27, s0
+; CHECK-NEXT: vmovx.f16 s26, s11
; CHECK-NEXT: vmovx.f16 s0, s23
+; CHECK-NEXT: vins.f16 s7, s19
+; CHECK-NEXT: vins.f16 s26, s0
+; CHECK-NEXT: vmovx.f16 s19, s6
+; CHECK-NEXT: vmovx.f16 s0, s18
+; CHECK-NEXT: vins.f16 s5, s17
+; CHECK-NEXT: vins.f16 s6, s18
+; CHECK-NEXT: vins.f16 s4, s16
+; CHECK-NEXT: vins.f16 s8, s20
+; CHECK-NEXT: vins.f16 s11, s23
; CHECK-NEXT: vins.f16 s19, s0
-; CHECK-NEXT: vmovx.f16 s18, s11
-; CHECK-NEXT: vmovx.f16 s0, s27
-; CHECK-NEXT: vins.f16 s7, s23
-; CHECK-NEXT: vins.f16 s18, s0
-; CHECK-NEXT: vmovx.f16 s23, s6
+; CHECK-NEXT: vmovx.f16 s18, s10
+; CHECK-NEXT: vins.f16 s10, s22
; CHECK-NEXT: vmovx.f16 s0, s22
-; CHECK-NEXT: vins.f16 s5, s21
-; CHECK-NEXT: vins.f16 s6, s22
-; CHECK-NEXT: vins.f16 s4, s20
-; CHECK-NEXT: vins.f16 s8, s24
-; CHECK-NEXT: vins.f16 s11, s27
-; CHECK-NEXT: vins.f16 s23, s0
-; CHECK-NEXT: vmovx.f16 s22, s10
-; CHECK-NEXT: vins.f16 s10, s26
-; CHECK-NEXT: vmovx.f16 s0, s26
-; CHECK-NEXT: vins.f16 s9, s25
-; CHECK-NEXT: vins.f16 s22, s0
+; CHECK-NEXT: vins.f16 s9, s21
+; CHECK-NEXT: vins.f16 s18, s0
; CHECK-NEXT: vmov.f32 s0, s9
; CHECK-NEXT: vmov.f32 s1, s5
; CHECK-NEXT: vmov.f32 s13, s4
; CHECK-NEXT: vstrb.8 q0, [r1, #16]
; CHECK-NEXT: vmov.f32 s12, s8
-; CHECK-NEXT: vmov.f32 s17, s7
+; CHECK-NEXT: vmov.f32 s25, s7
; CHECK-NEXT: vstrb.8 q3, [r1]
-; CHECK-NEXT: vmov.f32 s16, s11
-; CHECK-NEXT: vmov.f32 s21, s6
-; CHECK-NEXT: vstrb.8 q4, [r1, #48]
-; CHECK-NEXT: vmov.f32 s20, s10
-; CHECK-NEXT: vstrb.8 q5, [r1, #32]
+; CHECK-NEXT: vmov.f32 s24, s11
+; CHECK-NEXT: vmov.f32 s17, s6
+; CHECK-NEXT: vstrb.8 q6, [r1, #48]
+; CHECK-NEXT: vmov.f32 s16, s10
+; CHECK-NEXT: vstrb.8 q4, [r1, #32]
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: bx lr
entry:
@@ -1232,19 +1232,19 @@ define void @vst4_v2f64(ptr %src, ptr %dst) {
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
-; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
-; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
; CHECK-NEXT: vldrw.u32 q3, [r0]
; CHECK-NEXT: vmov.f64 d9, d0
-; CHECK-NEXT: vmov.f64 d8, d4
-; CHECK-NEXT: vmov.f64 d11, d2
+; CHECK-NEXT: vmov.f64 d8, d2
+; CHECK-NEXT: vmov.f64 d11, d4
; CHECK-NEXT: vstrw.32 q4, [r1, #16]
; CHECK-NEXT: vmov.f64 d10, d6
-; CHECK-NEXT: vmov.f64 d0, d5
+; CHECK-NEXT: vmov.f64 d0, d3
; CHECK-NEXT: vstrw.32 q5, [r1]
-; CHECK-NEXT: vmov.f64 d2, d7
+; CHECK-NEXT: vmov.f64 d4, d7
; CHECK-NEXT: vstrw.32 q0, [r1, #48]
-; CHECK-NEXT: vstrw.32 q1, [r1, #32]
+; CHECK-NEXT: vstrw.32 q2, [r1, #32]
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: bx lr
entry:
@@ -1270,39 +1270,39 @@ define void @vst4_v4f64(ptr %src, ptr %dst) {
; CHECK-NEXT: .pad #64
; CHECK-NEXT: sub sp, #64
; CHECK-NEXT: vldrw.u32 q7, [r0, #80]
-; CHECK-NEXT: vldrw.u32 q5, [r0, #32]
-; CHECK-NEXT: vldrw.u32 q6, [r0]
+; CHECK-NEXT: vldrw.u32 q4, [r0, #32]
+; CHECK-NEXT: vldrw.u32 q2, [r0]
; CHECK-NEXT: vldrw.u32 q1, [r0, #96]
; CHECK-NEXT: vstrw.32 q7, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT: vmov.f64 d15, d10
-; CHECK-NEXT: vldrw.u32 q2, [r0, #64]
+; CHECK-NEXT: vmov.f64 d15, d8
+; CHECK-NEXT: vldrw.u32 q6, [r0, #64]
; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
; CHECK-NEXT: vldrw.u32 q3, [r0, #48]
-; CHECK-NEXT: vldrw.u32 q4, [r0, #112]
+; CHECK-NEXT: vldrw.u32 q5, [r0, #112]
; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT: vmov.f64 d14, d12
-; CHECK-NEXT: vstrw.32 q7, [sp, #48] @ 16-byte Spill
; CHECK-NEXT: vmov.f64 d14, d4
+; CHECK-NEXT: vstrw.32 q7, [sp, #48] @ 16-byte Spill
; CHECK-NEXT: vmov.f64 d15, d2
+; CHECK-NEXT: vmov.f64 d14, d12
; CHECK-NEXT: vstrw.32 q7, [sp] @ 16-byte Spill
; CHECK-NEXT: vmov.f64 d4, d0
; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload
; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT: vmov.f64 d10, d13
-; CHECK-NEXT: vmov.f64 d2, d5
-; CHECK-NEXT: vstrw.32 q5, [r1, #32]
+; CHECK-NEXT: vmov.f64 d8, d5
+; CHECK-NEXT: vmov.f64 d2, d13
+; CHECK-NEXT: vstrw.32 q4, [r1, #32]
; CHECK-NEXT: vmov.f64 d5, d6
; CHECK-NEXT: vstrw.32 q1, [r1, #48]
-; CHECK-NEXT: vmov.f64 d13, d8
+; CHECK-NEXT: vmov.f64 d13, d10
; CHECK-NEXT: vstrw.32 q2, [r1, #64]
; CHECK-NEXT: vmov.f64 d12, d0
-; CHECK-NEXT: vmov.f64 d8, d1
+; CHECK-NEXT: vmov.f64 d10, d1
; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload
; CHECK-NEXT: vstrw.32 q6, [r1, #80]
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
; CHECK-NEXT: vmov.f64 d6, d15
-; CHECK-NEXT: vstrw.32 q4, [r1, #112]
+; CHECK-NEXT: vstrw.32 q5, [r1, #112]
; CHECK-NEXT: vstrw.32 q0, [r1, #16]
; CHECK-NEXT: vstrw.32 q3, [r1, #96]
; CHECK-NEXT: add sp, #64
More information about the llvm-commits
mailing list