[llvm] [SDag] Notify listeners when deleting a node (PR #66991)
Sergei Barannikov via llvm-commits
llvm-commits at lists.llvm.org
Sat Apr 26 06:23:51 PDT 2025
https://github.com/s-barannikov updated https://github.com/llvm/llvm-project/pull/66991
From 688212560cf5e76de53fe2078899d1ccb3b52bc9 Mon Sep 17 00:00:00 2001
From: Sergei Barannikov <barannikov88 at gmail.com>
Date: Thu, 21 Sep 2023 11:54:39 +0300
Subject: [PATCH] [SDag] Notify listeners when deleting a node
SelectionDAG::DeleteNode does not notify listeners about node deletion.
As a result, SelectionDAG::Legalize may skip legalization of some nodes,
so the "Legalized selection DAG" can still contain illegal nodes. These
nodes are legalized during the subsequent DAG combining phase, but that
may be too late for some patterns to match.
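For context: Legalize tracks visited nodes in a set keyed by node address
and relies on a DAGUpdateListener to drop entries when nodes die. Since
SDNodes are pool-allocated, a newly created node can reuse a deleted
node's address and would then wrongly appear "already legalized" if the
deletion was never reported. The following is a simplified sketch of that
arrangement, abridged from SelectionDAG::Legalize (not a verbatim copy of
the LLVM sources):

// Simplified sketch of how SelectionDAG::Legalize() tracks progress.
void SelectionDAG::Legalize() {
  AssignTopologicalOrder();

  SmallPtrSet<SDNode *, 16> LegalizedNodes;
  // Erase deleted nodes from the visited set so a later node that reuses
  // the same address is not skipped. This only works if every deletion
  // path -- including DeleteNode(), after this patch -- notifies the
  // listener chain.
  DAGNodeDeletedListener DeleteListener(
      *this, [&LegalizedNodes](SDNode *N, SDNode *E) {
        LegalizedNodes.erase(N);
      });

  // ... walk the nodes in topological order, legalize each one, and
  // insert the result into LegalizedNodes ...
}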
---
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 3 +
llvm/test/CodeGen/AMDGPU/half.ll | 146 +-
llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 164 +--
llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 201 +--
llvm/test/CodeGen/AMDGPU/load-constant-i8.ll | 146 +-
llvm/test/CodeGen/AMDGPU/load-global-i16.ll | 202 +--
.../CodeGen/AMDGPU/memintrinsic-unroll.ll | 1190 ++++++++---------
.../CodeGen/RISCV/rvv/vector-deinterleave.ll | 36 +
llvm/test/CodeGen/X86/pr40730.ll | 16 +-
llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll | 10 +-
.../vector-interleaved-load-i16-stride-5.ll | 56 +-
.../vector-interleaved-load-i8-stride-2.ll | 44 +-
.../vector-interleaved-store-i16-stride-8.ll | 284 ++--
.../CodeGen/X86/vector-shuffle-256-v16.ll | 56 +-
.../zero_extend_vector_inreg_of_broadcast.ll | 8 +-
15 files changed, 1302 insertions(+), 1260 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 9da2ba04f77cb..783d8eac8bf10 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -1087,6 +1087,9 @@ void SelectionDAG::RemoveDeadNode(SDNode *N) {
}
void SelectionDAG::DeleteNode(SDNode *N) {
+ for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next)
+ DUL->NodeDeleted(N, nullptr);
+
// First take this out of the appropriate CSE map.
RemoveNodeFromCSEMaps(N);
diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
index f767511370eee..0ce03df1b4554 100644
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -2317,7 +2317,7 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v18, s3
; VI-NEXT: v_mov_b32_e32 v17, s2
-; VI-NEXT: s_add_u32 s2, s0, 0x50
+; VI-NEXT: s_add_u32 s2, s0, 0x70
; VI-NEXT: v_mov_b32_e32 v12, s1
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v11, s0
@@ -2331,12 +2331,12 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out
; VI-NEXT: v_cvt_f32_f16_e32 v7, v6
; VI-NEXT: v_cvt_f32_f16_sdwa v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_cvt_f32_f16_e32 v10, v2
+; VI-NEXT: v_cvt_f32_f16_e32 v10, v0
; VI-NEXT: v_mov_b32_e32 v14, s3
; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; VI-NEXT: v_mov_b32_e32 v13, s2
-; VI-NEXT: s_add_u32 s2, s0, 64
+; VI-NEXT: s_add_u32 s2, s0, 0x60
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: flat_store_dwordx4 v[15:16], v[6:9]
; VI-NEXT: v_mov_b32_e32 v16, s3
@@ -2347,37 +2347,37 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out
; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
; VI-NEXT: v_mov_b32_e32 v15, s2
-; VI-NEXT: s_add_u32 s2, s0, 0x70
+; VI-NEXT: s_add_u32 s2, s0, 0x50
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: flat_store_dwordx4 v[17:18], v[4:7]
-; VI-NEXT: v_cvt_f32_f16_sdwa v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT: v_cvt_f32_f16_sdwa v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v8
; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v9
-; VI-NEXT: v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT: v_cvt_f32_f16_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT: v_cvt_f32_f16_e32 v2, v1
+; VI-NEXT: v_cvt_f32_f16_sdwa v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT: v_cvt_f32_f16_e32 v0, v3
+; VI-NEXT: s_add_u32 s0, s0, 64
; VI-NEXT: flat_store_dwordx4 v[11:12], v[4:7]
-; VI-NEXT: v_cvt_f32_f16_sdwa v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT: v_cvt_f32_f16_e32 v7, v3
-; VI-NEXT: v_cvt_f64_f32_e32 v[3:4], v9
-; VI-NEXT: v_cvt_f32_f16_e32 v9, v0
-; VI-NEXT: v_cvt_f64_f32_e32 v[1:2], v2
-; VI-NEXT: v_cvt_f64_f32_e32 v[5:6], v10
-; VI-NEXT: v_cvt_f64_f32_e32 v[11:12], v11
-; VI-NEXT: v_cvt_f64_f32_e32 v[9:10], v9
-; VI-NEXT: s_add_u32 s0, s0, 0x60
-; VI-NEXT: flat_store_dwordx4 v[13:14], v[1:4]
+; VI-NEXT: v_cvt_f32_f16_sdwa v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT: v_cvt_f64_f32_e32 v[5:6], v8
+; VI-NEXT: v_cvt_f32_f16_e32 v8, v2
+; VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT: v_cvt_f32_f16_e32 v7, v1
+; VI-NEXT: v_cvt_f64_f32_e32 v[3:4], v0
+; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v10
+; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
+; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v2
+; VI-NEXT: flat_store_dwordx4 v[13:14], v[3:6]
; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7
+; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v7
+; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v12
; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v17
-; VI-NEXT: v_cvt_f64_f32_e32 v[7:8], v8
; VI-NEXT: v_mov_b32_e32 v20, s3
-; VI-NEXT: v_mov_b32_e32 v14, s1
+; VI-NEXT: v_mov_b32_e32 v13, s1
; VI-NEXT: v_mov_b32_e32 v19, s2
-; VI-NEXT: v_mov_b32_e32 v13, s0
-; VI-NEXT: flat_store_dwordx4 v[15:16], v[9:12]
-; VI-NEXT: flat_store_dwordx4 v[19:20], v[0:3]
-; VI-NEXT: flat_store_dwordx4 v[13:14], v[5:8]
+; VI-NEXT: v_mov_b32_e32 v12, s0
+; VI-NEXT: flat_store_dwordx4 v[15:16], v[8:11]
+; VI-NEXT: flat_store_dwordx4 v[19:20], v[4:7]
+; VI-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: global_extload_v16f16_to_v16f64:
@@ -2389,40 +2389,40 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out
; GFX11-TRUE16-NEXT: global_load_b128 v[0:3], v32, s[2:3]
; GFX11-TRUE16-NEXT: global_load_b128 v[4:7], v32, s[2:3] offset:16
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v10, v1.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v4
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v15, v7.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v23, v7.l
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v14, v6.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v22, v6.l
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v5
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v10, v1.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v4
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v13, v3.l
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v12, v2.l
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v18, v4.l
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v22, v5.l
-; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[4:5], v10
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v10, v23.l
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v34, v11.l
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v11, v19.l
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v7, v7.l
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v6.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v18, v5.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v34, v11.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v11, v19.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v14, v4.l
+; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[4:5], v10
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v10, v15.l
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v8, v0.l
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v3.l
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v2.l
-; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[28:29], v22
-; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[30:31], v10
-; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[24:25], v18
-; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[26:27], v11
+; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[28:29], v23
+; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[30:31], v7
+; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[24:25], v22
+; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[26:27], v6
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v33, v9.l
-; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[20:21], v15
-; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[22:23], v7
+; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[20:21], v18
+; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[22:23], v11
; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[16:17], v14
-; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[18:19], v6
+; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[18:19], v10
; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v8
; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[8:9], v12
; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[12:13], v13
@@ -2431,10 +2431,10 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out
; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[6:7], v34
; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[2:3], v33
; GFX11-TRUE16-NEXT: s_clause 0x7
-; GFX11-TRUE16-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:80
-; GFX11-TRUE16-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:64
-; GFX11-TRUE16-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:112
-; GFX11-TRUE16-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:96
+; GFX11-TRUE16-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:112
+; GFX11-TRUE16-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:96
+; GFX11-TRUE16-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:80
+; GFX11-TRUE16-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:64
; GFX11-TRUE16-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:48
; GFX11-TRUE16-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:32
; GFX11-TRUE16-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:16
@@ -2450,40 +2450,40 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out
; GFX11-FAKE16-NEXT: global_load_b128 v[0:3], v32, s[2:3]
; GFX11-FAKE16-NEXT: global_load_b128 v[4:7], v32, s[2:3] offset:16
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v10, v1
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v1
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v4
-; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v15, v7
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v23, v7
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v14, v6
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v22, v6
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v5
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v10, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v4
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v13, v3
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v12, v2
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v18, v4
-; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v22, v5
-; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[4:5], v10
-; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v10, v23
-; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v34, v11
-; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v11, v19
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v7, v7
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v18, v5
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v34, v11
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v11, v19
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v14, v4
+; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[4:5], v10
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v10, v15
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v8, v0
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[28:29], v22
-; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[30:31], v10
-; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[24:25], v18
-; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[26:27], v11
+; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[28:29], v23
+; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[30:31], v7
+; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[24:25], v22
+; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[26:27], v6
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v33, v9
-; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[20:21], v15
-; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[22:23], v7
+; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[20:21], v18
+; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[22:23], v11
; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[16:17], v14
-; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[18:19], v6
+; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[18:19], v10
; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v8
; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[8:9], v12
; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[12:13], v13
@@ -2492,10 +2492,10 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out
; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[6:7], v34
; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[2:3], v33
; GFX11-FAKE16-NEXT: s_clause 0x7
-; GFX11-FAKE16-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:80
-; GFX11-FAKE16-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:64
-; GFX11-FAKE16-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:112
-; GFX11-FAKE16-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:96
+; GFX11-FAKE16-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:112
+; GFX11-FAKE16-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:96
+; GFX11-FAKE16-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:80
+; GFX11-FAKE16-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:64
; GFX11-FAKE16-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:48
; GFX11-FAKE16-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:32
; GFX11-FAKE16-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:16
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index a9240eff8e691..25160fb6973b1 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -5450,33 +5450,33 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
; GFX6-NEXT: s_mov_b32 s0, s4
; GFX6-NEXT: s_mov_b32 s1, s5
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_bfe_u32 v2, v29, 11, 1
-; GFX6-NEXT: v_bfe_u32 v0, v29, 10, 1
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
-; GFX6-NEXT: v_bfe_u32 v5, v29, 9, 1
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 15, v29
+; GFX6-NEXT: v_bfe_u32 v0, v29, 14, 1
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; GFX6-NEXT: v_bfe_u32 v5, v29, 13, 1
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_bfe_u32 v3, v29, 8, 1
-; GFX6-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:64
-; GFX6-NEXT: v_lshrrev_b32_e32 v8, 15, v29
+; GFX6-NEXT: v_bfe_u32 v3, v29, 12, 1
+; GFX6-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:96
+; GFX6-NEXT: v_bfe_u32 v8, v29, 11, 1
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_bfe_u32 v6, v29, 14, 1
-; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112
-; GFX6-NEXT: v_bfe_u32 v27, v29, 5, 1
-; GFX6-NEXT: v_bfe_u32 v23, v29, 7, 1
-; GFX6-NEXT: v_bfe_u32 v19, v29, 1, 1
-; GFX6-NEXT: v_bfe_u32 v15, v29, 3, 1
-; GFX6-NEXT: v_bfe_u32 v11, v29, 13, 1
-; GFX6-NEXT: v_bfe_u32 v25, v29, 4, 1
-; GFX6-NEXT: v_bfe_u32 v21, v29, 6, 1
-; GFX6-NEXT: v_and_b32_e32 v17, 1, v29
-; GFX6-NEXT: v_bfe_u32 v13, v29, 2, 1
+; GFX6-NEXT: v_bfe_u32 v6, v29, 10, 1
+; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:80
+; GFX6-NEXT: v_bfe_u32 v27, v29, 1, 1
+; GFX6-NEXT: v_bfe_u32 v23, v29, 3, 1
+; GFX6-NEXT: v_bfe_u32 v19, v29, 5, 1
+; GFX6-NEXT: v_bfe_u32 v15, v29, 7, 1
+; GFX6-NEXT: v_bfe_u32 v11, v29, 9, 1
+; GFX6-NEXT: v_and_b32_e32 v25, 1, v29
+; GFX6-NEXT: v_bfe_u32 v21, v29, 2, 1
+; GFX6-NEXT: v_bfe_u32 v17, v29, 4, 1
+; GFX6-NEXT: v_bfe_u32 v13, v29, 6, 1
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_bfe_u32 v9, v29, 12, 1
-; GFX6-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:96
-; GFX6-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:16
-; GFX6-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0
-; GFX6-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:48
-; GFX6-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:32
+; GFX6-NEXT: v_bfe_u32 v9, v29, 8, 1
+; GFX6-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:64
+; GFX6-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:48
+; GFX6-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:32
+; GFX6-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:16
+; GFX6-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: constant_zextload_v16i1_to_v16i64:
@@ -5490,12 +5490,10 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: v_mov_b32_e32 v5, v1
; GFX8-NEXT: v_mov_b32_e32 v7, v1
-; GFX8-NEXT: v_mov_b32_e32 v9, v1
-; GFX8-NEXT: v_mov_b32_e32 v11, v1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8-NEXT: s_bfe_u32 s3, s2, 0x10009
; GFX8-NEXT: s_bfe_u32 s4, s2, 0x1000d
+; GFX8-NEXT: s_bfe_u32 s3, s2, 0x10009
; GFX8-NEXT: s_bfe_u32 s5, s2, 0x10007
; GFX8-NEXT: s_bfe_u32 s6, s2, 0x10003
; GFX8-NEXT: s_bfe_u32 s7, s2, 0x10001
@@ -5503,44 +5501,45 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
; GFX8-NEXT: s_bfe_u32 s9, s2, 0x10002
; GFX8-NEXT: s_bfe_u32 s10, s2, 0x10004
; GFX8-NEXT: s_bfe_u32 s11, s2, 0x10006
-; GFX8-NEXT: s_bfe_u32 s12, s2, 0x1000c
-; GFX8-NEXT: s_bfe_u32 s2, s2, 0x1000a
-; GFX8-NEXT: v_and_b32_e32 v4, 0xffff, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 0x50
+; GFX8-NEXT: s_bfe_u32 s12, s2, 0x1000a
+; GFX8-NEXT: s_bfe_u32 s13, s2, 0x1000c
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v0
+; GFX8-NEXT: s_add_u32 s2, s0, 0x70
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 15, v6
+; GFX8-NEXT: v_bfe_u32 v10, v6, 11, 1
+; GFX8-NEXT: v_bfe_u32 v11, v6, 5, 1
+; GFX8-NEXT: v_bfe_u32 v4, v6, 8, 1
+; GFX8-NEXT: v_bfe_u32 v0, v6, 14, 1
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v13, s3
-; GFX8-NEXT: v_mov_b32_e32 v12, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 64
-; GFX8-NEXT: v_bfe_u32 v2, v4, 11, 1
+; GFX8-NEXT: v_mov_b32_e32 v9, s3
+; GFX8-NEXT: v_mov_b32_e32 v8, s2
+; GFX8-NEXT: s_add_u32 s2, s0, 0x60
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v13, s3
-; GFX8-NEXT: v_mov_b32_e32 v12, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 0x70
-; GFX8-NEXT: v_lshrrev_b32_e32 v10, 15, v4
-; GFX8-NEXT: v_bfe_u32 v14, v4, 5, 1
-; GFX8-NEXT: v_bfe_u32 v8, v4, 14, 1
-; GFX8-NEXT: v_bfe_u32 v4, v4, 8, 1
+; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v9, s3
+; GFX8-NEXT: v_mov_b32_e32 v8, s2
+; GFX8-NEXT: s_add_u32 s2, s0, 0x50
+; GFX8-NEXT: v_mov_b32_e32 v0, s13
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
+; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v9, s3
+; GFX8-NEXT: v_mov_b32_e32 v8, s2
+; GFX8-NEXT: s_add_u32 s2, s0, 64
; GFX8-NEXT: v_mov_b32_e32 v0, s12
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 0x60
+; GFX8-NEXT: v_mov_b32_e32 v2, v10
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[8:11]
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v9, s3
+; GFX8-NEXT: v_mov_b32_e32 v8, s2
; GFX8-NEXT: s_add_u32 s2, s0, 48
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
+; GFX8-NEXT: v_mov_b32_e32 v0, s11
; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: s_add_u32 s2, s0, 32
-; GFX8-NEXT: v_mov_b32_e32 v0, s11
; GFX8-NEXT: v_mov_b32_e32 v2, s5
; GFX8-NEXT: s_addc_u32 s3, s1, 0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
@@ -5548,7 +5547,7 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: s_add_u32 s2, s0, 16
; GFX8-NEXT: v_mov_b32_e32 v0, s10
-; GFX8-NEXT: v_mov_b32_e32 v2, v14
+; GFX8-NEXT: v_mov_b32_e32 v2, v11
; GFX8-NEXT: s_addc_u32 s3, s1, 0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v5, s3
@@ -5656,49 +5655,50 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v0
; GFX12-NEXT: v_readfirstlane_b32 s2, v0
; GFX12-NEXT: v_mov_b32_e32 v7, v1
-; GFX12-NEXT: v_mov_b32_e32 v11, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: v_bfe_u32 v2, v4, 11, 1
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1000a
-; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, s3
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1000d
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, 15, v4
+; GFX12-NEXT: v_bfe_u32 v0, v4, 14, 1
; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1000c
-; GFX12-NEXT: v_mov_b32_e32 v5, v1
+; GFX12-NEXT: v_mov_b32_e32 v3, v1
+; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1000d
+; GFX12-NEXT: v_bfe_u32 v5, v4, 11, 1
; GFX12-NEXT: v_bfe_u32 v6, v4, 5, 1
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:80
+; GFX12-NEXT: s_bfe_u32 s5, s2, 0x10002
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:112
; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s3
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10007
+; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1000a
; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10006
-; GFX12-NEXT: v_mov_b32_e32 v9, v1
-; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10002
+; GFX12-NEXT: v_bfe_u32 v4, v4, 8, 1
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:96
; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: v_mov_b32_e32 v0, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, v5
+; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10007
+; GFX12-NEXT: v_mov_b32_e32 v5, v1
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:80
; GFX12-NEXT: v_mov_b32_e32 v0, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s3
-; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10004
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10009
-; GFX12-NEXT: s_bfe_u32 s5, s2, 0x10001
-; GFX12-NEXT: v_lshrrev_b32_e32 v10, 15, v4
+; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10004
+; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10003
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
+; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_mov_b32_e32 v2, v6
-; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10003
-; GFX12-NEXT: s_and_b32 s2, s2, 1
-; GFX12-NEXT: v_bfe_u32 v8, v4, 14, 1
-; GFX12-NEXT: v_bfe_u32 v4, v4, 8, 1
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32
-; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10009
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: v_mov_b32_e32 v6, s3
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32
+; GFX12-NEXT: v_mov_b32_e32 v0, s5
+; GFX12-NEXT: v_mov_b32_e32 v2, s4
+; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10001
+; GFX12-NEXT: s_and_b32 s2, s2, 1
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: v_mov_b32_e32 v2, s5
-; GFX12-NEXT: s_clause 0x2
-; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:112
+; GFX12-NEXT: v_mov_b32_e32 v2, s4
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:64
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index 817c5def5614f..183cbebb342a9 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -6826,27 +6826,27 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) %
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s1, 16
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s13, s3, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s7, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s15, s5, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s4, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s17, s6, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s5, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s15, s7, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s6, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s17, s4, 16
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s2, 16
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s0, 16
; GCN-NOHSA-SI-NEXT: s_and_b32 s0, s0, 0xffff
; GCN-NOHSA-SI-NEXT: s_and_b32 s2, s2, 0xffff
-; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff
; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff
; GCN-NOHSA-SI-NEXT: s_and_b32 s1, s1, 0xffff
; GCN-NOHSA-SI-NEXT: s_and_b32 s3, s3, 0xffff
-; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff
; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5
+; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:80
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:112
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s14
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:112
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:80
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s3
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13
@@ -6856,13 +6856,13 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) %
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s16
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:64
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:96
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s17
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:96
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:64
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s2
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s18
@@ -6886,33 +6886,33 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) %
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_lshr_b32 s12, s5, 16
; GCN-HSA-NEXT: s_lshr_b32 s13, s7, 16
-; GCN-HSA-NEXT: s_lshr_b32 s14, s11, 16
-; GCN-HSA-NEXT: s_lshr_b32 s2, s9, 16
-; GCN-HSA-NEXT: s_lshr_b32 s15, s8, 16
-; GCN-HSA-NEXT: s_lshr_b32 s16, s10, 16
+; GCN-HSA-NEXT: s_lshr_b32 s14, s9, 16
+; GCN-HSA-NEXT: s_lshr_b32 s2, s11, 16
+; GCN-HSA-NEXT: s_lshr_b32 s15, s10, 16
+; GCN-HSA-NEXT: s_lshr_b32 s16, s8, 16
; GCN-HSA-NEXT: s_lshr_b32 s17, s6, 16
; GCN-HSA-NEXT: s_lshr_b32 s18, s4, 16
; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xffff
; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff
-; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff
; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xffff
+; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff
; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xffff
; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xffff
-; GCN-HSA-NEXT: s_and_b32 s11, s11, 0xffff
-; GCN-HSA-NEXT: s_and_b32 s3, s9, 0xffff
+; GCN-HSA-NEXT: s_and_b32 s9, s9, 0xffff
+; GCN-HSA-NEXT: s_and_b32 s3, s11, 0xffff
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s3
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 48
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s11
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s9
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
@@ -6925,22 +6925,22 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) %
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 64
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 64
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
@@ -6972,41 +6972,41 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) %
; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s7, 16
; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s10, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s11, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s8, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s8, 16
; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s9, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s9, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s9, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s10, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s11, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s11, 0xffff
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 0x50
+; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 0x70
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s3
; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 64
+; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 0x60
; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 0x70
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 0x50
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s18
; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 0x60
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11
+; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 64
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s17
; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 48
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s16
; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
@@ -7126,33 +7126,33 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) %
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_and_b32 s10, s5, 0xffff
+; GFX12-NEXT: s_and_b32 s10, s7, 0xffff
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s10
-; GFX12-NEXT: s_lshr_b32 s5, s5, 16
+; GFX12-NEXT: s_lshr_b32 s7, s7, 16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT: s_lshr_b32 s7, s6, 16
+; GFX12-NEXT: s_and_b32 s6, s6, 0xffff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:112
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: v_mov_b32_e32 v2, s7
+; GFX12-NEXT: s_lshr_b32 s6, s5, 16
+; GFX12-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:96
+; GFX12-NEXT: v_mov_b32_e32 v0, s5
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: v_mov_b32_e32 v2, s6
; GFX12-NEXT: s_lshr_b32 s5, s4, 16
; GFX12-NEXT: s_and_b32 s4, s4, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:80
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: v_mov_b32_e32 v2, s5
-; GFX12-NEXT: s_lshr_b32 s4, s7, 16
-; GFX12-NEXT: s_and_b32 s5, s7, 0xffff
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:64
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v0, s5
-; GFX12-NEXT: v_mov_b32_e32 v2, s4
-; GFX12-NEXT: s_lshr_b32 s4, s6, 16
-; GFX12-NEXT: s_and_b32 s5, s6, 0xffff
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:112
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v0, s5
-; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_lshr_b32 s4, s3, 16
; GFX12-NEXT: s_and_b32 s3, s3, 0xffff
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:96
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:64
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s4
@@ -7378,57 +7378,58 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s1, 16
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x100000
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s2, 16
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[6:7], 0x100000
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[4:5], 0x100000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x100000
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[6:7], 0x100000
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[12:13], 0x100000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[18:19], 0x100000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[2:3], 0x100000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, s3
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x100000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s7
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x100000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[4:5], 0x100000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s5
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[6:7], 0x100000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s7
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x100000
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s0, 16
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s3, 16
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[6:7], 0x100000
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s7, 16
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[4:5], 0x100000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[4:5], 0x100000
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s5, 16
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x100000
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s7, 16
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s8, 0x50
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s9, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6
+; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s8, 0x70
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s9, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s34
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s35
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
-; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s8, 64
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7
+; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s8, 0x60
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s9, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s9, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s30
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s31
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
-; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s8, 0x70
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: s_nop 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s8, 0x50
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5
; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s9, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s27
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
-; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s8, 0x60
+; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s8, 64
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s9, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
@@ -7568,17 +7569,17 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s30, s9
-; GFX12-NEXT: s_lshr_b32 s34, s9, 16
-; GFX12-NEXT: s_bfe_i64 s[28:29], s[8:9], 0x100000
-; GFX12-NEXT: s_lshr_b32 s8, s8, 16
-; GFX12-NEXT: s_bfe_i64 s[22:23], s[10:11], 0x100000
-; GFX12-NEXT: s_mov_b32 s24, s11
-; GFX12-NEXT: s_lshr_b32 s26, s11, 16
+; GFX12-NEXT: s_mov_b32 s30, s11
+; GFX12-NEXT: s_lshr_b32 s34, s11, 16
+; GFX12-NEXT: s_bfe_i64 s[28:29], s[10:11], 0x100000
+; GFX12-NEXT: s_lshr_b32 s10, s10, 16
+; GFX12-NEXT: s_bfe_i64 s[22:23], s[8:9], 0x100000
+; GFX12-NEXT: s_mov_b32 s24, s9
+; GFX12-NEXT: s_lshr_b32 s26, s9, 16
; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x100000
; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x100000
-; GFX12-NEXT: s_lshr_b32 s10, s10, 16
-; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000
+; GFX12-NEXT: s_lshr_b32 s8, s8, 16
+; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000
; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s29
; GFX12-NEXT: s_mov_b32 s18, s7
; GFX12-NEXT: s_lshr_b32 s20, s7, 16
@@ -7586,30 +7587,30 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x100000
; GFX12-NEXT: v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v9, s31
; GFX12-NEXT: v_dual_mov_b32 v8, s30 :: v_dual_mov_b32 v11, s35
-; GFX12-NEXT: v_dual_mov_b32 v10, s34 :: v_dual_mov_b32 v3, s9
+; GFX12-NEXT: v_dual_mov_b32 v10, s34 :: v_dual_mov_b32 v3, s11
; GFX12-NEXT: s_bfe_i64 s[16:17], s[6:7], 0x100000
; GFX12-NEXT: s_lshr_b32 s6, s6, 16
-; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000
; GFX12-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v5, s23
-; GFX12-NEXT: v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v13, s25
+; GFX12-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v13, s25
; GFX12-NEXT: s_mov_b32 s12, s5
; GFX12-NEXT: s_lshr_b32 s14, s5, 16
; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x100000
; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x100000
; GFX12-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
-; GFX12-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v7, s11
+; GFX12-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v7, s9
; GFX12-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x100000
; GFX12-NEXT: s_lshr_b32 s4, s4, 16
; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000
-; GFX12-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v17, s19
+; GFX12-NEXT: v_dual_mov_b32 v6, s8 :: v_dual_mov_b32 v17, s19
; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000
; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000
; GFX12-NEXT: v_dual_mov_b32 v16, s18 :: v_dual_mov_b32 v19, s21
; GFX12-NEXT: v_mov_b32_e32 v18, s20
; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:80
-; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:64
+; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:112
+; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:96
; GFX12-NEXT: v_dual_mov_b32 v1, s17 :: v_dual_mov_b32 v0, s16
; GFX12-NEXT: v_dual_mov_b32 v3, s7 :: v_dual_mov_b32 v2, s6
; GFX12-NEXT: v_dual_mov_b32 v9, s13 :: v_dual_mov_b32 v8, s12
@@ -7617,8 +7618,8 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
; GFX12-NEXT: v_dual_mov_b32 v21, s3 :: v_dual_mov_b32 v20, s2
; GFX12-NEXT: v_dual_mov_b32 v23, s5 :: v_dual_mov_b32 v22, s4
; GFX12-NEXT: s_clause 0x5
-; GFX12-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:96
+; GFX12-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:80
+; GFX12-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:64
; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:48
; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:32
; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:16
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index 3b0f8523e1b52..af70b4bb90c31 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -6657,53 +6657,53 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, v1
; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT: s_lshr_b32 s8, s5, 24
-; GFX6-NOHSA-NEXT: s_lshr_b32 s9, s4, 24
-; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s7, 24
-; GFX6-NOHSA-NEXT: s_lshr_b32 s11, s6, 24
-; GFX6-NOHSA-NEXT: s_bfe_u32 s12, s6, 0x80008
-; GFX6-NOHSA-NEXT: s_bfe_u32 s13, s7, 0x80008
-; GFX6-NOHSA-NEXT: s_bfe_u32 s14, s4, 0x80008
-; GFX6-NOHSA-NEXT: s_bfe_u32 s15, s5, 0x80008
-; GFX6-NOHSA-NEXT: s_and_b32 s16, s5, 0xff
-; GFX6-NOHSA-NEXT: s_and_b32 s17, s4, 0xff
-; GFX6-NOHSA-NEXT: s_and_b32 s18, s7, 0xff
-; GFX6-NOHSA-NEXT: s_and_b32 s19, s6, 0xff
-; GFX6-NOHSA-NEXT: s_bfe_u32 s5, s5, 0x80010
+; GFX6-NOHSA-NEXT: s_lshr_b32 s8, s4, 24
+; GFX6-NOHSA-NEXT: s_lshr_b32 s9, s5, 24
+; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s6, 24
+; GFX6-NOHSA-NEXT: s_lshr_b32 s11, s7, 24
+; GFX6-NOHSA-NEXT: s_bfe_u32 s12, s7, 0x80008
+; GFX6-NOHSA-NEXT: s_bfe_u32 s13, s6, 0x80008
+; GFX6-NOHSA-NEXT: s_bfe_u32 s14, s5, 0x80008
+; GFX6-NOHSA-NEXT: s_bfe_u32 s15, s4, 0x80008
+; GFX6-NOHSA-NEXT: s_and_b32 s16, s4, 0xff
+; GFX6-NOHSA-NEXT: s_and_b32 s17, s5, 0xff
+; GFX6-NOHSA-NEXT: s_and_b32 s18, s6, 0xff
+; GFX6-NOHSA-NEXT: s_and_b32 s19, s7, 0xff
; GFX6-NOHSA-NEXT: s_bfe_u32 s4, s4, 0x80010
-; GFX6-NOHSA-NEXT: s_bfe_u32 s6, s6, 0x80010
+; GFX6-NOHSA-NEXT: s_bfe_u32 s5, s5, 0x80010
; GFX6-NOHSA-NEXT: s_bfe_u32 s7, s7, 0x80010
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX6-NOHSA-NEXT: s_bfe_u32 s6, s6, 0x80010
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s7
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s11
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s7
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s6
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s10
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s5
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s9
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s8
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s19
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s12
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s18
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s13
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s17
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s14
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s16
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s15
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX6-NOHSA-NEXT: s_endpgm
;
; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i64:
@@ -6717,70 +6717,70 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-HSA-NEXT: s_lshr_b32 s8, s5, 24
-; GFX7-HSA-NEXT: s_lshr_b32 s9, s4, 24
-; GFX7-HSA-NEXT: s_lshr_b32 s10, s7, 24
-; GFX7-HSA-NEXT: s_lshr_b32 s2, s6, 24
-; GFX7-HSA-NEXT: s_bfe_u32 s11, s6, 0x80008
-; GFX7-HSA-NEXT: s_bfe_u32 s12, s7, 0x80008
-; GFX7-HSA-NEXT: s_bfe_u32 s13, s4, 0x80008
-; GFX7-HSA-NEXT: s_bfe_u32 s14, s5, 0x80008
-; GFX7-HSA-NEXT: s_and_b32 s15, s5, 0xff
-; GFX7-HSA-NEXT: s_and_b32 s16, s4, 0xff
-; GFX7-HSA-NEXT: s_and_b32 s17, s7, 0xff
-; GFX7-HSA-NEXT: s_and_b32 s18, s6, 0xff
-; GFX7-HSA-NEXT: s_bfe_u32 s5, s5, 0x80010
+; GFX7-HSA-NEXT: s_lshr_b32 s8, s4, 24
+; GFX7-HSA-NEXT: s_lshr_b32 s9, s5, 24
+; GFX7-HSA-NEXT: s_lshr_b32 s10, s6, 24
+; GFX7-HSA-NEXT: s_lshr_b32 s2, s7, 24
+; GFX7-HSA-NEXT: s_bfe_u32 s11, s7, 0x80008
+; GFX7-HSA-NEXT: s_bfe_u32 s12, s6, 0x80008
+; GFX7-HSA-NEXT: s_bfe_u32 s13, s5, 0x80008
+; GFX7-HSA-NEXT: s_bfe_u32 s14, s4, 0x80008
+; GFX7-HSA-NEXT: s_and_b32 s15, s4, 0xff
+; GFX7-HSA-NEXT: s_and_b32 s16, s5, 0xff
+; GFX7-HSA-NEXT: s_and_b32 s17, s6, 0xff
+; GFX7-HSA-NEXT: s_and_b32 s18, s7, 0xff
; GFX7-HSA-NEXT: s_bfe_u32 s4, s4, 0x80010
-; GFX7-HSA-NEXT: s_bfe_u32 s7, s7, 0x80010
-; GFX7-HSA-NEXT: s_bfe_u32 s3, s6, 0x80010
+; GFX7-HSA-NEXT: s_bfe_u32 s5, s5, 0x80010
+; GFX7-HSA-NEXT: s_bfe_u32 s6, s6, 0x80010
+; GFX7-HSA-NEXT: s_bfe_u32 s3, s7, 0x80010
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x50
+; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x70
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s3
; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x70
+; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x50
; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7
+; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10
; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s9
; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX7-HSA-NEXT: s_add_u32 s2, s0, 64
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x60
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s8
; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x60
+; GFX7-HSA-NEXT: s_add_u32 s2, s0, 64
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s11
; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
+; GFX7-HSA-NEXT: s_add_u32 s2, s0, 32
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s17
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s12
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
+; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
-; GFX7-HSA-NEXT: s_add_u32 s0, s0, 32
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s13
-; GFX7-HSA-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s15
@@ -6799,18 +6799,18 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s5, 24
; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s7, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s9, s6, 24
-; GFX8-NOHSA-NEXT: s_bfe_u32 s10, s6, 0x80008
-; GFX8-NOHSA-NEXT: s_bfe_u32 s11, s7, 0x80008
+; GFX8-NOHSA-NEXT: s_bfe_u32 s9, s7, 0x80008
+; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s6, 24
+; GFX8-NOHSA-NEXT: s_bfe_u32 s11, s6, 0x80008
; GFX8-NOHSA-NEXT: s_bfe_u32 s12, s5, 0x80008
; GFX8-NOHSA-NEXT: s_lshr_b32 s13, s4, 24
; GFX8-NOHSA-NEXT: s_bfe_u32 s14, s4, 0x80008
; GFX8-NOHSA-NEXT: s_and_b32 s15, s4, 0xff
; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s4, 0x80010
; GFX8-NOHSA-NEXT: s_and_b32 s16, s5, 0xff
-; GFX8-NOHSA-NEXT: s_and_b32 s17, s7, 0xff
-; GFX8-NOHSA-NEXT: s_and_b32 s18, s6, 0xff
+; GFX8-NOHSA-NEXT: s_and_b32 s17, s6, 0xff
; GFX8-NOHSA-NEXT: s_bfe_u32 s6, s6, 0x80010
+; GFX8-NOHSA-NEXT: s_and_b32 s18, s7, 0xff
; GFX8-NOHSA-NEXT: s_bfe_u32 s5, s5, 0x80010
; GFX8-NOHSA-NEXT: s_bfe_u32 s3, s7, 0x80010
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
@@ -6824,22 +6824,22 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x50
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x60
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s5
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s8
; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 64
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x50
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s18
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s9
; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x60
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s18
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 64
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10
; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
@@ -6976,9 +6976,15 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: s_bfe_u32 s2, s7, 0x80008
+; GFX12-NEXT: s_and_b32 s3, s7, 0xff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: v_mov_b32_e32 v0, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: s_lshr_b32 s2, s6, 24
; GFX12-NEXT: s_bfe_u32 s3, s6, 0x80010
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:96
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_mov_b32_e32 v2, s2
@@ -6988,15 +6994,9 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-NEXT: s_bfe_u32 s2, s7, 0x80008
-; GFX12-NEXT: s_and_b32 s3, s7, 0xff
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:64
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: s_bfe_u32 s2, s5, 0x80008
; GFX12-NEXT: s_and_b32 s3, s5, 0xff
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:96
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:64
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_mov_b32_e32 v2, s2
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 9054e509cde8e..dac516e4ca1b3 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -6592,23 +6592,23 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, 0
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, 0xffff, v6
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v28, 0xffff, v7
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v5
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v22, 16, v7
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v26, 16, v4
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, 0xffff, v4
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v28, 0xffff, v6
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v5
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v7
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v21
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v29, v21
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v31, v21
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v21
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, v21
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v21
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v21
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v21
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, v21
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v21
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, v21
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v29, v21
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v31, v21
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, v21
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v21
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v21
@@ -6617,12 +6617,12 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, v21
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:64
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
@@ -6649,7 +6649,7 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v22, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v21, s4
-; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50
+; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x70
; GCN-HSA-NEXT: v_mov_b32_e32 v12, v8
; GCN-HSA-NEXT: v_mov_b32_e32 v14, v8
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
@@ -6659,61 +6659,61 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_mov_b32_e32 v20, v8
; GCN-HSA-NEXT: v_mov_b32_e32 v10, v8
; GCN-HSA-NEXT: v_mov_b32_e32 v23, v8
+; GCN-HSA-NEXT: v_mov_b32_e32 v25, v8
+; GCN-HSA-NEXT: v_mov_b32_e32 v27, v8
; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v1
; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v1
; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[11:14]
; GCN-HSA-NEXT: v_mov_b32_e32 v22, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v21, s4
-; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x70
+; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50
; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v5
-; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v5
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v7
+; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v7
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[14:17]
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v7
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v5
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4
-; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v7
+; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v5
; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[17:20]
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v3
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3
; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v3
-; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 64
-; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[7:10]
-; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0
-; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x60
-; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[7:10]
+; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
; GCN-HSA-NEXT: v_mov_b32_e32 v11, v8
; GCN-HSA-NEXT: v_mov_b32_e32 v13, v8
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v4
-; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v4
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v2
-; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v2
; GCN-HSA-NEXT: v_mov_b32_e32 v21, v8
; GCN-HSA-NEXT: v_mov_b32_e32 v15, v8
; GCN-HSA-NEXT: v_mov_b32_e32 v17, v8
-; GCN-HSA-NEXT: v_mov_b32_e32 v7, v8
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, v8
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v6
-; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s1
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v7, s0
+; GCN-HSA-NEXT: s_add_u32 s0, s0, 64
+; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v6
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v26, 16, v4
+; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v4
+; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v6
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, s1
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v2
+; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v2
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s2
-; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s0
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v22, 16, v0
; GCN-HSA-NEXT: v_and_b32_e32 v20, 0xffff, v0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[10:13]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[20:23]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[10:13]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[7:8], v[20:23]
; GCN-HSA-NEXT: flat_store_dwordx4 v[3:4], v[14:17]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[6:9]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[5:6], v[24:27]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_zextload_v16i16_to_v16i64:
@@ -6728,25 +6728,21 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v31, v29
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v29
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, v29
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, v29
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, v29
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v29
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, v29
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v29
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v29
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v29
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v29
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1)
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v0
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xffff, v5
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 16, v5
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xffff, v5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, 0
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v22, 16, v4
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v4
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 16, v6
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xffff, v6
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v7
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, v5
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v0
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v1
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v1
@@ -6754,20 +6750,24 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v2
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v3
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v22, 16, v6
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v6
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 16, v7
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xffff, v7
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v4
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v29
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, v29
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v29
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v29
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:80
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:112
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:96
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v31, v5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, v5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, v5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, v5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, v5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v5
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:112
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16
@@ -7120,50 +7120,50 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v0
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v1, 0, 16
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v7
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v2, 0, 16
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v2
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v1, 0, 16
-; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v15, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v7, 0, 16
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v5
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v1, 0, 16
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v4
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v4, 0, 16
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v1, 0, 16
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v6
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v3
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v20, 16, v6
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v22, v7
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v5, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v27, v6, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v4, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v1, 0, 16
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:112
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v9, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v4, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v6, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v1, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v11, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v13, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v3, 0, 16
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v23, v22, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v7, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v29, v20, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v13, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v22, v3, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v24, v5, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:80
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 31, v8
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 31, v12
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v28, 31, v27
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v25, 31, v24
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 31, v10
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 31, v14
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v22, 31, v21
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v30, 31, v29
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v24, 31, v23
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v26, 31, v25
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:112
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:96
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:48
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v23, 31, v22
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v27, 31, v26
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:64
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
index 0f1c1cf0d80af..5a9016fa6affb 100644
--- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
+++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
@@ -410,22 +410,22 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0)
; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280
; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280
-; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284
-; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272
-; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276
+; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:284
+; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:280
+; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:276
+; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v15 offset:42
-; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:40
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v15 offset:46
+; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:44
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v14 offset:46
-; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:44
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v14 offset:42
+; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:40
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v13 offset:34
-; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:32
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v13 offset:38
+; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:36
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v12 offset:38
-; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:36
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v12 offset:34
+; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:32
; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224
; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:228
; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232
@@ -668,14 +668,14 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0)
; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:53
; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:51
; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:49
-; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:43
-; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:41
-; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:47
-; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:45
-; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:35
-; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:33
-; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:39
-; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:37
+; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:47
+; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:45
+; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:43
+; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:41
+; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:39
+; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:37
+; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:35
+; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:33
; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:31
; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:29
; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:27
@@ -1164,22 +1164,22 @@ define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1)
; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280
; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280
-; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284
-; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272
-; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276
+; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:284
+; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:280
+; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:276
+; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v15, off offset:42
-; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:40
+; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v15, off offset:46
+; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:44
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v14, off offset:46
-; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:44
+; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v14, off offset:42
+; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:40
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v13, off offset:34
-; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:32
+; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v13, off offset:38
+; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:36
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v12, off offset:38
-; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:36
+; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v12, off offset:34
+; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:32
; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224
; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:228
; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232
@@ -1422,14 +1422,14 @@ define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1)
; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:53
; ALIGNED-NEXT: global_store_byte v[16:17], v82, off offset:51
; ALIGNED-NEXT: global_store_byte v[16:17], v18, off offset:49
-; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:43
-; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:41
-; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:47
-; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:45
-; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:35
-; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:33
-; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:39
-; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:37
+; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:47
+; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:45
+; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:43
+; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:41
+; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:39
+; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:37
+; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:35
+; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:33
; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:31
; ALIGNED-NEXT: global_store_byte v[16:17], v11, off offset:29
; ALIGNED-NEXT: global_store_byte v[16:17], v83, off offset:27
@@ -1738,13 +1738,13 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4)
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v68 offset:194
; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:192
; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v55
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v52
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v52
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v53
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v53
; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v55
; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:203
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v53
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v52
; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:201
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v53
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v52
; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:207
; ALIGNED-NEXT: s_waitcnt vmcnt(9)
; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v50
@@ -1792,18 +1792,18 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4)
; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v34
; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:177
; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v34
-; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:252
; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:248
+; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:252
; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:244
; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:240
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v54 offset:170
-; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:168
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v55 offset:174
; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:172
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v52 offset:162
-; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:160
+; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:168
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v53 offset:166
; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:164
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v52 offset:162
+; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:160
; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v35
; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v35
; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 24, v33
@@ -1816,15 +1816,15 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4)
; ALIGNED-NEXT: s_waitcnt vmcnt(6)
; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v31
; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5]
-; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:163
+; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:167
; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v31
-; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:161
+; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:165
; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v29
; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:175
; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v30
; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v30
-; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:167
-; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:165
+; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:163
+; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:161
; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v29
; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:200
; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:204
@@ -3753,40 +3753,40 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26
; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:30
; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:31
-; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:32
+; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:32
; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:33
; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:34
; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:29
; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28
; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27
-; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:35
-; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:36
+; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:35
+; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:36
; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:37
-; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:38
+; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:38
; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:39
; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:40
-; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:41
-; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:42
-; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:43
+; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:41
+; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:42
; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:44
-; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:45
-; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:46
-; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:47
+; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:45
+; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:46
+; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:47
; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:48
; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:49
; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:50
+; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:43
; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:51
; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:52
-; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:53
-; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:54
-; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:55
+; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:53
+; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:54
+; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:55
; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:56
; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:57
; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58
; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:60
-; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:61
+; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:61
; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:62
-; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:63
+; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:63
; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:64
; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:65
; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:66
@@ -3821,7 +3821,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: s_waitcnt vmcnt(49)
; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(48)
-; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
; ALIGNED-NEXT: s_waitcnt vmcnt(45)
@@ -3832,49 +3832,47 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v3, v9, 8, v5
; ALIGNED-NEXT: s_waitcnt vmcnt(41)
-; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 8, v6
; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v7
; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12
-; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14
-; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17
+; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v13
+; ALIGNED-NEXT: v_lshl_or_b32 v8, v18, 8, v17
; ALIGNED-NEXT: s_waitcnt vmcnt(40)
-; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13
+; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v14
; ALIGNED-NEXT: s_waitcnt vmcnt(38)
-; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18
-; ALIGNED-NEXT: s_waitcnt vmcnt(36)
-; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22
-; ALIGNED-NEXT: s_waitcnt vmcnt(34)
-; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v25
-; ALIGNED-NEXT: s_waitcnt vmcnt(32)
-; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21
-; ALIGNED-NEXT: s_waitcnt vmcnt(30)
-; ALIGNED-NEXT: v_lshl_or_b32 v14, v27, 8, v26
+; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v19
+; ALIGNED-NEXT: s_waitcnt vmcnt(33)
+; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v21
+; ALIGNED-NEXT: v_lshl_or_b32 v13, v26, 8, v22
+; ALIGNED-NEXT: s_waitcnt vmcnt(31)
+; ALIGNED-NEXT: v_lshl_or_b32 v12, v25, 8, v24
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v6, 16, v5
+; ALIGNED-NEXT: s_waitcnt vmcnt(27)
+; ALIGNED-NEXT: v_lshl_or_b32 v14, v27, 8, v28
; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 16, v7
; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 16, v9
; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11
+; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13
-; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(28)
; ALIGNED-NEXT: v_lshl_or_b32 v15, v31, 8, v30
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(26)
; ALIGNED-NEXT: v_lshl_or_b32 v0, v34, 8, v33
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(24)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v37, 8, v32
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v35, 8, v32
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(22)
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v36, 8, v35
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v37, 8, v36
; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(17)
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v50, 8, v38
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v49, 8, v38
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(15)
-; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v39
+; ALIGNED-NEXT: v_lshl_or_b32 v5, v50, 8, v39
; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v48
; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
@@ -3900,7 +3898,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86
; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:82
-; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(7)
@@ -3911,34 +3909,34 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(7)
; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71
-; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:73
-; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:72
+; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
@@ -4258,15 +4256,15 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_clause 0x4
-; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:156
+; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:156
; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:157
-; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:158
+; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:158
; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:159
; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:155
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 8, v121
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 8, v120
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v107, 8, v108
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v107, 8, v110
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_clause 0x2
@@ -4405,47 +4403,47 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: v_lshl_or_b32 v4, v71, 8, v68
; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x4
-; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:212
-; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:213
-; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:214
-; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:215
-; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:211
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v54, 8, v67
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v52, 8, v65
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_clause 0x7
-; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:216
-; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:217
-; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:218
-; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:219
-; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:220
-; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:221
-; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:222
-; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:223
+; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:208
+; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:209
+; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:210
+; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:211
+; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:212
+; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:213
+; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:214
+; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:215
; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v53, 8, v66
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v65, 8, v67
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v48, 8, v49
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v54, 8, v53
; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v64
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v55, 8, v66
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v39, 8, v50
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v52, 8, v64
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x4
+; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:220
+; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:221
+; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:222
+; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:223
+; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:219
+; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v49, 8, v51
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v48, 8, v50
; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:208
-; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:209
-; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:210
+; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:216
+; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:217
+; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:218
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v36, 8, v38
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v37, 8, v38
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v55, 8, v37
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v39, 8, v36
; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_clause 0x7
@@ -4549,14 +4547,14 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_clause 0x4
; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:12
-; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:13
-; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:14
-; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:15
+; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:13
+; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:14
+; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:15
; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:11
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: v_lshl_or_b32 v44, v111, 8, v122
+; ALIGNED-NEXT: v_lshl_or_b32 v44, v121, 8, v122
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v58, v110, 8, v120
+; ALIGNED-NEXT: v_lshl_or_b32 v58, v108, 8, v111
; ALIGNED-NEXT: v_lshl_or_b32 v0, v58, 16, v44
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_clause 0x2
@@ -4633,32 +4631,32 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: flat_store_byte v[3:4], v35 offset:224
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196
-; ALIGNED-NEXT: flat_store_byte v[3:4], v54 offset:213
-; ALIGNED-NEXT: flat_store_byte v[3:4], v52 offset:215
-; ALIGNED-NEXT: flat_store_byte v[3:4], v36 offset:209
-; ALIGNED-NEXT: flat_store_byte v[3:4], v55 offset:211
-; ALIGNED-NEXT: flat_store_byte v[3:4], v37 offset:210
-; ALIGNED-NEXT: flat_store_byte v[3:4], v65 offset:214
-; ALIGNED-NEXT: flat_store_byte v[3:4], v67 offset:212
-; ALIGNED-NEXT: flat_store_byte v[3:4], v49 offset:218
-; ALIGNED-NEXT: flat_store_byte v[3:4], v48 offset:219
-; ALIGNED-NEXT: flat_store_byte v[3:4], v53 offset:217
-; ALIGNED-NEXT: flat_store_byte v[3:4], v39 offset:223
-; ALIGNED-NEXT: flat_store_byte v[3:4], v51 offset:221
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192
+; ALIGNED-NEXT: flat_store_byte v[3:4], v36 offset:218
+; ALIGNED-NEXT: flat_store_byte v[3:4], v39 offset:219
+; ALIGNED-NEXT: flat_store_byte v[3:4], v37 offset:217
+; ALIGNED-NEXT: flat_store_byte v[3:4], v48 offset:223
+; ALIGNED-NEXT: flat_store_byte v[3:4], v49 offset:221
; ALIGNED-NEXT: flat_store_byte v[3:4], v50 offset:222
-; ALIGNED-NEXT: flat_store_byte v[3:4], v64 offset:220
-; ALIGNED-NEXT: flat_store_byte v[3:4], v66 offset:216
-; ALIGNED-NEXT: flat_store_byte v[3:4], v38 offset:208
+; ALIGNED-NEXT: flat_store_byte v[3:4], v51 offset:220
+; ALIGNED-NEXT: flat_store_byte v[3:4], v38 offset:216
+; ALIGNED-NEXT: flat_store_byte v[3:4], v53 offset:210
+; ALIGNED-NEXT: flat_store_byte v[3:4], v54 offset:211
+; ALIGNED-NEXT: flat_store_byte v[3:4], v65 offset:209
+; ALIGNED-NEXT: flat_store_byte v[3:4], v52 offset:215
+; ALIGNED-NEXT: flat_store_byte v[3:4], v55 offset:213
+; ALIGNED-NEXT: flat_store_byte v[3:4], v64 offset:214
+; ALIGNED-NEXT: flat_store_byte v[3:4], v66 offset:212
+; ALIGNED-NEXT: flat_store_byte v[3:4], v67 offset:208
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216
@@ -4760,8 +4758,8 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: flat_store_byte v[3:4], v93 offset:153
; ALIGNED-NEXT: flat_store_byte v[3:4], v107 offset:159
; ALIGNED-NEXT: flat_store_byte v[3:4], v109 offset:157
-; ALIGNED-NEXT: flat_store_byte v[3:4], v108 offset:158
-; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:156
+; ALIGNED-NEXT: flat_store_byte v[3:4], v110 offset:158
+; ALIGNED-NEXT: flat_store_byte v[3:4], v120 offset:156
; ALIGNED-NEXT: flat_store_byte v[3:4], v105 offset:152
; ALIGNED-NEXT: flat_store_byte v[3:4], v125 offset:146
; ALIGNED-NEXT: flat_store_byte v[3:4], v126 offset:147
@@ -5093,9 +5091,6 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:61
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:58
@@ -5105,9 +5100,12 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:57
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:63
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:61
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:62
@@ -5117,21 +5115,21 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:56
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:53
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:50
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:51
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:49
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:55
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:53
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:54
@@ -5143,10 +5141,10 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:48
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436
@@ -5155,52 +5153,52 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:43
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:42
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:41
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:40
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:43
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:47
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:41
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:46
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:47
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:45
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:46
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:44
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:35
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:40
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:34
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:33
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:35
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:32
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:33
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:39
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:37
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:38
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:37
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:36
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:36
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:32
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:392
@@ -5267,10 +5265,10 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400
; ALIGNED-NEXT: flat_store_byte v[3:4], v90 offset:10
; ALIGNED-NEXT: flat_store_byte v[3:4], v94 offset:11
-; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:13
; ALIGNED-NEXT: flat_store_byte v[3:4], v92 offset:9
-; ALIGNED-NEXT: flat_store_byte v[3:4], v110 offset:15
-; ALIGNED-NEXT: flat_store_byte v[3:4], v120 offset:14
+; ALIGNED-NEXT: flat_store_byte v[3:4], v108 offset:15
+; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:13
+; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:14
; ALIGNED-NEXT: flat_store_byte v[3:4], v122 offset:12
; ALIGNED-NEXT: flat_store_byte v[3:4], v95 offset:8
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Reload
@@ -5888,22 +5886,22 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280
; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280
-; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284
-; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272
-; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276
+; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:284
+; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:280
+; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:276
+; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v15 offset:42
-; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:40
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v15 offset:46
+; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:44
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v14 offset:46
-; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:44
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v14 offset:42
+; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:40
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v13 offset:34
-; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:32
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v13 offset:38
+; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:36
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v12 offset:38
-; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:36
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v12 offset:34
+; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:32
; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224
; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:228
; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232
@@ -6146,14 +6144,14 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:53
; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:51
; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:49
-; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:43
-; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:41
-; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:47
-; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:45
-; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:35
-; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:33
-; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:39
-; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:37
+; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:47
+; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:45
+; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:43
+; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:41
+; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:39
+; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:37
+; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:35
+; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:33
; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:31
; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:29
; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:27
@@ -6517,22 +6515,22 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:536
; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:540
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:536
-; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:540
-; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:528
-; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:532
+; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:540
+; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:536
+; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:532
+; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:528
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v15 offset:42
-; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:40
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v15 offset:46
+; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:44
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v14 offset:46
-; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:44
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v14 offset:42
+; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:40
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v13 offset:34
-; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:32
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v13 offset:38
+; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:36
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v12 offset:38
-; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:36
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v12 offset:34
+; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:32
; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480
; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:484
; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:488
@@ -6775,14 +6773,14 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:53
; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:51
; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:49
-; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:43
-; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:41
-; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:47
-; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:45
-; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:35
-; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:33
-; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:39
-; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:37
+; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:47
+; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:45
+; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:43
+; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:41
+; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:39
+; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:37
+; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:35
+; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:33
; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:31
; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:29
; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:27
@@ -7389,22 +7387,22 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1
; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280
; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280
-; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284
-; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272
-; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276
+; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:284
+; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:280
+; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:276
+; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v15, off offset:42
-; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:40
+; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v15, off offset:46
+; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:44
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v14, off offset:46
-; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:44
+; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v14, off offset:42
+; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:40
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v13, off offset:34
-; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:32
+; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v13, off offset:38
+; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:36
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v12, off offset:38
-; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:36
+; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v12, off offset:34
+; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:32
; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224
; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:228
; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232
@@ -7647,14 +7645,14 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1
; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:53
; ALIGNED-NEXT: global_store_byte v[16:17], v39, off offset:51
; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:49
-; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:43
-; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:41
-; ALIGNED-NEXT: global_store_byte v[16:17], v38, off offset:47
-; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:45
-; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:35
-; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:33
-; ALIGNED-NEXT: global_store_byte v[16:17], v53, off offset:39
-; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:37
+; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:47
+; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:45
+; ALIGNED-NEXT: global_store_byte v[16:17], v38, off offset:43
+; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:41
+; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:39
+; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:37
+; ALIGNED-NEXT: global_store_byte v[16:17], v53, off offset:35
+; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:33
; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:31
; ALIGNED-NEXT: global_store_byte v[16:17], v11, off offset:29
; ALIGNED-NEXT: global_store_byte v[16:17], v52, off offset:27
@@ -8016,22 +8014,22 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1
; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:536
; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:540
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:536
-; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:540
-; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:528
-; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:532
+; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:540
+; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:536
+; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:532
+; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:528
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v15, off offset:42
-; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:40
+; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v15, off offset:46
+; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:44
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v14, off offset:46
-; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:44
+; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v14, off offset:42
+; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:40
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v13, off offset:34
-; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:32
+; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v13, off offset:38
+; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:36
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v12, off offset:38
-; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:36
+; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v12, off offset:34
+; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:32
; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480
; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:484
; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:488
@@ -8274,14 +8272,14 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1
; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:53
; ALIGNED-NEXT: global_store_byte v[16:17], v82, off offset:51
; ALIGNED-NEXT: global_store_byte v[16:17], v18, off offset:49
-; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:43
-; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:41
-; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:47
-; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:45
-; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:35
-; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:33
-; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:39
-; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:37
+; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:47
+; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:45
+; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:43
+; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:41
+; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:39
+; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:37
+; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:35
+; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:33
; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:31
; ALIGNED-NEXT: global_store_byte v[16:17], v11, off offset:29
; ALIGNED-NEXT: global_store_byte v[16:17], v83, off offset:27
@@ -8708,14 +8706,14 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4
; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:196
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v68 offset:194
; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:192
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v52
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v52
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v53
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v53
; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:203
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v53
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v52
; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v55
; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v55
; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:201
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v53
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v52
; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:207
; ALIGNED-NEXT: s_waitcnt vmcnt(9)
; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v50
@@ -8763,18 +8761,18 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4
; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v34
; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:177
; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v34
-; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:252
; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:248
+; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:252
; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:244
; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:240
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v54 offset:170
-; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:168
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v55 offset:174
; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:172
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v52 offset:162
-; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:160
+; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:168
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v53 offset:166
; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:164
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v52 offset:162
+; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:160
; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v35
; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v35
; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v33
@@ -8783,18 +8781,18 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4
; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v32
; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:169
; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v32
-; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:163
+; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:167
; ALIGNED-NEXT: s_waitcnt vmcnt(6)
; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v31
-; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:161
+; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:165
; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v31
-; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:167
+; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:163
; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v29
; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:175
; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v30
; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:173
; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v30
-; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:165
+; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:161
; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v29
; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:200
; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:204
@@ -9207,13 +9205,13 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v68 offset:194
; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:192
; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v55
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v52
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v52
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v53
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v53
; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v55
; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:203
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v53
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v52
; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:201
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v53
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v52
; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:207
; ALIGNED-NEXT: s_waitcnt vmcnt(9)
; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v50
@@ -9261,18 +9259,18 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4
; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v34
; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:177
; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v34
-; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:508
; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:504
+; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:508
; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:500
; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:496
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v54 offset:170
-; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:168
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v55 offset:174
; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:172
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v52 offset:162
-; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:160
+; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:168
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v53 offset:166
; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:164
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v52 offset:162
+; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:160
; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v35
; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v35
; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 24, v33
@@ -9284,15 +9282,15 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4
; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:173
; ALIGNED-NEXT: s_waitcnt vmcnt(6)
; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v31
-; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:163
+; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:167
; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v31
-; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:161
+; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:165
; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v29
; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:175
; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v30
; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v30
-; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:167
-; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:165
+; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:163
+; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:161
; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v29
; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:456
; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:460
@@ -12741,42 +12739,42 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26
; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:30
; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:31
-; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:32
+; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:32
; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:33
; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:34
; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:29
; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28
; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27
-; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:35
-; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:36
+; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:35
+; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:36
; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:37
-; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:38
+; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:38
; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:39
-; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:40
-; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:41
-; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:42
-; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:43
-; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:44
-; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:45
-; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:46
-; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:47
+; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:40
+; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:41
+; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:42
+; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:46
+; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:47
; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:48
; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:49
-; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:50
-; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:51
+; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:50
+; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:45
+; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:44
+; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:43
+; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:51
; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:52
-; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:53
-; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:54
-; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:55
+; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:53
+; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:54
+; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:55
; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:56
; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:57
; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58
; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:62
-; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:63
+; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:63
; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:64
; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:65
; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:66
-; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:61
+; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:61
; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:60
; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:59
; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:67
@@ -12809,7 +12807,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: s_waitcnt vmcnt(49)
; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(48)
-; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
; ALIGNED-NEXT: s_waitcnt vmcnt(45)
@@ -12820,48 +12818,46 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v3, v9, 8, v5
; ALIGNED-NEXT: s_waitcnt vmcnt(41)
-; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 8, v6
; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v7
; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12
-; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14
-; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17
+; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v13
+; ALIGNED-NEXT: v_lshl_or_b32 v8, v18, 8, v17
; ALIGNED-NEXT: s_waitcnt vmcnt(40)
-; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13
+; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v14
; ALIGNED-NEXT: s_waitcnt vmcnt(38)
-; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18
-; ALIGNED-NEXT: s_waitcnt vmcnt(36)
-; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22
-; ALIGNED-NEXT: s_waitcnt vmcnt(34)
-; ALIGNED-NEXT: v_lshl_or_b32 v12, v27, 8, v25
-; ALIGNED-NEXT: s_waitcnt vmcnt(32)
-; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21
-; ALIGNED-NEXT: s_waitcnt vmcnt(30)
-; ALIGNED-NEXT: v_lshl_or_b32 v14, v28, 8, v26
+; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v19
+; ALIGNED-NEXT: s_waitcnt vmcnt(33)
+; ALIGNED-NEXT: v_lshl_or_b32 v12, v24, 8, v22
+; ALIGNED-NEXT: v_lshl_or_b32 v13, v26, 8, v23
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v6, 16, v5
+; ALIGNED-NEXT: s_waitcnt vmcnt(28)
+; ALIGNED-NEXT: v_lshl_or_b32 v11, v25, 8, v21
+; ALIGNED-NEXT: s_waitcnt vmcnt(27)
+; ALIGNED-NEXT: v_lshl_or_b32 v14, v27, 8, v28
; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 16, v7
; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 16, v9
+; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11
; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13
-; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(28)
; ALIGNED-NEXT: v_lshl_or_b32 v15, v30, 8, v29
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(26)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v32, 8, v34
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v33, 8, v35
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(24)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v36, 8, v31
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v32, 8, v31
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(22)
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v35, 8, v33
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v36, 8, v34
; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(12)
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v48, 8, v37
-; ALIGNED-NEXT: v_lshl_or_b32 v5, v39, 8, v38
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v39, 8, v37
+; ALIGNED-NEXT: v_lshl_or_b32 v5, v48, 8, v38
; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v6, v50, 8, v49
; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
@@ -12887,7 +12883,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86
; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:82
-; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(7)
@@ -12898,35 +12894,35 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(7)
; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71
-; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:73
; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:72
-; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
@@ -13199,7 +13195,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:138
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:137
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill
@@ -13388,47 +13384,47 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: v_lshl_or_b32 v4, v81, 8, v69
; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x4
-; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:212
-; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:213
-; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:214
-; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:215
-; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:211
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v54, 8, v68
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v53, 8, v66
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_clause 0x7
-; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:216
-; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:217
-; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:218
-; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:219
-; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:220
-; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:221
-; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:222
-; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:223
+; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:208
+; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:209
+; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:210
+; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:211
+; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:212
+; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:213
+; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:214
+; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:215
; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v64, 8, v67
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v68
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v50, 8, v49
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v54, 8, v55
; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v65
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v65, 8, v67
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v48, 8, v52
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v53, 8, v64
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x4
+; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:220
+; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:221
+; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:222
+; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:223
+; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:219
+; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v52
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v49, 8, v50
; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:208
-; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:209
-; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:210
+; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:216
+; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:217
+; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:218
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v37, 8, v39
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v38, 8, v48
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v55, 8, v38
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v39, 8, v37
; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_clause 0x7
@@ -13529,24 +13525,24 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: v_lshl_or_b32 v0, v91, 16, v77
; ALIGNED-NEXT: v_lshl_or_b32 v77, v123, 8, v106
; ALIGNED-NEXT: v_lshl_or_b32 v91, v3, 8, v125
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:13
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:14
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v91, 16, v77
; ALIGNED-NEXT: buffer_load_ubyte v91, v2, s[0:3], 0 offen offset:15
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:14
+; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:13
; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:11
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: v_lshl_or_b32 v77, v3, 8, v1
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v91, v91, 8, v3
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v91, v91, 8, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v77, v0, 8, v1
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v91, 16, v77
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_clause 0x2
@@ -13623,32 +13619,32 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: flat_store_byte v[3:4], v36 offset:224
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196
-; ALIGNED-NEXT: flat_store_byte v[3:4], v54 offset:213
-; ALIGNED-NEXT: flat_store_byte v[3:4], v53 offset:215
-; ALIGNED-NEXT: flat_store_byte v[3:4], v37 offset:209
-; ALIGNED-NEXT: flat_store_byte v[3:4], v55 offset:211
-; ALIGNED-NEXT: flat_store_byte v[3:4], v38 offset:210
-; ALIGNED-NEXT: flat_store_byte v[3:4], v66 offset:214
-; ALIGNED-NEXT: flat_store_byte v[3:4], v68 offset:212
-; ALIGNED-NEXT: flat_store_byte v[3:4], v49 offset:218
-; ALIGNED-NEXT: flat_store_byte v[3:4], v50 offset:219
-; ALIGNED-NEXT: flat_store_byte v[3:4], v64 offset:217
-; ALIGNED-NEXT: flat_store_byte v[3:4], v48 offset:223
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192
+; ALIGNED-NEXT: flat_store_byte v[3:4], v37 offset:218
+; ALIGNED-NEXT: flat_store_byte v[3:4], v39 offset:219
+; ALIGNED-NEXT: flat_store_byte v[3:4], v38 offset:217
+; ALIGNED-NEXT: flat_store_byte v[3:4], v49 offset:223
; ALIGNED-NEXT: flat_store_byte v[3:4], v51 offset:221
-; ALIGNED-NEXT: flat_store_byte v[3:4], v52 offset:222
-; ALIGNED-NEXT: flat_store_byte v[3:4], v65 offset:220
-; ALIGNED-NEXT: flat_store_byte v[3:4], v67 offset:216
-; ALIGNED-NEXT: flat_store_byte v[3:4], v39 offset:208
+; ALIGNED-NEXT: flat_store_byte v[3:4], v50 offset:222
+; ALIGNED-NEXT: flat_store_byte v[3:4], v52 offset:220
+; ALIGNED-NEXT: flat_store_byte v[3:4], v48 offset:216
+; ALIGNED-NEXT: flat_store_byte v[3:4], v55 offset:210
+; ALIGNED-NEXT: flat_store_byte v[3:4], v54 offset:211
+; ALIGNED-NEXT: flat_store_byte v[3:4], v66 offset:209
+; ALIGNED-NEXT: flat_store_byte v[3:4], v53 offset:215
+; ALIGNED-NEXT: flat_store_byte v[3:4], v65 offset:213
+; ALIGNED-NEXT: flat_store_byte v[3:4], v64 offset:214
+; ALIGNED-NEXT: flat_store_byte v[3:4], v67 offset:212
+; ALIGNED-NEXT: flat_store_byte v[3:4], v68 offset:208
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216
@@ -13780,7 +13776,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:138
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:139
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload
@@ -14077,9 +14073,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:61
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:58
@@ -14092,31 +14085,34 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:63
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:61
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:62
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:60
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:56
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:53
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:50
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:51
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:49
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:55
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:53
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:54
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload
@@ -14127,10 +14123,10 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:48
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436
@@ -14139,53 +14135,53 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:43
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:42
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:41
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:40
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:43
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:47
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:41
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:46
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:47
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:45
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:46
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:44
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:35
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:40
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:34
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:33
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:35
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:32
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:33
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:39
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:37
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:38
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:37
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:36
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:36
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:32
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:392
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
@@ -14251,15 +14247,15 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400
; ALIGNED-NEXT: flat_store_byte v[3:4], v123 offset:10
; ALIGNED-NEXT: flat_store_byte v[3:4], v126 offset:11
+; ALIGNED-NEXT: flat_store_byte v[3:4], v125 offset:9
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:15
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:13
-; ALIGNED-NEXT: flat_store_byte v[3:4], v125 offset:9
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:15
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:14
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
@@ -14310,40 +14306,40 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:26
; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:30
; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:31
-; ALIGNED-NEXT: buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:32
+; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:32
; ALIGNED-NEXT: buffer_load_ubyte v15, v4, s[0:3], 0 offen offset:33
; ALIGNED-NEXT: buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:34
; ALIGNED-NEXT: buffer_load_ubyte v9, v4, s[0:3], 0 offen offset:29
; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:28
; ALIGNED-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:27
-; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:35
-; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:36
+; ALIGNED-NEXT: buffer_load_ubyte v18, v4, s[0:3], 0 offen offset:35
+; ALIGNED-NEXT: buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:36
; ALIGNED-NEXT: buffer_load_ubyte v16, v4, s[0:3], 0 offen offset:37
-; ALIGNED-NEXT: buffer_load_ubyte v18, v4, s[0:3], 0 offen offset:38
+; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:38
; ALIGNED-NEXT: buffer_load_ubyte v20, v4, s[0:3], 0 offen offset:39
; ALIGNED-NEXT: buffer_load_ubyte v22, v4, s[0:3], 0 offen offset:40
-; ALIGNED-NEXT: buffer_load_ubyte v23, v4, s[0:3], 0 offen offset:41
-; ALIGNED-NEXT: buffer_load_ubyte v25, v4, s[0:3], 0 offen offset:42
-; ALIGNED-NEXT: buffer_load_ubyte v28, v4, s[0:3], 0 offen offset:43
+; ALIGNED-NEXT: buffer_load_ubyte v26, v4, s[0:3], 0 offen offset:41
+; ALIGNED-NEXT: buffer_load_ubyte v28, v4, s[0:3], 0 offen offset:42
; ALIGNED-NEXT: buffer_load_ubyte v21, v4, s[0:3], 0 offen offset:44
-; ALIGNED-NEXT: buffer_load_ubyte v24, v4, s[0:3], 0 offen offset:45
-; ALIGNED-NEXT: buffer_load_ubyte v26, v4, s[0:3], 0 offen offset:46
-; ALIGNED-NEXT: buffer_load_ubyte v27, v4, s[0:3], 0 offen offset:47
+; ALIGNED-NEXT: buffer_load_ubyte v23, v4, s[0:3], 0 offen offset:45
+; ALIGNED-NEXT: buffer_load_ubyte v24, v4, s[0:3], 0 offen offset:46
+; ALIGNED-NEXT: buffer_load_ubyte v25, v4, s[0:3], 0 offen offset:47
; ALIGNED-NEXT: buffer_load_ubyte v30, v4, s[0:3], 0 offen offset:48
; ALIGNED-NEXT: buffer_load_ubyte v31, v4, s[0:3], 0 offen offset:49
; ALIGNED-NEXT: buffer_load_ubyte v33, v4, s[0:3], 0 offen offset:50
+; ALIGNED-NEXT: buffer_load_ubyte v27, v4, s[0:3], 0 offen offset:43
; ALIGNED-NEXT: buffer_load_ubyte v34, v4, s[0:3], 0 offen offset:51
; ALIGNED-NEXT: buffer_load_ubyte v32, v4, s[0:3], 0 offen offset:52
-; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:53
-; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:54
-; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:55
+; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:53
+; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:54
+; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:55
; ALIGNED-NEXT: buffer_load_ubyte v48, v4, s[0:3], 0 offen offset:56
; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:57
; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:58
; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:60
-; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:61
+; ALIGNED-NEXT: buffer_load_ubyte v49, v4, s[0:3], 0 offen offset:61
; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:62
-; ALIGNED-NEXT: buffer_load_ubyte v49, v4, s[0:3], 0 offen offset:63
+; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:63
; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:64
; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:65
; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:66
@@ -14379,7 +14375,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: s_waitcnt vmcnt(50)
; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(49)
-; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2
; ALIGNED-NEXT: s_waitcnt vmcnt(46)
@@ -14390,49 +14386,47 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v2, v9, 8, v5
; ALIGNED-NEXT: s_waitcnt vmcnt(42)
-; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v3, v8, 8, v6
; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v7
; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12
-; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14
-; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17
+; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v13
+; ALIGNED-NEXT: v_lshl_or_b32 v8, v18, 8, v17
; ALIGNED-NEXT: s_waitcnt vmcnt(41)
-; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13
+; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v14
; ALIGNED-NEXT: s_waitcnt vmcnt(39)
-; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18
-; ALIGNED-NEXT: s_waitcnt vmcnt(37)
-; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22
-; ALIGNED-NEXT: s_waitcnt vmcnt(35)
-; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v25
-; ALIGNED-NEXT: s_waitcnt vmcnt(33)
-; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21
-; ALIGNED-NEXT: s_waitcnt vmcnt(31)
-; ALIGNED-NEXT: v_lshl_or_b32 v14, v27, 8, v26
+; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v19
+; ALIGNED-NEXT: s_waitcnt vmcnt(34)
+; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v21
+; ALIGNED-NEXT: v_lshl_or_b32 v13, v26, 8, v22
+; ALIGNED-NEXT: s_waitcnt vmcnt(32)
+; ALIGNED-NEXT: v_lshl_or_b32 v12, v25, 8, v24
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2
; ALIGNED-NEXT: v_lshl_or_b32 v2, v6, 16, v5
+; ALIGNED-NEXT: s_waitcnt vmcnt(28)
+; ALIGNED-NEXT: v_lshl_or_b32 v14, v27, 8, v28
; ALIGNED-NEXT: v_lshl_or_b32 v3, v8, 16, v7
; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 16, v9
; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11
+; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13
-; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(29)
; ALIGNED-NEXT: v_lshl_or_b32 v15, v31, 8, v30
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(27)
; ALIGNED-NEXT: v_lshl_or_b32 v0, v34, 8, v33
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(25)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v37, 8, v32
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v35, 8, v32
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(23)
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v36, 8, v35
+; ALIGNED-NEXT: v_lshl_or_b32 v2, v37, 8, v36
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(18)
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v50, 8, v38
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v49, 8, v38
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(16)
-; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v39
+; ALIGNED-NEXT: v_lshl_or_b32 v5, v50, 8, v39
; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v48
; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
@@ -14458,7 +14452,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:86
; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:82
-; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(8)
@@ -14469,34 +14463,34 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(8)
; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71
-; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:73
-; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:72
+; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
@@ -14819,12 +14813,12 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:156
; ALIGNED-NEXT: buffer_load_ubyte v111, v4, s[0:3], 0 offen offset:157
; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:158
-; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:159
+; ALIGNED-NEXT: buffer_load_ubyte v108, v4, s[0:3], 0 offen offset:159
; ALIGNED-NEXT: buffer_load_ubyte v106, v4, s[0:3], 0 offen offset:155
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
; ALIGNED-NEXT: v_lshl_or_b32 v0, v111, 8, v122
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v109, 8, v120
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v108, 8, v120
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_clause 0x2
@@ -14964,47 +14958,47 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: v_lshl_or_b32 v3, v71, 8, v68
; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x4
-; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:212
-; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:213
-; ALIGNED-NEXT: buffer_load_ubyte v65, v4, s[0:3], 0 offen offset:214
-; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:215
-; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:211
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v54, 8, v67
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v52, 8, v65
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_clause 0x7
-; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:216
-; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:217
-; ALIGNED-NEXT: buffer_load_ubyte v49, v4, s[0:3], 0 offen offset:218
-; ALIGNED-NEXT: buffer_load_ubyte v48, v4, s[0:3], 0 offen offset:219
-; ALIGNED-NEXT: buffer_load_ubyte v64, v4, s[0:3], 0 offen offset:220
-; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:221
-; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:222
-; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:223
+; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:208
+; ALIGNED-NEXT: buffer_load_ubyte v65, v4, s[0:3], 0 offen offset:209
+; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:210
+; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:211
+; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:212
+; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:213
+; ALIGNED-NEXT: buffer_load_ubyte v64, v4, s[0:3], 0 offen offset:214
+; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:215
; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v53, 8, v66
+; ALIGNED-NEXT: v_lshl_or_b32 v2, v65, 8, v67
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v48, 8, v49
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v54, 8, v53
; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v51, 8, v64
+; ALIGNED-NEXT: v_lshl_or_b32 v2, v55, 8, v66
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v50
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v52, 8, v64
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x4
+; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:220
+; ALIGNED-NEXT: buffer_load_ubyte v49, v4, s[0:3], 0 offen offset:221
+; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:222
+; ALIGNED-NEXT: buffer_load_ubyte v48, v4, s[0:3], 0 offen offset:223
+; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:219
+; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: v_lshl_or_b32 v2, v49, 8, v51
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v48, 8, v50
; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:208
-; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:209
-; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:210
+; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:216
+; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:217
+; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:218
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v36, 8, v38
+; ALIGNED-NEXT: v_lshl_or_b32 v2, v37, 8, v38
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v55, 8, v37
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v36
; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_clause 0x7
@@ -15108,14 +15102,14 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_clause 0x4
; ALIGNED-NEXT: buffer_load_ubyte v121, v4, s[0:3], 0 offen offset:12
-; ALIGNED-NEXT: buffer_load_ubyte v107, v4, s[0:3], 0 offen offset:13
+; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:13
; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:14
-; ALIGNED-NEXT: buffer_load_ubyte v108, v4, s[0:3], 0 offen offset:15
+; ALIGNED-NEXT: buffer_load_ubyte v107, v4, s[0:3], 0 offen offset:15
; ALIGNED-NEXT: buffer_load_ubyte v93, v4, s[0:3], 0 offen offset:11
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: v_lshl_or_b32 v43, v107, 8, v121
+; ALIGNED-NEXT: v_lshl_or_b32 v43, v109, 8, v121
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v57, v108, 8, v110
+; ALIGNED-NEXT: v_lshl_or_b32 v57, v107, 8, v110
; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 16, v43
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_clause 0x2
@@ -15192,32 +15186,32 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: flat_store_byte v[2:3], v35 offset:224
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452
-; ALIGNED-NEXT: flat_store_byte v[2:3], v54 offset:213
-; ALIGNED-NEXT: flat_store_byte v[2:3], v52 offset:215
-; ALIGNED-NEXT: flat_store_byte v[2:3], v36 offset:209
-; ALIGNED-NEXT: flat_store_byte v[2:3], v55 offset:211
-; ALIGNED-NEXT: flat_store_byte v[2:3], v37 offset:210
-; ALIGNED-NEXT: flat_store_byte v[2:3], v65 offset:214
-; ALIGNED-NEXT: flat_store_byte v[2:3], v67 offset:212
-; ALIGNED-NEXT: flat_store_byte v[2:3], v49 offset:218
-; ALIGNED-NEXT: flat_store_byte v[2:3], v48 offset:219
-; ALIGNED-NEXT: flat_store_byte v[2:3], v53 offset:217
-; ALIGNED-NEXT: flat_store_byte v[2:3], v39 offset:223
-; ALIGNED-NEXT: flat_store_byte v[2:3], v51 offset:221
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448
+; ALIGNED-NEXT: flat_store_byte v[2:3], v36 offset:218
+; ALIGNED-NEXT: flat_store_byte v[2:3], v39 offset:219
+; ALIGNED-NEXT: flat_store_byte v[2:3], v37 offset:217
+; ALIGNED-NEXT: flat_store_byte v[2:3], v48 offset:223
+; ALIGNED-NEXT: flat_store_byte v[2:3], v49 offset:221
; ALIGNED-NEXT: flat_store_byte v[2:3], v50 offset:222
-; ALIGNED-NEXT: flat_store_byte v[2:3], v64 offset:220
-; ALIGNED-NEXT: flat_store_byte v[2:3], v66 offset:216
-; ALIGNED-NEXT: flat_store_byte v[2:3], v38 offset:208
+; ALIGNED-NEXT: flat_store_byte v[2:3], v51 offset:220
+; ALIGNED-NEXT: flat_store_byte v[2:3], v38 offset:216
+; ALIGNED-NEXT: flat_store_byte v[2:3], v53 offset:210
+; ALIGNED-NEXT: flat_store_byte v[2:3], v54 offset:211
+; ALIGNED-NEXT: flat_store_byte v[2:3], v65 offset:209
+; ALIGNED-NEXT: flat_store_byte v[2:3], v52 offset:215
+; ALIGNED-NEXT: flat_store_byte v[2:3], v55 offset:213
+; ALIGNED-NEXT: flat_store_byte v[2:3], v64 offset:214
+; ALIGNED-NEXT: flat_store_byte v[2:3], v66 offset:212
+; ALIGNED-NEXT: flat_store_byte v[2:3], v67 offset:208
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472
@@ -15317,7 +15311,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: flat_store_byte v[2:3], v92 offset:154
; ALIGNED-NEXT: flat_store_byte v[2:3], v106 offset:155
; ALIGNED-NEXT: flat_store_byte v[2:3], v94 offset:153
-; ALIGNED-NEXT: flat_store_byte v[2:3], v109 offset:159
+; ALIGNED-NEXT: flat_store_byte v[2:3], v108 offset:159
; ALIGNED-NEXT: flat_store_byte v[2:3], v111 offset:157
; ALIGNED-NEXT: flat_store_byte v[2:3], v120 offset:158
; ALIGNED-NEXT: flat_store_byte v[2:3], v122 offset:156
@@ -15656,9 +15650,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:61
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:58
@@ -15668,9 +15659,12 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:57
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:63
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:61
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:62
@@ -15680,21 +15674,21 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:56
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:53
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:50
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:51
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:49
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:55
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:53
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:54
@@ -15706,10 +15700,10 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:48
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692
@@ -15718,52 +15712,52 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:43
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:42
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:41
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:40
+; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:43
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:47
+; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:41
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:46
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:47
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:45
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:46
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:44
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:35
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:40
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:34
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:33
+; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:35
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:32
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:33
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:39
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:37
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:38
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:37
+; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:36
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:36
+; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:32
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648
@@ -15830,9 +15824,9 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656
; ALIGNED-NEXT: flat_store_byte v[2:3], v90 offset:10
; ALIGNED-NEXT: flat_store_byte v[2:3], v93 offset:11
-; ALIGNED-NEXT: flat_store_byte v[2:3], v107 offset:13
; ALIGNED-NEXT: flat_store_byte v[2:3], v91 offset:9
-; ALIGNED-NEXT: flat_store_byte v[2:3], v108 offset:15
+; ALIGNED-NEXT: flat_store_byte v[2:3], v107 offset:15
+; ALIGNED-NEXT: flat_store_byte v[2:3], v109 offset:13
; ALIGNED-NEXT: flat_store_byte v[2:3], v110 offset:14
; ALIGNED-NEXT: flat_store_byte v[2:3], v121 offset:12
; ALIGNED-NEXT: flat_store_byte v[2:3], v95 offset:8
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
index e316c022727ab..a34e4e4bb5f48 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
@@ -1326,6 +1326,8 @@ define {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} @vector_d
; CHECK-NEXT: srli a0, a0, 2
; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; CHECK-NEXT: vslidedown.vx v10, v8, a0
+; CHECK-NEXT: add a1, a0, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vx v8, v10, a0
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs2r.v v8, (a0)
@@ -1391,6 +1393,8 @@ define {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @ve
; CHECK-NEXT: srli a0, a0, 2
; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; CHECK-NEXT: vslidedown.vx v10, v8, a0
+; CHECK-NEXT: add a1, a0, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vx v8, v10, a0
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs2r.v v8, (a0)
@@ -1456,6 +1460,8 @@ define {<vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>} @vecto
; CHECK-NEXT: srli a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
; CHECK-NEXT: vslidedown.vx v10, v8, a0
+; CHECK-NEXT: add a1, a0, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
; CHECK-NEXT: vslideup.vx v8, v10, a0
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs2r.v v8, (a0)
@@ -1561,8 +1567,12 @@ define {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x
; CHECK-NEXT: srli a0, a0, 2
; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; CHECK-NEXT: vslidedown.vx v11, v9, a0
+; CHECK-NEXT: add a1, a0, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vx v9, v11, a0
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma
; CHECK-NEXT: vslidedown.vx v11, v8, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vx v8, v11, a0
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs4r.v v8, (a0)
@@ -1648,8 +1658,12 @@ define {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vs
; CHECK-NEXT: srli a0, a0, 2
; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; CHECK-NEXT: vslidedown.vx v11, v9, a0
+; CHECK-NEXT: add a1, a0, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vx v9, v11, a0
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma
; CHECK-NEXT: vslidedown.vx v11, v8, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vx v8, v11, a0
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs4r.v v8, (a0)
@@ -1735,8 +1749,12 @@ define {<vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscal
; CHECK-NEXT: srli a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
; CHECK-NEXT: vslidedown.vx v11, v9, a0
+; CHECK-NEXT: add a1, a0, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
; CHECK-NEXT: vslideup.vx v9, v11, a0
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma
; CHECK-NEXT: vslidedown.vx v11, v8, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
; CHECK-NEXT: vslideup.vx v8, v11, a0
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs4r.v v8, (a0)
@@ -1882,10 +1900,16 @@ define {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x
; CHECK-NEXT: srli a0, a0, 2
; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; CHECK-NEXT: vslidedown.vx v12, v9, a0
+; CHECK-NEXT: add a1, a0, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vx v9, v12, a0
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma
; CHECK-NEXT: vslidedown.vx v12, v8, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vx v8, v12, a0
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma
; CHECK-NEXT: vslidedown.vx v12, v10, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vx v10, v12, a0
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs4r.v v8, (a0)
@@ -1977,10 +2001,16 @@ define {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vs
; CHECK-NEXT: srli a0, a0, 2
; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; CHECK-NEXT: vslidedown.vx v12, v9, a0
+; CHECK-NEXT: add a1, a0, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vx v9, v12, a0
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma
; CHECK-NEXT: vslidedown.vx v12, v8, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vx v8, v12, a0
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma
; CHECK-NEXT: vslidedown.vx v12, v10, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vx v10, v12, a0
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs4r.v v8, (a0)
@@ -2072,10 +2102,16 @@ define {<vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscal
; CHECK-NEXT: srli a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
; CHECK-NEXT: vslidedown.vx v12, v9, a0
+; CHECK-NEXT: add a1, a0, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
; CHECK-NEXT: vslideup.vx v9, v12, a0
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma
; CHECK-NEXT: vslidedown.vx v12, v8, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
; CHECK-NEXT: vslideup.vx v8, v12, a0
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma
; CHECK-NEXT: vslidedown.vx v12, v10, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
; CHECK-NEXT: vslideup.vx v10, v12, a0
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs4r.v v8, (a0)
diff --git a/llvm/test/CodeGen/X86/pr40730.ll b/llvm/test/CodeGen/X86/pr40730.ll
index 164bf203d0545..cd7958f4f1d71 100644
--- a/llvm/test/CodeGen/X86/pr40730.ll
+++ b/llvm/test/CodeGen/X86/pr40730.ll
@@ -17,10 +17,14 @@ define <8 x i32> @shuffle_v8i32_0dcd3f14(<8 x i32> %a, <8 x i32> %b) {
}
; CHECK: .LCPI1_0:
-; CHECK-NEXT: .quad 0x0000000e0000000d
-; CHECK-NEXT: .quad 0x0000000e0000000d
-; CHECK-NEXT: .quad 0x0000001000000000
-; CHECK-NEXT: .zero 8
+; CHECK-NEXT: .zero 4
+; CHECK-NEXT: .zero 4
+; CHECK-NEXT: .zero 4
+; CHECK-NEXT: .zero 4
+; CHECK-NEXT: .long 13
+; CHECK-NEXT: .long 14
+; CHECK-NEXT: .zero 4
+; CHECK-NEXT: .long 16
define <8 x i32> @shuffle_v8i32_0dcd3f14_constant(<8 x i32> %a0) {
; CHECK-LABEL: shuffle_v8i32_0dcd3f14_constant:
@@ -29,7 +33,9 @@ define <8 x i32> @shuffle_v8i32_0dcd3f14_constant(<8 x i32> %a0) {
; CHECK-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,1,1,0]
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],mem[1,2,3],ymm0[4],mem[5],ymm0[6,7]
+; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; CHECK-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2]
+; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5],ymm0[6,7]
; CHECK-NEXT: retq
%res = shufflevector <8 x i32> %a0, <8 x i32> <i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, <8 x i32> <i32 0, i32 13, i32 12, i32 13, i32 3, i32 15, i32 1, i32 4>
ret <8 x i32> %res
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
index e27a77ed2293d..440e793f9d521 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -33,8 +33,9 @@ define void @shuffle_v64i8_to_v32i8(ptr %L, ptr %S) nounwind {
; AVX512VL-FAST-ALL: # %bb.0:
; AVX512VL-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-FAST-ALL-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512VL-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
-; AVX512VL-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
+; AVX512VL-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
+; AVX512VL-FAST-ALL-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX512VL-FAST-ALL-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,2,5,7]
; AVX512VL-FAST-ALL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
; AVX512VL-FAST-ALL-NEXT: vmovdqa %ymm2, (%rsi)
@@ -341,8 +342,9 @@ define <32 x i8> @trunc_shuffle_v32i16_v32i8_ofs1(<32 x i16> %a0) {
; AVX512VL-FAST-ALL-LABEL: trunc_shuffle_v32i16_v32i8_ofs1:
; AVX512VL-FAST-ALL: # %bb.0:
; AVX512VL-FAST-ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
-; AVX512VL-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
+; AVX512VL-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
+; AVX512VL-FAST-ALL-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX512VL-FAST-ALL-NEXT: vpshufb %ymm2, %ymm0, %ymm2
; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,2,5,7]
; AVX512VL-FAST-ALL-NEXT: vpermi2q %ymm1, %ymm2, %ymm0
; AVX512VL-FAST-ALL-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
index 9b19ec15c6f55..001852163f71a 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
@@ -476,19 +476,19 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vpinsrw $2, %eax, %xmm4, %xmm1
; AVX512-NEXT: vmovd %xmm2, %eax
; AVX512-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm4
-; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3]
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm5
+; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm5[0],xmm6[1],xmm5[2,3]
; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2],xmm5[3]
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
; AVX512-NEXT: vmovq %xmm3, (%rsi)
; AVX512-NEXT: vmovq %xmm1, (%rdx)
-; AVX512-NEXT: vmovq %xmm5, (%rcx)
+; AVX512-NEXT: vmovq %xmm4, (%rcx)
; AVX512-NEXT: vmovq %xmm6, (%r8)
; AVX512-NEXT: vmovq %xmm0, (%r9)
; AVX512-NEXT: retq
@@ -507,19 +507,19 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
; AVX512-FCP-NEXT: vmovd %xmm2, %eax
; AVX512-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
-; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm4
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3]
+; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm5
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm5[0],xmm6[1],xmm5[2,3]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2],xmm5[3]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi)
; AVX512-FCP-NEXT: vmovq %xmm1, (%rdx)
-; AVX512-FCP-NEXT: vmovq %xmm5, (%rcx)
+; AVX512-FCP-NEXT: vmovq %xmm4, (%rcx)
; AVX512-FCP-NEXT: vmovq %xmm6, (%r8)
; AVX512-FCP-NEXT: vmovq %xmm0, (%r9)
; AVX512-FCP-NEXT: retq
@@ -542,19 +542,19 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: vpinsrw $2, %eax, %xmm4, %xmm1
; AVX512DQ-NEXT: vmovd %xmm2, %eax
; AVX512DQ-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
-; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm4
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3]
+; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm5
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm5[0],xmm6[1],xmm5[2,3]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2],xmm5[3]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
; AVX512DQ-NEXT: vmovq %xmm3, (%rsi)
; AVX512DQ-NEXT: vmovq %xmm1, (%rdx)
-; AVX512DQ-NEXT: vmovq %xmm5, (%rcx)
+; AVX512DQ-NEXT: vmovq %xmm4, (%rcx)
; AVX512DQ-NEXT: vmovq %xmm6, (%r8)
; AVX512DQ-NEXT: vmovq %xmm0, (%r9)
; AVX512DQ-NEXT: retq
@@ -573,19 +573,19 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
; AVX512DQ-FCP-NEXT: vmovd %xmm2, %eax
; AVX512DQ-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm4
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm5
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm5[0],xmm6[1],xmm5[2,3]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2],xmm5[3]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi)
; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rcx)
; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r8)
; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r9)
; AVX512DQ-FCP-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll
index 81fe19c4d8b56..2a2d471096231 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll
@@ -719,17 +719,19 @@ define void @load_i8_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
;
; AVX512-FCP-LABEL: load_i8_stride2_vf32:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm3
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,2,5,7]
-; AVX512-FCP-NEXT: vpermt2q %ymm2, %ymm4, %ymm3
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpermt2q %ymm1, %ymm4, %ymm0
-; AVX512-FCP-NEXT: vmovdqa %ymm3, (%rsi)
-; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rdx)
+; AVX512-FCP-NEXT: vpermt2q %ymm3, %ymm4, %ymm0
+; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vpermt2q %ymm2, %ymm4, %ymm1
+; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rdx)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -754,17 +756,19 @@ define void @load_i8_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
;
; AVX512DQ-FCP-LABEL: load_i8_stride2_vf32:
; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,2,5,7]
-; AVX512DQ-FCP-NEXT: vpermt2q %ymm2, %ymm4, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpermt2q %ymm1, %ymm4, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rdx)
+; AVX512DQ-FCP-NEXT: vpermt2q %ymm3, %ymm4, %ymm0
+; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vpermt2q %ymm2, %ymm4, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rdx)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll
index 9c9dca82f60ca..a72433c7cccb2 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll
@@ -4242,77 +4242,76 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm5
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3]
-; AVX512-FCP-NEXT: vpermd %zmm1, %zmm26, %zmm25
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3]
+; AVX512-FCP-NEXT: vpermd %zmm1, %zmm25, %zmm24
; AVX512-FCP-NEXT: movw $-30584, %r11w # imm = 0x8888
; AVX512-FCP-NEXT: kmovw %r11d, %k1
-; AVX512-FCP-NEXT: vpermd %zmm0, %zmm26, %zmm25 {%k1}
+; AVX512-FCP-NEXT: vpermd %zmm0, %zmm25, %zmm24 {%k1}
; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm7
; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm8
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm9
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm10
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,1,1,1,1,0,0,2,0,3,0,3,0,0,0]
-; AVX512-FCP-NEXT: vpermd %zmm1, %zmm6, %zmm29
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0]
+; AVX512-FCP-NEXT: vpermd %zmm1, %zmm26, %zmm29
; AVX512-FCP-NEXT: movw $8738, %r11w # imm = 0x2222
; AVX512-FCP-NEXT: kmovw %r11d, %k2
-; AVX512-FCP-NEXT: vpermd %zmm0, %zmm24, %zmm29 {%k2}
+; AVX512-FCP-NEXT: vpermd %zmm0, %zmm26, %zmm29 {%k2}
; AVX512-FCP-NEXT: vmovdqa 32(%r10), %ymm2
-; AVX512-FCP-NEXT: vmovdqa 32(%rax), %ymm3
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm12
-; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm13
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11]
+; AVX512-FCP-NEXT: vmovdqa 32(%rax), %ymm1
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm11
+; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm12
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[1],ymm11[1],ymm12[2],ymm11[2],ymm12[3],ymm11[3],ymm12[8],ymm11[8],ymm12[9],ymm11[9],ymm12[10],ymm11[10],ymm12[11],ymm11[11]
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7]
-; AVX512-FCP-NEXT: vpermd %zmm11, %zmm16, %zmm27
+; AVX512-FCP-NEXT: vpermd %zmm13, %zmm16, %zmm27
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7]
; AVX512-FCP-NEXT: vpermd %zmm6, %zmm17, %zmm27 {%k1}
-; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm14
-; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm15
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[8],ymm15[8],ymm1[9],ymm15[9],ymm1[10],ymm15[10],ymm1[11],ymm15[11]
+; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm13
+; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm14
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm15
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm15[0],ymm14[0],ymm15[1],ymm14[1],ymm15[2],ymm14[2],ymm15[3],ymm14[3],ymm15[8],ymm14[8],ymm15[9],ymm14[9],ymm15[10],ymm14[10],ymm15[11],ymm14[11]
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7]
; AVX512-FCP-NEXT: vpermd %zmm6, %zmm18, %zmm30
; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm0
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm0[0],ymm14[0],ymm0[1],ymm14[1],ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[8],ymm14[8],ymm0[9],ymm14[9],ymm0[10],ymm14[10],ymm0[11],ymm14[11]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm0[0],ymm13[0],ymm0[1],ymm13[1],ymm0[2],ymm13[2],ymm0[3],ymm13[3],ymm0[8],ymm13[8],ymm0[9],ymm13[9],ymm0[10],ymm13[10],ymm0[11],ymm13[11]
; AVX512-FCP-NEXT: vpermd %zmm6, %zmm19, %zmm30 {%k2}
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
-; AVX512-FCP-NEXT: vmovdqa 32(%r10), %xmm11
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15]
-; AVX512-FCP-NEXT: vmovdqa 32(%rax), %xmm3
-; AVX512-FCP-NEXT: vpermd %zmm6, %zmm16, %zmm28
-; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm6
-; AVX512-FCP-NEXT: vpermd %zmm2, %zmm17, %zmm28 {%k1}
-; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm2
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm14[4],ymm0[5],ymm14[5],ymm0[6],ymm14[6],ymm0[7],ymm14[7],ymm0[12],ymm14[12],ymm0[13],ymm14[13],ymm0[14],ymm14[14],ymm0[15],ymm14[15]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; AVX512-FCP-NEXT: vmovdqa 32(%r10), %xmm2
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm11 = ymm12[4],ymm11[4],ymm12[5],ymm11[5],ymm12[6],ymm11[6],ymm12[7],ymm11[7],ymm12[12],ymm11[12],ymm12[13],ymm11[13],ymm12[14],ymm11[14],ymm12[15],ymm11[15]
+; AVX512-FCP-NEXT: vmovdqa 32(%rax), %xmm6
+; AVX512-FCP-NEXT: vpermd %zmm11, %zmm16, %zmm28
+; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm11
+; AVX512-FCP-NEXT: vpermd %zmm1, %zmm17, %zmm28 {%k1}
+; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm1
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm13[4],ymm0[5],ymm13[5],ymm0[6],ymm13[6],ymm0[7],ymm13[7],ymm0[12],ymm13[12],ymm0[13],ymm13[13],ymm0[14],ymm13[14],ymm0[15],ymm13[15]
; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm13
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm15[4],ymm1[5],ymm15[5],ymm1[6],ymm15[6],ymm1[7],ymm15[7],ymm1[12],ymm15[12],ymm1[13],ymm15[13],ymm1[14],ymm15[14],ymm1[15],ymm15[15]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm15[4],ymm14[4],ymm15[5],ymm14[5],ymm15[6],ymm14[6],ymm15[7],ymm14[7],ymm15[12],ymm14[12],ymm15[13],ymm14[13],ymm15[14],ymm14[14],ymm15[15],ymm14[15]
; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm15
-; AVX512-FCP-NEXT: vpermd %zmm1, %zmm18, %zmm31
+; AVX512-FCP-NEXT: vpermd %zmm12, %zmm18, %zmm31
; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm12
; AVX512-FCP-NEXT: vpermd %zmm0, %zmm19, %zmm31 {%k2}
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3]
-; AVX512-FCP-NEXT: vpermd %zmm0, %zmm1, %zmm21
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3]
-; AVX512-FCP-NEXT: vpermd %zmm0, %zmm26, %zmm21 {%k1}
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3]
+; AVX512-FCP-NEXT: vpermd %zmm0, %zmm20, %zmm21
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
+; AVX512-FCP-NEXT: vpermd %zmm0, %zmm25, %zmm21 {%k1}
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0]
-; AVX512-FCP-NEXT: vpermd %zmm0, %zmm1, %zmm20
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0]
+; AVX512-FCP-NEXT: vpermd %zmm0, %zmm20, %zmm20
; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm0
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
-; AVX512-FCP-NEXT: vpermd %zmm1, %zmm24, %zmm20 {%k2}
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7]
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
-; AVX512-FCP-NEXT: vpermd %zmm2, %zmm26, %zmm23
-; AVX512-FCP-NEXT: vpermd %zmm1, %zmm26, %zmm23 {%k1}
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
+; AVX512-FCP-NEXT: vpermd %zmm3, %zmm26, %zmm20 {%k2}
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7]
+; AVX512-FCP-NEXT: vpermd %zmm1, %zmm25, %zmm23
+; AVX512-FCP-NEXT: vpermd %zmm2, %zmm25, %zmm23 {%k1}
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7]
-; AVX512-FCP-NEXT: vpermd %zmm1, %zmm24, %zmm22
-; AVX512-FCP-NEXT: vpermd %zmm0, %zmm24, %zmm22 {%k2}
+; AVX512-FCP-NEXT: vpermd %zmm1, %zmm26, %zmm22
+; AVX512-FCP-NEXT: vpermd %zmm0, %zmm26, %zmm22 {%k2}
; AVX512-FCP-NEXT: vmovdqa (%r10), %ymm0
; AVX512-FCP-NEXT: vmovdqa (%rax), %ymm1
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
@@ -4342,15 +4341,15 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512-FCP-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm3 # 16-byte Folded Reload
; AVX512-FCP-NEXT: # xmm3 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3]
-; AVX512-FCP-NEXT: vpermd %zmm3, %zmm26, %zmm3
-; AVX512-FCP-NEXT: vpermd %zmm0, %zmm26, %zmm3 {%k1}
+; AVX512-FCP-NEXT: vpermd %zmm3, %zmm25, %zmm3
+; AVX512-FCP-NEXT: vpermd %zmm0, %zmm25, %zmm3 {%k1}
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX512-FCP-NEXT: vpermd %zmm4, %zmm24, %zmm4
-; AVX512-FCP-NEXT: vpermd %zmm0, %zmm24, %zmm4 {%k2}
+; AVX512-FCP-NEXT: vpermd %zmm4, %zmm26, %zmm4
+; AVX512-FCP-NEXT: vpermd %zmm0, %zmm26, %zmm4 {%k2}
; AVX512-FCP-NEXT: movb $-86, %al
; AVX512-FCP-NEXT: kmovw %eax, %k1
-; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1}
+; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm29 {%k1}
; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm30 {%k1}
; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm31 {%k1}
; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm20 {%k1}
@@ -4523,77 +4522,76 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm5
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3]
-; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm26, %zmm25
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3]
+; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm25, %zmm24
; AVX512DQ-FCP-NEXT: movw $-30584, %r11w # imm = 0x8888
; AVX512DQ-FCP-NEXT: kmovw %r11d, %k1
-; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm26, %zmm25 {%k1}
+; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm25, %zmm24 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm7
; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm8
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm9
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm10
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,1,1,1,1,0,0,2,0,3,0,3,0,0,0]
-; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm6, %zmm29
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0]
+; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm26, %zmm29
; AVX512DQ-FCP-NEXT: movw $8738, %r11w # imm = 0x2222
; AVX512DQ-FCP-NEXT: kmovw %r11d, %k2
-; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm24, %zmm29 {%k2}
+; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm26, %zmm29 {%k2}
; AVX512DQ-FCP-NEXT: vmovdqa 32(%r10), %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %ymm3
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm12
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm13
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11]
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %ymm1
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm11
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm12
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[1],ymm11[1],ymm12[2],ymm11[2],ymm12[3],ymm11[3],ymm12[8],ymm11[8],ymm12[9],ymm11[9],ymm12[10],ymm11[10],ymm12[11],ymm11[11]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7]
-; AVX512DQ-FCP-NEXT: vpermd %zmm11, %zmm16, %zmm27
+; AVX512DQ-FCP-NEXT: vpermd %zmm13, %zmm16, %zmm27
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7]
; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm17, %zmm27 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm14
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm15
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[8],ymm15[8],ymm1[9],ymm15[9],ymm1[10],ymm15[10],ymm1[11],ymm15[11]
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm13
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm14
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm15
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm15[0],ymm14[0],ymm15[1],ymm14[1],ymm15[2],ymm14[2],ymm15[3],ymm14[3],ymm15[8],ymm14[8],ymm15[9],ymm14[9],ymm15[10],ymm14[10],ymm15[11],ymm14[11]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7]
; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm18, %zmm30
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm0
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm0[0],ymm14[0],ymm0[1],ymm14[1],ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[8],ymm14[8],ymm0[9],ymm14[9],ymm0[10],ymm14[10],ymm0[11],ymm14[11]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm0[0],ymm13[0],ymm0[1],ymm13[1],ymm0[2],ymm13[2],ymm0[3],ymm13[3],ymm0[8],ymm13[8],ymm0[9],ymm13[9],ymm0[10],ymm13[10],ymm0[11],ymm13[11]
; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm19, %zmm30 {%k2}
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%r10), %xmm11
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15]
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %xmm3
-; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm16, %zmm28
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm6
-; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm17, %zmm28 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm2
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm14[4],ymm0[5],ymm14[5],ymm0[6],ymm14[6],ymm0[7],ymm14[7],ymm0[12],ymm14[12],ymm0[13],ymm14[13],ymm0[14],ymm14[14],ymm0[15],ymm14[15]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%r10), %xmm2
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm11 = ymm12[4],ymm11[4],ymm12[5],ymm11[5],ymm12[6],ymm11[6],ymm12[7],ymm11[7],ymm12[12],ymm11[12],ymm12[13],ymm11[13],ymm12[14],ymm11[14],ymm12[15],ymm11[15]
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %xmm6
+; AVX512DQ-FCP-NEXT: vpermd %zmm11, %zmm16, %zmm28
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm11
+; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm17, %zmm28 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm1
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm13[4],ymm0[5],ymm13[5],ymm0[6],ymm13[6],ymm0[7],ymm13[7],ymm0[12],ymm13[12],ymm0[13],ymm13[13],ymm0[14],ymm13[14],ymm0[15],ymm13[15]
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm13
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm15[4],ymm1[5],ymm15[5],ymm1[6],ymm15[6],ymm1[7],ymm15[7],ymm1[12],ymm15[12],ymm1[13],ymm15[13],ymm1[14],ymm15[14],ymm1[15],ymm15[15]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm15[4],ymm14[4],ymm15[5],ymm14[5],ymm15[6],ymm14[6],ymm15[7],ymm14[7],ymm15[12],ymm14[12],ymm15[13],ymm14[13],ymm15[14],ymm14[14],ymm15[15],ymm14[15]
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm15
-; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm18, %zmm31
+; AVX512DQ-FCP-NEXT: vpermd %zmm12, %zmm18, %zmm31
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm12
; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm19, %zmm31 {%k2}
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3]
-; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm1, %zmm21
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3]
-; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm26, %zmm21 {%k1}
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3]
+; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm20, %zmm21
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
+; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm25, %zmm21 {%k1}
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0]
-; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm1, %zmm20
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0]
+; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm20, %zmm20
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm0
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
-; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm24, %zmm20 {%k2}
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7]
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
-; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm26, %zmm23
-; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm26, %zmm23 {%k1}
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
+; AVX512DQ-FCP-NEXT: vpermd %zmm3, %zmm26, %zmm20 {%k2}
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7]
+; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm25, %zmm23
+; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm25, %zmm23 {%k1}
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7]
-; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm24, %zmm22
-; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm24, %zmm22 {%k2}
+; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm26, %zmm22
+; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm26, %zmm22 {%k2}
; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %ymm0
; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %ymm1
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
@@ -4623,15 +4621,15 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-FCP-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm3 # 16-byte Folded Reload
; AVX512DQ-FCP-NEXT: # xmm3 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3]
-; AVX512DQ-FCP-NEXT: vpermd %zmm3, %zmm26, %zmm3
-; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm26, %zmm3 {%k1}
+; AVX512DQ-FCP-NEXT: vpermd %zmm3, %zmm25, %zmm3
+; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm25, %zmm3 {%k1}
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm24, %zmm4
-; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm24, %zmm4 {%k2}
+; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm26, %zmm4
+; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm26, %zmm4 {%k2}
; AVX512DQ-FCP-NEXT: movb $-86, %al
; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm29 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm30 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm31 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm20 {%k1}
@@ -8080,18 +8078,17 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1
; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,1,1,1,1,0,0,2,0,3,0,3,0,0,0]
-; AVX512-FCP-NEXT: vpermd %zmm1, %zmm2, %zmm17
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0]
+; AVX512-FCP-NEXT: vpermd %zmm1, %zmm7, %zmm17
; AVX512-FCP-NEXT: movw $8738, %r11w # imm = 0x2222
; AVX512-FCP-NEXT: kmovw %r11d, %k1
; AVX512-FCP-NEXT: vpermd %zmm0, %zmm7, %zmm17 {%k1}
; AVX512-FCP-NEXT: vmovdqa 96(%r10), %ymm5
; AVX512-FCP-NEXT: vmovdqa 96(%rax), %ymm4
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11]
-; AVX512-FCP-NEXT: vmovdqa 96(%r9), %ymm13
+; AVX512-FCP-NEXT: vmovdqa 96(%r9), %ymm12
; AVX512-FCP-NEXT: vmovdqa 96(%r8), %ymm15
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[8],ymm13[8],ymm15[9],ymm13[9],ymm15[10],ymm13[10],ymm15[11],ymm13[11]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm15[0],ymm12[0],ymm15[1],ymm12[1],ymm15[2],ymm12[2],ymm15[3],ymm12[3],ymm15[8],ymm12[8],ymm15[9],ymm12[9],ymm15[10],ymm12[10],ymm15[11],ymm12[11]
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7]
; AVX512-FCP-NEXT: vpermd %zmm11, %zmm23, %zmm0
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7]
@@ -8108,14 +8105,14 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11]
; AVX512-FCP-NEXT: vpermd %zmm9, %zmm21, %zmm10 {%k1}
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15]
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm15[4],ymm13[4],ymm15[5],ymm13[5],ymm15[6],ymm13[6],ymm15[7],ymm13[7],ymm15[12],ymm13[12],ymm15[13],ymm13[13],ymm15[14],ymm13[14],ymm15[15],ymm13[15]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm15[4],ymm12[4],ymm15[5],ymm12[5],ymm15[6],ymm12[6],ymm15[7],ymm12[7],ymm15[12],ymm12[12],ymm15[13],ymm12[13],ymm15[14],ymm12[14],ymm15[15],ymm12[15]
; AVX512-FCP-NEXT: vpermd %zmm5, %zmm23, %zmm5
; AVX512-FCP-NEXT: vpermd %zmm4, %zmm22, %zmm5 {%k2}
; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
-; AVX512-FCP-NEXT: vpermd %zmm1, %zmm20, %zmm13
-; AVX512-FCP-NEXT: vpermd %zmm0, %zmm21, %zmm13 {%k1}
+; AVX512-FCP-NEXT: vpermd %zmm1, %zmm20, %zmm12
+; AVX512-FCP-NEXT: vpermd %zmm0, %zmm21, %zmm12 {%k1}
; AVX512-FCP-NEXT: vmovdqa 64(%r10), %ymm0
; AVX512-FCP-NEXT: vmovdqa 64(%rax), %ymm1
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
@@ -8131,8 +8128,8 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm5[0],ymm9[1],ymm5[1],ymm9[2],ymm5[2],ymm9[3],ymm5[3],ymm9[8],ymm5[8],ymm9[9],ymm5[9],ymm9[10],ymm5[10],ymm9[11],ymm5[11]
; AVX512-FCP-NEXT: vpermd %zmm8, %zmm20, %zmm16
; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %ymm8
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm8[0],ymm2[0],ymm8[1],ymm2[1],ymm8[2],ymm2[2],ymm8[3],ymm2[3],ymm8[8],ymm2[8],ymm8[9],ymm2[9],ymm8[10],ymm2[10],ymm8[11],ymm2[11]
-; AVX512-FCP-NEXT: vpermd %zmm12, %zmm21, %zmm16 {%k1}
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm8[0],ymm2[0],ymm8[1],ymm2[1],ymm8[2],ymm2[2],ymm8[3],ymm2[3],ymm8[8],ymm2[8],ymm8[9],ymm2[9],ymm8[10],ymm2[10],ymm8[11],ymm2[11]
+; AVX512-FCP-NEXT: vpermd %zmm13, %zmm21, %zmm16 {%k1}
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
; AVX512-FCP-NEXT: vpermd %zmm1, %zmm23, %zmm19
@@ -8155,8 +8152,8 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512-FCP-NEXT: vpermd %zmm8, %zmm20, %zmm24
; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm8
; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm9
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11]
-; AVX512-FCP-NEXT: vpermd %zmm12, %zmm21, %zmm24 {%k1}
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11]
+; AVX512-FCP-NEXT: vpermd %zmm13, %zmm21, %zmm24 {%k1}
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
; AVX512-FCP-NEXT: vpermd %zmm1, %zmm23, %zmm27
@@ -8241,8 +8238,8 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512-FCP-NEXT: vpermd %zmm4, %zmm5, %zmm8
; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %xmm4
; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %xmm5
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512-FCP-NEXT: vpermd %zmm12, %zmm7, %zmm8 {%k1}
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512-FCP-NEXT: vpermd %zmm13, %zmm7, %zmm8 {%k1}
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; AVX512-FCP-NEXT: vpermd %zmm2, %zmm7, %zmm2
@@ -8251,22 +8248,22 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm4
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX512-FCP-NEXT: vpermd %zmm5, %zmm7, %zmm5
-; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %xmm12
+; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %xmm13
; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm14
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
; AVX512-FCP-NEXT: vpermd %zmm11, %zmm7, %zmm5 {%k1}
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; AVX512-FCP-NEXT: vpermd %zmm3, %zmm7, %zmm3
; AVX512-FCP-NEXT: vpermd %zmm11, %zmm7, %zmm3 {%k1}
; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm4
; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm11
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3]
-; AVX512-FCP-NEXT: vpermd %zmm12, %zmm7, %zmm12
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3]
+; AVX512-FCP-NEXT: vpermd %zmm13, %zmm7, %zmm13
; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm14
; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm0
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; AVX512-FCP-NEXT: vpermd %zmm15, %zmm7, %zmm12 {%k1}
+; AVX512-FCP-NEXT: vpermd %zmm15, %zmm7, %zmm13 {%k1}
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
; AVX512-FCP-NEXT: vpermd %zmm4, %zmm7, %zmm4
@@ -8286,7 +8283,7 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1}
+; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1}
; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm8 {%k1}
; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm2 {%k1}
; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
@@ -8296,7 +8293,7 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1}
; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm24 {%k1}
; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm26 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 {%k1}
+; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 {%k1}
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm22 {%k1}
; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm20 {%k1}
@@ -8306,7 +8303,7 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 192(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 128(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 320(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 256(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 256(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 448(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 384(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 576(%rax)
@@ -8315,7 +8312,7 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 640(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 832(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 768(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 960(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 960(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 896(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 64(%rax)
; AVX512-FCP-NEXT: addq $264, %rsp # imm = 0x108
@@ -8627,18 +8624,17 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,1,1,1,1,0,0,2,0,3,0,3,0,0,0]
-; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm2, %zmm17
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0]
+; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm7, %zmm17
; AVX512DQ-FCP-NEXT: movw $8738, %r11w # imm = 0x2222
; AVX512DQ-FCP-NEXT: kmovw %r11d, %k1
; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm7, %zmm17 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa 96(%r10), %ymm5
; AVX512DQ-FCP-NEXT: vmovdqa 96(%rax), %ymm4
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11]
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %ymm13
+; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %ymm12
; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %ymm15
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[8],ymm13[8],ymm15[9],ymm13[9],ymm15[10],ymm13[10],ymm15[11],ymm13[11]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm15[0],ymm12[0],ymm15[1],ymm12[1],ymm15[2],ymm12[2],ymm15[3],ymm12[3],ymm15[8],ymm12[8],ymm15[9],ymm12[9],ymm15[10],ymm12[10],ymm15[11],ymm12[11]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7]
; AVX512DQ-FCP-NEXT: vpermd %zmm11, %zmm23, %zmm0
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7]
@@ -8655,14 +8651,14 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11]
; AVX512DQ-FCP-NEXT: vpermd %zmm9, %zmm21, %zmm10 {%k1}
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15]
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm15[4],ymm13[4],ymm15[5],ymm13[5],ymm15[6],ymm13[6],ymm15[7],ymm13[7],ymm15[12],ymm13[12],ymm15[13],ymm13[13],ymm15[14],ymm13[14],ymm15[15],ymm13[15]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm15[4],ymm12[4],ymm15[5],ymm12[5],ymm15[6],ymm12[6],ymm15[7],ymm12[7],ymm15[12],ymm12[12],ymm15[13],ymm12[13],ymm15[14],ymm12[14],ymm15[15],ymm12[15]
; AVX512DQ-FCP-NEXT: vpermd %zmm5, %zmm23, %zmm5
; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm22, %zmm5 {%k2}
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
-; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm20, %zmm13
-; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm21, %zmm13 {%k1}
+; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm20, %zmm12
+; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm21, %zmm12 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa 64(%r10), %ymm0
; AVX512DQ-FCP-NEXT: vmovdqa 64(%rax), %ymm1
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
@@ -8678,8 +8674,8 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm5[0],ymm9[1],ymm5[1],ymm9[2],ymm5[2],ymm9[3],ymm5[3],ymm9[8],ymm5[8],ymm9[9],ymm5[9],ymm9[10],ymm5[10],ymm9[11],ymm5[11]
; AVX512DQ-FCP-NEXT: vpermd %zmm8, %zmm20, %zmm16
; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %ymm8
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm8[0],ymm2[0],ymm8[1],ymm2[1],ymm8[2],ymm2[2],ymm8[3],ymm2[3],ymm8[8],ymm2[8],ymm8[9],ymm2[9],ymm8[10],ymm2[10],ymm8[11],ymm2[11]
-; AVX512DQ-FCP-NEXT: vpermd %zmm12, %zmm21, %zmm16 {%k1}
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm8[0],ymm2[0],ymm8[1],ymm2[1],ymm8[2],ymm2[2],ymm8[3],ymm2[3],ymm8[8],ymm2[8],ymm8[9],ymm2[9],ymm8[10],ymm2[10],ymm8[11],ymm2[11]
+; AVX512DQ-FCP-NEXT: vpermd %zmm13, %zmm21, %zmm16 {%k1}
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm23, %zmm19
@@ -8702,8 +8698,8 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-FCP-NEXT: vpermd %zmm8, %zmm20, %zmm24
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm8
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm9
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11]
-; AVX512DQ-FCP-NEXT: vpermd %zmm12, %zmm21, %zmm24 {%k1}
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11]
+; AVX512DQ-FCP-NEXT: vpermd %zmm13, %zmm21, %zmm24 {%k1}
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm23, %zmm27
@@ -8788,8 +8784,8 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm5, %zmm8
; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %xmm4
; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %xmm5
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512DQ-FCP-NEXT: vpermd %zmm12, %zmm7, %zmm8 {%k1}
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512DQ-FCP-NEXT: vpermd %zmm13, %zmm7, %zmm8 {%k1}
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm7, %zmm2
@@ -8798,22 +8794,22 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm4
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX512DQ-FCP-NEXT: vpermd %zmm5, %zmm7, %zmm5
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %xmm12
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %xmm13
; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %xmm14
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
; AVX512DQ-FCP-NEXT: vpermd %zmm11, %zmm7, %zmm5 {%k1}
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; AVX512DQ-FCP-NEXT: vpermd %zmm3, %zmm7, %zmm3
; AVX512DQ-FCP-NEXT: vpermd %zmm11, %zmm7, %zmm3 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm4
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm11
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3]
-; AVX512DQ-FCP-NEXT: vpermd %zmm12, %zmm7, %zmm12
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3]
+; AVX512DQ-FCP-NEXT: vpermd %zmm13, %zmm7, %zmm13
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm14
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm0
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; AVX512DQ-FCP-NEXT: vpermd %zmm15, %zmm7, %zmm12 {%k1}
+; AVX512DQ-FCP-NEXT: vpermd %zmm15, %zmm7, %zmm13 {%k1}
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm7, %zmm4
@@ -8833,7 +8829,7 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm8 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm2 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
@@ -8843,7 +8839,7 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm24 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm26 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm22 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm20 {%k1}
@@ -8853,7 +8849,7 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 192(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 128(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 320(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 256(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 256(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 448(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 384(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 576(%rax)
@@ -8862,7 +8858,7 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 640(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 832(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 768(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 960(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 960(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 896(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 64(%rax)
; AVX512DQ-FCP-NEXT: addq $264, %rsp # imm = 0x108
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
index dbbfaab9ea26a..6a9e10a5ef122 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -4351,8 +4351,8 @@ define <16 x i16> @shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_1
;
; AVX2-SLOW-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,5,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23]
; AVX2-SLOW-NEXT: retq
;
@@ -4365,8 +4365,8 @@ define <16 x i16> @shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_1
;
; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11:
; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,5,2,3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23]
; AVX2-FAST-PERLANE-NEXT: retq
;
@@ -4386,8 +4386,8 @@ define <16 x i16> @shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_1
;
; XOPAVX2-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,5,2,3,4,5,6,7]
-; XOPAVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
+; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 6, i32 6, i32 7, i32 5, i32 1, i32 6, i32 4, i32 11, i32 14, i32 14, i32 15, i32 13, i32 9, i32 14, i32 12, i32 11>
@@ -5097,8 +5097,8 @@ define <16 x i16> @shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_1
;
; AVX2-SLOW-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,6,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27]
; AVX2-SLOW-NEXT: retq
;
@@ -5111,8 +5111,8 @@ define <16 x i16> @shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_1
;
; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13:
; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,6,3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27]
; AVX2-FAST-PERLANE-NEXT: retq
;
@@ -5132,8 +5132,8 @@ define <16 x i16> @shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_1
;
; XOPAVX2-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,6,3,4,5,6,7]
-; XOPAVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
+; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7]
; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 7, i32 1, i32 0, i32 2, i32 7, i32 3, i32 13, i32 11, i32 15, i32 9, i32 8, i32 10, i32 15, i32 11, i32 13>
@@ -6152,9 +6152,9 @@ define <16 x i16> @shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_1
;
; AVX2-LABEL: shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_12:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15]
; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
; AVX2-NEXT: retq
;
@@ -6176,9 +6176,9 @@ define <16 x i16> @shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_1
;
; XOPAVX2-LABEL: shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_12:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15]
+; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
+; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15]
; XOPAVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 12, i32 29, i32 30, i32 31, i32 8, i32 9, i32 10, i32 11, i32 12>
@@ -6334,9 +6334,9 @@ define <16 x i16> @shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_1
;
; AVX2-LABEL: shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_10:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15]
; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
; AVX2-NEXT: retq
;
@@ -6359,9 +6359,9 @@ define <16 x i16> @shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_1
;
; XOPAVX2-LABEL: shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_10:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15]
+; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
+; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15]
; XOPAVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 10, i32 27, i32 28, i32 29, i32 30, i32 31, i32 8, i32 9, i32 10>
@@ -6518,7 +6518,7 @@ define <16 x i16> @shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_2
; AVX2-LABEL: shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_26:
; AVX2: # %bb.0:
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15]
; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
; AVX2-NEXT: retq
@@ -6542,7 +6542,7 @@ define <16 x i16> @shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_2
; XOPAVX2-LABEL: shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_26:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15]
; XOPAVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
; XOPAVX2-NEXT: retq
@@ -6596,7 +6596,7 @@ define <16 x i16> @shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_2
; AVX2-LABEL: shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_28:
; AVX2: # %bb.0:
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15]
; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
; AVX2-NEXT: retq
@@ -6621,7 +6621,7 @@ define <16 x i16> @shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_2
; XOPAVX2-LABEL: shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_28:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15]
; XOPAVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
; XOPAVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
index ea0e3b3a2b9aa..02706b9e1a684 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
@@ -4741,14 +4741,14 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in.
; AVX512BW-SLOW-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
; AVX512BW-SLOW: # %bb.0:
; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-SLOW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,0,15,0,1,0,1]
; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,0,15]
; AVX512BW-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1
+; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7]
; AVX512BW-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7]
; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
@@ -4758,8 +4758,8 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in.
; AVX512BW-FAST-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,0,15,0,1,0,1]
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,0,15]
; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7]