[llvm] 5e79ae6 - DAG: Fix vector_shuffle -> splat fold defining undef lanes (#123596)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 21 08:55:54 PST 2025
Author: Matt Arsenault
Date: 2025-01-21T23:55:50+07:00
New Revision: 5e79ae60a67726805fcc27081f67c41cbd8a1e4e
URL: https://github.com/llvm/llvm-project/commit/5e79ae60a67726805fcc27081f67c41cbd8a1e4e
DIFF: https://github.com/llvm/llvm-project/commit/5e79ae60a67726805fcc27081f67c41cbd8a1e4e.diff
LOG: DAG: Fix vector_shuffle -> splat fold defining undef lanes (#123596)
For vector_shuffle splats with undef lanes in the mask, the fold to
build_vector was filling those lanes with the splatted value. Keep every
lane whose mask element is undef as undef in the resulting build_vector.
This avoids AMDGPU test regressions in a future change.
test/CodeGen/X86/urem-seteq-illegal-types.ll looks worse
but I didn't investigate.
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll
llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll
llvm/test/CodeGen/PowerPC/vector-reduce-fadd.ll
llvm/test/CodeGen/WebAssembly/simd.ll
llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
llvm/test/CodeGen/X86/vec_smulo.ll
llvm/test/CodeGen/X86/vec_umulo.ll
llvm/test/CodeGen/X86/widen_shuffle-1.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 33f0c8b5555e8e..21d5e0a1b2953d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -26386,9 +26386,17 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
if (AllSame)
return N0;
- // Canonicalize any other splat as a build_vector.
+ // Canonicalize any other splat as a build_vector, but avoid defining any
+ // undefined elements in the mask.
SDValue Splatted = V->getOperand(SplatIndex);
SmallVector<SDValue, 8> Ops(NumElts, Splatted);
+ EVT EltVT = Splatted.getValueType();
+
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (SVN->getMaskElt(i) < 0)
+ Ops[i] = DAG.getUNDEF(EltVT);
+ }
+
SDValue NewBV = DAG.getBuildVector(V->getValueType(0), SDLoc(N), Ops);
// We may have jumped through bitcasts, so the type of the
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll
index b87c969c5bbdf0..1851a34d0e5600 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll
@@ -61,13 +61,10 @@ define void @v_shuffle_v4i64_v3i64__1_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4i64_v3i64__1_u_u_u:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:5]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
-; GFX900-NEXT: v_mov_b32_e32 v4, v2
-; GFX900-NEXT: v_mov_b32_e32 v5, v3
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
@@ -75,13 +72,10 @@ define void @v_shuffle_v4i64_v3i64__1_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i64_v3i64__1_u_u_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -89,13 +83,10 @@ define void @v_shuffle_v4i64_v3i64__1_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX940-LABEL: v_shuffle_v4i64_v3i64__1_u_u_u:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_mov_b32_e32 v4, v2
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -113,10 +104,9 @@ define void @v_shuffle_v4i64_v3i64__2_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ; def v[0:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v6, 0
-; GFX900-NEXT: v_mov_b32_e32 v2, v4
-; GFX900-NEXT: v_mov_b32_e32 v3, v5
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v4
+; GFX900-NEXT: v_mov_b32_e32 v1, v5
+; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -127,10 +117,9 @@ define void @v_shuffle_v4i64_v3i64__2_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -141,10 +130,9 @@ define void @v_shuffle_v4i64_v3i64__2_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v4
-; GFX940-NEXT: v_mov_b32_e32 v3, v5
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v0, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=v"()
@@ -168,13 +156,10 @@ define void @v_shuffle_v4i64_v3i64__4_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4i64_v3i64__4_u_u_u:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:5]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
-; GFX900-NEXT: v_mov_b32_e32 v4, v2
-; GFX900-NEXT: v_mov_b32_e32 v5, v3
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
@@ -182,13 +167,10 @@ define void @v_shuffle_v4i64_v3i64__4_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i64_v3i64__4_u_u_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -196,13 +178,10 @@ define void @v_shuffle_v4i64_v3i64__4_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX940-LABEL: v_shuffle_v4i64_v3i64__4_u_u_u:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_mov_b32_e32 v4, v2
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -221,10 +200,9 @@ define void @v_shuffle_v4i64_v3i64__5_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ; def v[0:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v6, 0
-; GFX900-NEXT: v_mov_b32_e32 v2, v4
-; GFX900-NEXT: v_mov_b32_e32 v3, v5
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v4
+; GFX900-NEXT: v_mov_b32_e32 v1, v5
+; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -235,10 +213,9 @@ define void @v_shuffle_v4i64_v3i64__5_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -249,10 +226,9 @@ define void @v_shuffle_v4i64_v3i64__5_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v4
-; GFX940-NEXT: v_mov_b32_e32 v3, v5
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v0, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=v"()
@@ -544,7 +520,6 @@ define void @v_shuffle_v4i64_v3i64__5_5_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: v_mov_b32_e32 v2, v4
; GFX900-NEXT: v_mov_b32_e32 v3, v5
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
@@ -558,7 +533,6 @@ define void @v_shuffle_v4i64_v3i64__5_5_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -572,7 +546,6 @@ define void @v_shuffle_v4i64_v3i64__5_5_u_u(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: v_mov_b32_e32 v2, v4
; GFX940-NEXT: v_mov_b32_e32 v3, v5
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -875,9 +848,12 @@ define void @v_shuffle_v4i64_v3i64__5_5_5_u(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ; def v[0:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v6, 0
+; GFX900-NEXT: v_mov_b32_e32 v0, v4
+; GFX900-NEXT: v_mov_b32_e32 v1, v5
+; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16
+; GFX900-NEXT: s_nop 0
; GFX900-NEXT: v_mov_b32_e32 v2, v4
; GFX900-NEXT: v_mov_b32_e32 v3, v5
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
@@ -889,9 +865,12 @@ define void @v_shuffle_v4i64_v3i64__5_5_5_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16
+; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -903,9 +882,12 @@ define void @v_shuffle_v4i64_v3i64__5_5_5_u(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v6, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1
+; GFX940-NEXT: s_nop 1
; GFX940-NEXT: v_mov_b32_e32 v2, v4
; GFX940-NEXT: v_mov_b32_e32 v3, v5
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -2449,7 +2431,7 @@ define void @v_shuffle_v4i64_v3i64__u_1_1_1(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: v_mov_b32_e32 v4, v2
; GFX900-NEXT: v_mov_b32_e32 v5, v3
; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2463,7 +2445,7 @@ define void @v_shuffle_v4i64_v3i64__u_1_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: v_mov_b32_e32 v4, v2
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2477,7 +2459,7 @@ define void @v_shuffle_v4i64_v3i64__u_1_1_1(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: v_mov_b32_e32 v4, v2
; GFX940-NEXT: v_mov_b32_e32 v5, v3
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=v"()
@@ -2650,7 +2632,7 @@ define void @v_shuffle_v4i64_v3i64__3_1_1_1(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: v_mov_b32_e32 v4, v2
; GFX900-NEXT: v_mov_b32_e32 v5, v3
; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2664,7 +2646,7 @@ define void @v_shuffle_v4i64_v3i64__3_1_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: v_mov_b32_e32 v4, v2
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2678,7 +2660,7 @@ define void @v_shuffle_v4i64_v3i64__3_1_1_1(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: v_mov_b32_e32 v4, v2
; GFX940-NEXT: v_mov_b32_e32 v5, v3
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=v"()
@@ -4691,13 +4673,10 @@ define void @v_shuffle_v4i64_v3i64__1_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4i64_v3i64__1_3_3_3:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:5]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
-; GFX900-NEXT: v_mov_b32_e32 v4, v2
-; GFX900-NEXT: v_mov_b32_e32 v5, v3
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
@@ -4705,13 +4684,10 @@ define void @v_shuffle_v4i64_v3i64__1_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i64_v3i64__1_3_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -4719,13 +4695,10 @@ define void @v_shuffle_v4i64_v3i64__1_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX940-LABEL: v_shuffle_v4i64_v3i64__1_3_3_3:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_mov_b32_e32 v4, v2
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -4743,10 +4716,9 @@ define void @v_shuffle_v4i64_v3i64__2_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ; def v[0:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v6, 0
-; GFX900-NEXT: v_mov_b32_e32 v2, v4
-; GFX900-NEXT: v_mov_b32_e32 v3, v5
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v4
+; GFX900-NEXT: v_mov_b32_e32 v1, v5
+; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -4757,10 +4729,9 @@ define void @v_shuffle_v4i64_v3i64__2_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4771,10 +4742,9 @@ define void @v_shuffle_v4i64_v3i64__2_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v4
-; GFX940-NEXT: v_mov_b32_e32 v3, v5
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v0, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=v"()
@@ -5609,7 +5579,7 @@ define void @v_shuffle_v4i64_v3i64__u_4_4_4(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: v_mov_b32_e32 v4, v2
; GFX900-NEXT: v_mov_b32_e32 v5, v3
; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -5623,7 +5593,7 @@ define void @v_shuffle_v4i64_v3i64__u_4_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: v_mov_b32_e32 v4, v2
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5637,7 +5607,7 @@ define void @v_shuffle_v4i64_v3i64__u_4_4_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: v_mov_b32_e32 v4, v2
; GFX940-NEXT: v_mov_b32_e32 v5, v3
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=v"()
@@ -7046,8 +7016,10 @@ define void @v_shuffle_v4i64_v3i64__5_u_5_5(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: v_mov_b32_e32 v2, v4
; GFX900-NEXT: v_mov_b32_e32 v3, v5
+; GFX900-NEXT: v_mov_b32_e32 v0, v4
+; GFX900-NEXT: v_mov_b32_e32 v1, v5
; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -7060,8 +7032,10 @@ define void @v_shuffle_v4i64_v3i64__5_u_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -7074,8 +7048,10 @@ define void @v_shuffle_v4i64_v3i64__5_u_5_5(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: v_mov_b32_e32 v2, v4
; GFX940-NEXT: v_mov_b32_e32 v3, v5
+; GFX940-NEXT: v_mov_b32_e32 v0, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=v"()
@@ -7408,13 +7384,14 @@ define void @v_shuffle_v4i64_v3i64__5_5_u_5(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_u_5:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:5]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
+; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
+; GFX900-NEXT: s_nop 0
; GFX900-NEXT: v_mov_b32_e32 v2, v4
; GFX900-NEXT: v_mov_b32_e32 v3, v5
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
@@ -7422,13 +7399,14 @@ define void @v_shuffle_v4i64_v3i64__5_5_u_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_u_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
+; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -7436,13 +7414,14 @@ define void @v_shuffle_v4i64_v3i64__5_5_u_5(ptr addrspace(1) inreg %ptr) {
; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_u_5:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
+; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
+; GFX940-NEXT: s_nop 1
; GFX940-NEXT: v_mov_b32_e32 v2, v4
; GFX940-NEXT: v_mov_b32_e32 v3, v5
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -7807,22 +7786,44 @@ define void @s_shuffle_v4i64_v3i64__0_u_u_u() {
}
define void @s_shuffle_v4i64_v3i64__1_u_u_u() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__1_u_u_u:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:13]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s10
-; GFX9-NEXT: s_mov_b32 s9, s11
-; GFX9-NEXT: s_mov_b32 s12, s10
-; GFX9-NEXT: s_mov_b32 s13, s11
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:15]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__1_u_u_u:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s8, s6
+; GFX900-NEXT: s_mov_b32 s9, s7
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__1_u_u_u:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s8, s6
+; GFX90A-NEXT: s_mov_b32 s9, s7
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4i64_v3i64__1_u_u_u:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s8, s2
+; GFX940-NEXT: s_mov_b32 s9, s3
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=s"()
%shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
@@ -7830,59 +7831,99 @@ define void @s_shuffle_v4i64_v3i64__1_u_u_u() {
}
define void @s_shuffle_v4i64_v3i64__2_u_u_u() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__2_u_u_u:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:13]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s12
-; GFX9-NEXT: s_mov_b32 s9, s13
-; GFX9-NEXT: s_mov_b32 s10, s12
-; GFX9-NEXT: s_mov_b32 s11, s13
-; GFX9-NEXT: s_mov_b32 s14, s12
-; GFX9-NEXT: s_mov_b32 s15, s13
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:15]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
- %vec0 = call <3 x i64> asm "; def $0", "=s"()
- %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 2, i32 poison, i32 poison, i32 poison>
- call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
- ret void
-}
-
-define void @s_shuffle_v4i64_v3i64__3_u_u_u() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__3_u_u_u:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:15]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
- %vec0 = call <3 x i64> asm "; def $0", "=s"()
- %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 3, i32 poison, i32 poison, i32 poison>
- call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
- ret void
-}
-
-define void @s_shuffle_v4i64_v3i64__4_u_u_u() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__4_u_u_u:
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__2_u_u_u:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__2_u_u_u:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4i64_v3i64__2_u_u_u:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s8, s4
+; GFX940-NEXT: s_mov_b32 s9, s5
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+ %vec0 = call <3 x i64> asm "; def $0", "=s"()
+ %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 2, i32 poison, i32 poison, i32 poison>
+ call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+ ret void
+}
+
+define void @s_shuffle_v4i64_v3i64__3_u_u_u() {
+; GFX9-LABEL: s_shuffle_v4i64_v3i64__3_u_u_u:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:13]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s10
-; GFX9-NEXT: s_mov_b32 s9, s11
-; GFX9-NEXT: s_mov_b32 s12, s10
-; GFX9-NEXT: s_mov_b32 s13, s11
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
-; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use s[8:15]
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec0 = call <3 x i64> asm "; def $0", "=s"()
+ %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 3, i32 poison, i32 poison, i32 poison>
+ call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+ ret void
+}
+
+define void @s_shuffle_v4i64_v3i64__4_u_u_u() {
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__4_u_u_u:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s8, s6
+; GFX900-NEXT: s_mov_b32 s9, s7
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__4_u_u_u:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s8, s6
+; GFX90A-NEXT: s_mov_b32 s9, s7
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4i64_v3i64__4_u_u_u:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s8, s2
+; GFX940-NEXT: s_mov_b32 s9, s3
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=s"()
%vec1 = call <3 x i64> asm "; def $0", "=s"()
%shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 4, i32 poison, i32 poison, i32 poison>
@@ -7891,22 +7932,40 @@ define void @s_shuffle_v4i64_v3i64__4_u_u_u() {
}
define void @s_shuffle_v4i64_v3i64__5_u_u_u() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_u_u_u:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:13]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s12
-; GFX9-NEXT: s_mov_b32 s9, s13
-; GFX9-NEXT: s_mov_b32 s10, s12
-; GFX9-NEXT: s_mov_b32 s11, s13
-; GFX9-NEXT: s_mov_b32 s14, s12
-; GFX9-NEXT: s_mov_b32 s15, s13
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:15]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_u_u_u:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_u_u_u:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_u_u_u:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s8, s4
+; GFX940-NEXT: s_mov_b32 s9, s5
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=s"()
%vec1 = call <3 x i64> asm "; def $0", "=s"()
%shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 5, i32 poison, i32 poison, i32 poison>
@@ -8150,22 +8209,50 @@ define void @s_shuffle_v4i64_v3i64__5_4_u_u() {
}
define void @s_shuffle_v4i64_v3i64__5_5_u_u() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_u_u:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:13]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s12
-; GFX9-NEXT: s_mov_b32 s9, s13
-; GFX9-NEXT: s_mov_b32 s10, s12
-; GFX9-NEXT: s_mov_b32 s11, s13
-; GFX9-NEXT: s_mov_b32 s14, s12
-; GFX9-NEXT: s_mov_b32 s15, s13
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:15]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_u_u:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[8:13]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s8, s12
+; GFX900-NEXT: s_mov_b32 s9, s13
+; GFX900-NEXT: s_mov_b32 s10, s12
+; GFX900-NEXT: s_mov_b32 s11, s13
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_u_u:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[8:13]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s8, s12
+; GFX90A-NEXT: s_mov_b32 s9, s13
+; GFX90A-NEXT: s_mov_b32 s10, s12
+; GFX90A-NEXT: s_mov_b32 s11, s13
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_u_u:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s8, s4
+; GFX940-NEXT: s_mov_b32 s9, s5
+; GFX940-NEXT: s_mov_b32 s10, s4
+; GFX940-NEXT: s_mov_b32 s11, s5
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=s"()
%vec1 = call <3 x i64> asm "; def $0", "=s"()
%shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 5, i32 5, i32 poison, i32 poison>
@@ -8491,8 +8578,6 @@ define void @s_shuffle_v4i64_v3i64__5_5_5_u() {
; GFX9-NEXT: s_mov_b32 s9, s13
; GFX9-NEXT: s_mov_b32 s10, s12
; GFX9-NEXT: s_mov_b32 s11, s13
-; GFX9-NEXT: s_mov_b32 s14, s12
-; GFX9-NEXT: s_mov_b32 s15, s13
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use s[8:15]
; GFX9-NEXT: ;;#ASMEND
@@ -8854,22 +8939,56 @@ define void @s_shuffle_v4i64_v3i64__5_5_5_5() {
}
define void @s_shuffle_v4i64_v3i64__u_0_0_0() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__u_0_0_0:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:13]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s10, s8
-; GFX9-NEXT: s_mov_b32 s11, s9
-; GFX9-NEXT: s_mov_b32 s12, s8
-; GFX9-NEXT: s_mov_b32 s13, s9
-; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_mov_b32 s15, s9
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:15]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__u_0_0_0:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s11, s5
+; GFX900-NEXT: s_mov_b32 s12, s4
+; GFX900-NEXT: s_mov_b32 s13, s5
+; GFX900-NEXT: s_mov_b32 s14, s4
+; GFX900-NEXT: s_mov_b32 s15, s5
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__u_0_0_0:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s10, s4
+; GFX90A-NEXT: s_mov_b32 s11, s5
+; GFX90A-NEXT: s_mov_b32 s12, s4
+; GFX90A-NEXT: s_mov_b32 s13, s5
+; GFX90A-NEXT: s_mov_b32 s14, s4
+; GFX90A-NEXT: s_mov_b32 s15, s5
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4i64_v3i64__u_0_0_0:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s10, s0
+; GFX940-NEXT: s_mov_b32 s11, s1
+; GFX940-NEXT: s_mov_b32 s12, s0
+; GFX940-NEXT: s_mov_b32 s13, s1
+; GFX940-NEXT: s_mov_b32 s14, s0
+; GFX940-NEXT: s_mov_b32 s15, s1
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=s"()
%shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 poison, i32 0, i32 0, i32 0>
call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
@@ -9022,22 +9141,56 @@ define void @s_shuffle_v4i64_v3i64__2_0_0_0() {
}
define void @s_shuffle_v4i64_v3i64__3_0_0_0() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__3_0_0_0:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:13]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s10, s8
-; GFX9-NEXT: s_mov_b32 s11, s9
-; GFX9-NEXT: s_mov_b32 s12, s8
-; GFX9-NEXT: s_mov_b32 s13, s9
-; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_mov_b32 s15, s9
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:15]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__3_0_0_0:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s11, s5
+; GFX900-NEXT: s_mov_b32 s12, s4
+; GFX900-NEXT: s_mov_b32 s13, s5
+; GFX900-NEXT: s_mov_b32 s14, s4
+; GFX900-NEXT: s_mov_b32 s15, s5
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__3_0_0_0:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s10, s4
+; GFX90A-NEXT: s_mov_b32 s11, s5
+; GFX90A-NEXT: s_mov_b32 s12, s4
+; GFX90A-NEXT: s_mov_b32 s13, s5
+; GFX90A-NEXT: s_mov_b32 s14, s4
+; GFX90A-NEXT: s_mov_b32 s15, s5
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4i64_v3i64__3_0_0_0:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s10, s0
+; GFX940-NEXT: s_mov_b32 s11, s1
+; GFX940-NEXT: s_mov_b32 s12, s0
+; GFX940-NEXT: s_mov_b32 s13, s1
+; GFX940-NEXT: s_mov_b32 s14, s0
+; GFX940-NEXT: s_mov_b32 s15, s1
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=s"()
%shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
@@ -9964,8 +10117,6 @@ define void @s_shuffle_v4i64_v3i64__u_1_1_1() {
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; def s[8:13]
; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s10
-; GFX9-NEXT: s_mov_b32 s9, s11
; GFX9-NEXT: s_mov_b32 s12, s10
; GFX9-NEXT: s_mov_b32 s13, s11
; GFX9-NEXT: s_mov_b32 s14, s10
@@ -10054,8 +10205,6 @@ define void @s_shuffle_v4i64_v3i64__3_1_1_1() {
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; def s[8:13]
; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s10
-; GFX9-NEXT: s_mov_b32 s9, s11
; GFX9-NEXT: s_mov_b32 s12, s10
; GFX9-NEXT: s_mov_b32 s13, s11
; GFX9-NEXT: s_mov_b32 s14, s10
@@ -10968,8 +11117,6 @@ define void @s_shuffle_v4i64_v3i64__u_2_2_2() {
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; def s[8:13]
; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s12
-; GFX9-NEXT: s_mov_b32 s9, s13
; GFX9-NEXT: s_mov_b32 s10, s12
; GFX9-NEXT: s_mov_b32 s11, s13
; GFX9-NEXT: s_mov_b32 s14, s12
@@ -11058,8 +11205,6 @@ define void @s_shuffle_v4i64_v3i64__3_2_2_2() {
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; def s[8:13]
; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s12
-; GFX9-NEXT: s_mov_b32 s9, s13
; GFX9-NEXT: s_mov_b32 s10, s12
; GFX9-NEXT: s_mov_b32 s11, s13
; GFX9-NEXT: s_mov_b32 s14, s12
@@ -11992,22 +12137,44 @@ define void @s_shuffle_v4i64_v3i64__0_3_3_3() {
}
define void @s_shuffle_v4i64_v3i64__1_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__1_3_3_3:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:13]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s10
-; GFX9-NEXT: s_mov_b32 s9, s11
-; GFX9-NEXT: s_mov_b32 s12, s10
-; GFX9-NEXT: s_mov_b32 s13, s11
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:15]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__1_3_3_3:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s8, s6
+; GFX900-NEXT: s_mov_b32 s9, s7
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__1_3_3_3:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s8, s6
+; GFX90A-NEXT: s_mov_b32 s9, s7
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4i64_v3i64__1_3_3_3:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s8, s2
+; GFX940-NEXT: s_mov_b32 s9, s3
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=s"()
%shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 1, i32 3, i32 3, i32 3>
call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
@@ -12015,22 +12182,40 @@ define void @s_shuffle_v4i64_v3i64__1_3_3_3() {
}
define void @s_shuffle_v4i64_v3i64__2_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__2_3_3_3:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:13]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s12
-; GFX9-NEXT: s_mov_b32 s9, s13
-; GFX9-NEXT: s_mov_b32 s10, s12
-; GFX9-NEXT: s_mov_b32 s11, s13
-; GFX9-NEXT: s_mov_b32 s14, s12
-; GFX9-NEXT: s_mov_b32 s15, s13
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:15]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__2_3_3_3:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__2_3_3_3:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4i64_v3i64__2_3_3_3:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s8, s4
+; GFX940-NEXT: s_mov_b32 s9, s5
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=s"()
%shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
@@ -12898,8 +13083,6 @@ define void @s_shuffle_v4i64_v3i64__u_4_4_4() {
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; def s[8:13]
; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s10
-; GFX9-NEXT: s_mov_b32 s9, s11
; GFX9-NEXT: s_mov_b32 s12, s10
; GFX9-NEXT: s_mov_b32 s13, s11
; GFX9-NEXT: s_mov_b32 s14, s10
@@ -13871,8 +14054,6 @@ define void @s_shuffle_v4i64_v3i64__u_5_5_5() {
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; def s[8:13]
; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s12
-; GFX9-NEXT: s_mov_b32 s9, s13
; GFX9-NEXT: s_mov_b32 s10, s12
; GFX9-NEXT: s_mov_b32 s11, s13
; GFX9-NEXT: s_mov_b32 s14, s12
@@ -14144,8 +14325,6 @@ define void @s_shuffle_v4i64_v3i64__5_u_5_5() {
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_mov_b32 s8, s12
; GFX9-NEXT: s_mov_b32 s9, s13
-; GFX9-NEXT: s_mov_b32 s10, s12
-; GFX9-NEXT: s_mov_b32 s11, s13
; GFX9-NEXT: s_mov_b32 s14, s12
; GFX9-NEXT: s_mov_b32 s15, s13
; GFX9-NEXT: ;;#ASMSTART
@@ -14447,22 +14626,56 @@ define void @s_shuffle_v4i64_v3i64__5_4_5_5() {
}
define void @s_shuffle_v4i64_v3i64__5_5_u_5() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_u_5:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:13]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s12
-; GFX9-NEXT: s_mov_b32 s9, s13
-; GFX9-NEXT: s_mov_b32 s10, s12
-; GFX9-NEXT: s_mov_b32 s11, s13
-; GFX9-NEXT: s_mov_b32 s14, s12
-; GFX9-NEXT: s_mov_b32 s15, s13
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:15]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_u_5:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[8:13]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s8, s12
+; GFX900-NEXT: s_mov_b32 s9, s13
+; GFX900-NEXT: s_mov_b32 s10, s12
+; GFX900-NEXT: s_mov_b32 s11, s13
+; GFX900-NEXT: s_mov_b32 s14, s12
+; GFX900-NEXT: s_mov_b32 s15, s13
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_u_5:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[8:13]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s8, s12
+; GFX90A-NEXT: s_mov_b32 s9, s13
+; GFX90A-NEXT: s_mov_b32 s10, s12
+; GFX90A-NEXT: s_mov_b32 s11, s13
+; GFX90A-NEXT: s_mov_b32 s14, s12
+; GFX90A-NEXT: s_mov_b32 s15, s13
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_u_5:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s8, s4
+; GFX940-NEXT: s_mov_b32 s9, s5
+; GFX940-NEXT: s_mov_b32 s10, s4
+; GFX940-NEXT: s_mov_b32 s11, s5
+; GFX940-NEXT: s_mov_b32 s14, s4
+; GFX940-NEXT: s_mov_b32 s15, s5
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=s"()
%vec1 = call <3 x i64> asm "; def $0", "=s"()
%shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 5, i32 5, i32 poison, i32 5>
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll
index 2b46616c87f0dd..7a509ffb8c1591 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll
@@ -61,13 +61,10 @@ define void @v_shuffle_v4p0_v3p0__1_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4p0_v3p0__1_u_u_u:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:5]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
-; GFX900-NEXT: v_mov_b32_e32 v4, v2
-; GFX900-NEXT: v_mov_b32_e32 v5, v3
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
@@ -75,13 +72,10 @@ define void @v_shuffle_v4p0_v3p0__1_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p0_v3p0__1_u_u_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -89,13 +83,10 @@ define void @v_shuffle_v4p0_v3p0__1_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX940-LABEL: v_shuffle_v4p0_v3p0__1_u_u_u:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_mov_b32_e32 v4, v2
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -113,10 +104,9 @@ define void @v_shuffle_v4p0_v3p0__2_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ; def v[0:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v6, 0
-; GFX900-NEXT: v_mov_b32_e32 v2, v4
-; GFX900-NEXT: v_mov_b32_e32 v3, v5
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v4
+; GFX900-NEXT: v_mov_b32_e32 v1, v5
+; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -127,10 +117,9 @@ define void @v_shuffle_v4p0_v3p0__2_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -141,10 +130,9 @@ define void @v_shuffle_v4p0_v3p0__2_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v4
-; GFX940-NEXT: v_mov_b32_e32 v3, v5
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v0, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=v"()
@@ -168,13 +156,10 @@ define void @v_shuffle_v4p0_v3p0__4_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4p0_v3p0__4_u_u_u:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:5]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
-; GFX900-NEXT: v_mov_b32_e32 v4, v2
-; GFX900-NEXT: v_mov_b32_e32 v5, v3
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
@@ -182,13 +167,10 @@ define void @v_shuffle_v4p0_v3p0__4_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p0_v3p0__4_u_u_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -196,13 +178,10 @@ define void @v_shuffle_v4p0_v3p0__4_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX940-LABEL: v_shuffle_v4p0_v3p0__4_u_u_u:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_mov_b32_e32 v4, v2
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -221,10 +200,9 @@ define void @v_shuffle_v4p0_v3p0__5_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ; def v[0:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v6, 0
-; GFX900-NEXT: v_mov_b32_e32 v2, v4
-; GFX900-NEXT: v_mov_b32_e32 v3, v5
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v4
+; GFX900-NEXT: v_mov_b32_e32 v1, v5
+; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -235,10 +213,9 @@ define void @v_shuffle_v4p0_v3p0__5_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -249,10 +226,9 @@ define void @v_shuffle_v4p0_v3p0__5_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v4
-; GFX940-NEXT: v_mov_b32_e32 v3, v5
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v0, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=v"()
@@ -544,7 +520,6 @@ define void @v_shuffle_v4p0_v3p0__5_5_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: v_mov_b32_e32 v2, v4
; GFX900-NEXT: v_mov_b32_e32 v3, v5
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
@@ -558,7 +533,6 @@ define void @v_shuffle_v4p0_v3p0__5_5_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -572,7 +546,6 @@ define void @v_shuffle_v4p0_v3p0__5_5_u_u(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: v_mov_b32_e32 v2, v4
; GFX940-NEXT: v_mov_b32_e32 v3, v5
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -875,9 +848,12 @@ define void @v_shuffle_v4p0_v3p0__5_5_5_u(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ; def v[0:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v6, 0
+; GFX900-NEXT: v_mov_b32_e32 v0, v4
+; GFX900-NEXT: v_mov_b32_e32 v1, v5
+; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16
+; GFX900-NEXT: s_nop 0
; GFX900-NEXT: v_mov_b32_e32 v2, v4
; GFX900-NEXT: v_mov_b32_e32 v3, v5
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
@@ -889,9 +865,12 @@ define void @v_shuffle_v4p0_v3p0__5_5_5_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16
+; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -903,9 +882,12 @@ define void @v_shuffle_v4p0_v3p0__5_5_5_u(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v6, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1
+; GFX940-NEXT: s_nop 1
; GFX940-NEXT: v_mov_b32_e32 v2, v4
; GFX940-NEXT: v_mov_b32_e32 v3, v5
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -2449,7 +2431,7 @@ define void @v_shuffle_v4p0_v3p0__u_1_1_1(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: v_mov_b32_e32 v4, v2
; GFX900-NEXT: v_mov_b32_e32 v5, v3
; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2463,7 +2445,7 @@ define void @v_shuffle_v4p0_v3p0__u_1_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: v_mov_b32_e32 v4, v2
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2477,7 +2459,7 @@ define void @v_shuffle_v4p0_v3p0__u_1_1_1(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: v_mov_b32_e32 v4, v2
; GFX940-NEXT: v_mov_b32_e32 v5, v3
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=v"()
@@ -2650,7 +2632,7 @@ define void @v_shuffle_v4p0_v3p0__3_1_1_1(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: v_mov_b32_e32 v4, v2
; GFX900-NEXT: v_mov_b32_e32 v5, v3
; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2664,7 +2646,7 @@ define void @v_shuffle_v4p0_v3p0__3_1_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: v_mov_b32_e32 v4, v2
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2678,7 +2660,7 @@ define void @v_shuffle_v4p0_v3p0__3_1_1_1(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: v_mov_b32_e32 v4, v2
; GFX940-NEXT: v_mov_b32_e32 v5, v3
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=v"()
@@ -4691,13 +4673,10 @@ define void @v_shuffle_v4p0_v3p0__1_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4p0_v3p0__1_3_3_3:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:5]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
-; GFX900-NEXT: v_mov_b32_e32 v4, v2
-; GFX900-NEXT: v_mov_b32_e32 v5, v3
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
@@ -4705,13 +4684,10 @@ define void @v_shuffle_v4p0_v3p0__1_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p0_v3p0__1_3_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -4719,13 +4695,10 @@ define void @v_shuffle_v4p0_v3p0__1_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX940-LABEL: v_shuffle_v4p0_v3p0__1_3_3_3:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_mov_b32_e32 v4, v2
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -4743,10 +4716,9 @@ define void @v_shuffle_v4p0_v3p0__2_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ; def v[0:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v6, 0
-; GFX900-NEXT: v_mov_b32_e32 v2, v4
-; GFX900-NEXT: v_mov_b32_e32 v3, v5
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v4
+; GFX900-NEXT: v_mov_b32_e32 v1, v5
+; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -4757,10 +4729,9 @@ define void @v_shuffle_v4p0_v3p0__2_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4771,10 +4742,9 @@ define void @v_shuffle_v4p0_v3p0__2_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v4
-; GFX940-NEXT: v_mov_b32_e32 v3, v5
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v0, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=v"()
@@ -5609,7 +5579,7 @@ define void @v_shuffle_v4p0_v3p0__u_4_4_4(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: v_mov_b32_e32 v4, v2
; GFX900-NEXT: v_mov_b32_e32 v5, v3
; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -5623,7 +5593,7 @@ define void @v_shuffle_v4p0_v3p0__u_4_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: v_mov_b32_e32 v4, v2
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5637,7 +5607,7 @@ define void @v_shuffle_v4p0_v3p0__u_4_4_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: v_mov_b32_e32 v4, v2
; GFX940-NEXT: v_mov_b32_e32 v5, v3
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=v"()
@@ -7046,8 +7016,10 @@ define void @v_shuffle_v4p0_v3p0__5_u_5_5(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: v_mov_b32_e32 v2, v4
; GFX900-NEXT: v_mov_b32_e32 v3, v5
+; GFX900-NEXT: v_mov_b32_e32 v0, v4
+; GFX900-NEXT: v_mov_b32_e32 v1, v5
; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -7060,8 +7032,10 @@ define void @v_shuffle_v4p0_v3p0__5_u_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -7074,8 +7048,10 @@ define void @v_shuffle_v4p0_v3p0__5_u_5_5(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: v_mov_b32_e32 v2, v4
; GFX940-NEXT: v_mov_b32_e32 v3, v5
+; GFX940-NEXT: v_mov_b32_e32 v0, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=v"()
@@ -7408,13 +7384,14 @@ define void @v_shuffle_v4p0_v3p0__5_5_u_5(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_u_5:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:5]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
+; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
+; GFX900-NEXT: s_nop 0
; GFX900-NEXT: v_mov_b32_e32 v2, v4
; GFX900-NEXT: v_mov_b32_e32 v3, v5
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
@@ -7422,13 +7399,14 @@ define void @v_shuffle_v4p0_v3p0__5_5_u_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_u_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
+; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -7436,13 +7414,14 @@ define void @v_shuffle_v4p0_v3p0__5_5_u_5(ptr addrspace(1) inreg %ptr) {
; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_u_5:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
+; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
+; GFX940-NEXT: s_nop 1
; GFX940-NEXT: v_mov_b32_e32 v2, v4
; GFX940-NEXT: v_mov_b32_e32 v3, v5
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -7807,22 +7786,44 @@ define void @s_shuffle_v4p0_v3p0__0_u_u_u() {
}
define void @s_shuffle_v4p0_v3p0__1_u_u_u() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__1_u_u_u:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:13]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s10
-; GFX9-NEXT: s_mov_b32 s9, s11
-; GFX9-NEXT: s_mov_b32 s12, s10
-; GFX9-NEXT: s_mov_b32 s13, s11
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:15]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__1_u_u_u:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s8, s6
+; GFX900-NEXT: s_mov_b32 s9, s7
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__1_u_u_u:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s8, s6
+; GFX90A-NEXT: s_mov_b32 s9, s7
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4p0_v3p0__1_u_u_u:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s8, s2
+; GFX940-NEXT: s_mov_b32 s9, s3
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=s"()
%shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
@@ -7830,59 +7831,99 @@ define void @s_shuffle_v4p0_v3p0__1_u_u_u() {
}
define void @s_shuffle_v4p0_v3p0__2_u_u_u() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__2_u_u_u:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:13]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s12
-; GFX9-NEXT: s_mov_b32 s9, s13
-; GFX9-NEXT: s_mov_b32 s10, s12
-; GFX9-NEXT: s_mov_b32 s11, s13
-; GFX9-NEXT: s_mov_b32 s14, s12
-; GFX9-NEXT: s_mov_b32 s15, s13
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:15]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
- %vec0 = call <3 x ptr> asm "; def $0", "=s"()
- %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 2, i32 poison, i32 poison, i32 poison>
- call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
- ret void
-}
-
-define void @s_shuffle_v4p0_v3p0__3_u_u_u() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__3_u_u_u:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:15]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
- %vec0 = call <3 x ptr> asm "; def $0", "=s"()
- %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 3, i32 poison, i32 poison, i32 poison>
- call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
- ret void
-}
-
-define void @s_shuffle_v4p0_v3p0__4_u_u_u() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__4_u_u_u:
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__2_u_u_u:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__2_u_u_u:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4p0_v3p0__2_u_u_u:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s8, s4
+; GFX940-NEXT: s_mov_b32 s9, s5
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+ %vec0 = call <3 x ptr> asm "; def $0", "=s"()
+ %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 2, i32 poison, i32 poison, i32 poison>
+ call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+ ret void
+}
+
+define void @s_shuffle_v4p0_v3p0__3_u_u_u() {
+; GFX9-LABEL: s_shuffle_v4p0_v3p0__3_u_u_u:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:13]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s10
-; GFX9-NEXT: s_mov_b32 s9, s11
-; GFX9-NEXT: s_mov_b32 s12, s10
-; GFX9-NEXT: s_mov_b32 s13, s11
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
-; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use s[8:15]
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec0 = call <3 x ptr> asm "; def $0", "=s"()
+ %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 3, i32 poison, i32 poison, i32 poison>
+ call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+ ret void
+}
+
+define void @s_shuffle_v4p0_v3p0__4_u_u_u() {
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__4_u_u_u:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s8, s6
+; GFX900-NEXT: s_mov_b32 s9, s7
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__4_u_u_u:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s8, s6
+; GFX90A-NEXT: s_mov_b32 s9, s7
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4p0_v3p0__4_u_u_u:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s8, s2
+; GFX940-NEXT: s_mov_b32 s9, s3
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=s"()
%vec1 = call <3 x ptr> asm "; def $0", "=s"()
%shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 4, i32 poison, i32 poison, i32 poison>
@@ -7891,22 +7932,40 @@ define void @s_shuffle_v4p0_v3p0__4_u_u_u() {
}
define void @s_shuffle_v4p0_v3p0__5_u_u_u() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_u_u_u:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:13]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s12
-; GFX9-NEXT: s_mov_b32 s9, s13
-; GFX9-NEXT: s_mov_b32 s10, s12
-; GFX9-NEXT: s_mov_b32 s11, s13
-; GFX9-NEXT: s_mov_b32 s14, s12
-; GFX9-NEXT: s_mov_b32 s15, s13
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:15]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_u_u_u:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_u_u_u:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_u_u_u:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s8, s4
+; GFX940-NEXT: s_mov_b32 s9, s5
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=s"()
%vec1 = call <3 x ptr> asm "; def $0", "=s"()
%shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 5, i32 poison, i32 poison, i32 poison>
@@ -8150,22 +8209,50 @@ define void @s_shuffle_v4p0_v3p0__5_4_u_u() {
}
define void @s_shuffle_v4p0_v3p0__5_5_u_u() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_u_u:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:13]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s12
-; GFX9-NEXT: s_mov_b32 s9, s13
-; GFX9-NEXT: s_mov_b32 s10, s12
-; GFX9-NEXT: s_mov_b32 s11, s13
-; GFX9-NEXT: s_mov_b32 s14, s12
-; GFX9-NEXT: s_mov_b32 s15, s13
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:15]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_u_u:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[8:13]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s8, s12
+; GFX900-NEXT: s_mov_b32 s9, s13
+; GFX900-NEXT: s_mov_b32 s10, s12
+; GFX900-NEXT: s_mov_b32 s11, s13
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_u_u:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[8:13]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s8, s12
+; GFX90A-NEXT: s_mov_b32 s9, s13
+; GFX90A-NEXT: s_mov_b32 s10, s12
+; GFX90A-NEXT: s_mov_b32 s11, s13
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_u_u:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s8, s4
+; GFX940-NEXT: s_mov_b32 s9, s5
+; GFX940-NEXT: s_mov_b32 s10, s4
+; GFX940-NEXT: s_mov_b32 s11, s5
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=s"()
%vec1 = call <3 x ptr> asm "; def $0", "=s"()
%shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 5, i32 5, i32 poison, i32 poison>
@@ -8491,8 +8578,6 @@ define void @s_shuffle_v4p0_v3p0__5_5_5_u() {
; GFX9-NEXT: s_mov_b32 s9, s13
; GFX9-NEXT: s_mov_b32 s10, s12
; GFX9-NEXT: s_mov_b32 s11, s13
-; GFX9-NEXT: s_mov_b32 s14, s12
-; GFX9-NEXT: s_mov_b32 s15, s13
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use s[8:15]
; GFX9-NEXT: ;;#ASMEND
@@ -8854,22 +8939,56 @@ define void @s_shuffle_v4p0_v3p0__5_5_5_5() {
}
define void @s_shuffle_v4p0_v3p0__u_0_0_0() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__u_0_0_0:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:13]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s10, s8
-; GFX9-NEXT: s_mov_b32 s11, s9
-; GFX9-NEXT: s_mov_b32 s12, s8
-; GFX9-NEXT: s_mov_b32 s13, s9
-; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_mov_b32 s15, s9
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:15]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__u_0_0_0:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s11, s5
+; GFX900-NEXT: s_mov_b32 s12, s4
+; GFX900-NEXT: s_mov_b32 s13, s5
+; GFX900-NEXT: s_mov_b32 s14, s4
+; GFX900-NEXT: s_mov_b32 s15, s5
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__u_0_0_0:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s10, s4
+; GFX90A-NEXT: s_mov_b32 s11, s5
+; GFX90A-NEXT: s_mov_b32 s12, s4
+; GFX90A-NEXT: s_mov_b32 s13, s5
+; GFX90A-NEXT: s_mov_b32 s14, s4
+; GFX90A-NEXT: s_mov_b32 s15, s5
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4p0_v3p0__u_0_0_0:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s10, s0
+; GFX940-NEXT: s_mov_b32 s11, s1
+; GFX940-NEXT: s_mov_b32 s12, s0
+; GFX940-NEXT: s_mov_b32 s13, s1
+; GFX940-NEXT: s_mov_b32 s14, s0
+; GFX940-NEXT: s_mov_b32 s15, s1
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=s"()
%shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 poison, i32 0, i32 0, i32 0>
call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
@@ -9022,22 +9141,56 @@ define void @s_shuffle_v4p0_v3p0__2_0_0_0() {
}
define void @s_shuffle_v4p0_v3p0__3_0_0_0() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__3_0_0_0:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:13]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s10, s8
-; GFX9-NEXT: s_mov_b32 s11, s9
-; GFX9-NEXT: s_mov_b32 s12, s8
-; GFX9-NEXT: s_mov_b32 s13, s9
-; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_mov_b32 s15, s9
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:15]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__3_0_0_0:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s11, s5
+; GFX900-NEXT: s_mov_b32 s12, s4
+; GFX900-NEXT: s_mov_b32 s13, s5
+; GFX900-NEXT: s_mov_b32 s14, s4
+; GFX900-NEXT: s_mov_b32 s15, s5
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__3_0_0_0:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s10, s4
+; GFX90A-NEXT: s_mov_b32 s11, s5
+; GFX90A-NEXT: s_mov_b32 s12, s4
+; GFX90A-NEXT: s_mov_b32 s13, s5
+; GFX90A-NEXT: s_mov_b32 s14, s4
+; GFX90A-NEXT: s_mov_b32 s15, s5
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4p0_v3p0__3_0_0_0:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s10, s0
+; GFX940-NEXT: s_mov_b32 s11, s1
+; GFX940-NEXT: s_mov_b32 s12, s0
+; GFX940-NEXT: s_mov_b32 s13, s1
+; GFX940-NEXT: s_mov_b32 s14, s0
+; GFX940-NEXT: s_mov_b32 s15, s1
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=s"()
%shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
@@ -9964,8 +10117,6 @@ define void @s_shuffle_v4p0_v3p0__u_1_1_1() {
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; def s[8:13]
; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s10
-; GFX9-NEXT: s_mov_b32 s9, s11
; GFX9-NEXT: s_mov_b32 s12, s10
; GFX9-NEXT: s_mov_b32 s13, s11
; GFX9-NEXT: s_mov_b32 s14, s10
@@ -10054,8 +10205,6 @@ define void @s_shuffle_v4p0_v3p0__3_1_1_1() {
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; def s[8:13]
; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s10
-; GFX9-NEXT: s_mov_b32 s9, s11
; GFX9-NEXT: s_mov_b32 s12, s10
; GFX9-NEXT: s_mov_b32 s13, s11
; GFX9-NEXT: s_mov_b32 s14, s10
@@ -10968,8 +11117,6 @@ define void @s_shuffle_v4p0_v3p0__u_2_2_2() {
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; def s[8:13]
; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s12
-; GFX9-NEXT: s_mov_b32 s9, s13
; GFX9-NEXT: s_mov_b32 s10, s12
; GFX9-NEXT: s_mov_b32 s11, s13
; GFX9-NEXT: s_mov_b32 s14, s12
@@ -11058,8 +11205,6 @@ define void @s_shuffle_v4p0_v3p0__3_2_2_2() {
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; def s[8:13]
; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s12
-; GFX9-NEXT: s_mov_b32 s9, s13
; GFX9-NEXT: s_mov_b32 s10, s12
; GFX9-NEXT: s_mov_b32 s11, s13
; GFX9-NEXT: s_mov_b32 s14, s12
@@ -11992,22 +12137,44 @@ define void @s_shuffle_v4p0_v3p0__0_3_3_3() {
}
define void @s_shuffle_v4p0_v3p0__1_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__1_3_3_3:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:13]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s10
-; GFX9-NEXT: s_mov_b32 s9, s11
-; GFX9-NEXT: s_mov_b32 s12, s10
-; GFX9-NEXT: s_mov_b32 s13, s11
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:15]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__1_3_3_3:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s8, s6
+; GFX900-NEXT: s_mov_b32 s9, s7
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__1_3_3_3:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s8, s6
+; GFX90A-NEXT: s_mov_b32 s9, s7
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4p0_v3p0__1_3_3_3:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s8, s2
+; GFX940-NEXT: s_mov_b32 s9, s3
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=s"()
%shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 1, i32 3, i32 3, i32 3>
call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
@@ -12015,22 +12182,40 @@ define void @s_shuffle_v4p0_v3p0__1_3_3_3() {
}
define void @s_shuffle_v4p0_v3p0__2_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__2_3_3_3:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:13]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s12
-; GFX9-NEXT: s_mov_b32 s9, s13
-; GFX9-NEXT: s_mov_b32 s10, s12
-; GFX9-NEXT: s_mov_b32 s11, s13
-; GFX9-NEXT: s_mov_b32 s14, s12
-; GFX9-NEXT: s_mov_b32 s15, s13
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:15]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__2_3_3_3:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__2_3_3_3:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4p0_v3p0__2_3_3_3:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s8, s4
+; GFX940-NEXT: s_mov_b32 s9, s5
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=s"()
%shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
@@ -12898,8 +13083,6 @@ define void @s_shuffle_v4p0_v3p0__u_4_4_4() {
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; def s[8:13]
; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s10
-; GFX9-NEXT: s_mov_b32 s9, s11
; GFX9-NEXT: s_mov_b32 s12, s10
; GFX9-NEXT: s_mov_b32 s13, s11
; GFX9-NEXT: s_mov_b32 s14, s10
@@ -13871,8 +14054,6 @@ define void @s_shuffle_v4p0_v3p0__u_5_5_5() {
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; def s[8:13]
; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s12
-; GFX9-NEXT: s_mov_b32 s9, s13
; GFX9-NEXT: s_mov_b32 s10, s12
; GFX9-NEXT: s_mov_b32 s11, s13
; GFX9-NEXT: s_mov_b32 s14, s12
@@ -14144,8 +14325,6 @@ define void @s_shuffle_v4p0_v3p0__5_u_5_5() {
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_mov_b32 s8, s12
; GFX9-NEXT: s_mov_b32 s9, s13
-; GFX9-NEXT: s_mov_b32 s10, s12
-; GFX9-NEXT: s_mov_b32 s11, s13
; GFX9-NEXT: s_mov_b32 s14, s12
; GFX9-NEXT: s_mov_b32 s15, s13
; GFX9-NEXT: ;;#ASMSTART
@@ -14447,22 +14626,56 @@ define void @s_shuffle_v4p0_v3p0__5_4_5_5() {
}
define void @s_shuffle_v4p0_v3p0__5_5_u_5() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_u_5:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:13]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s12
-; GFX9-NEXT: s_mov_b32 s9, s13
-; GFX9-NEXT: s_mov_b32 s10, s12
-; GFX9-NEXT: s_mov_b32 s11, s13
-; GFX9-NEXT: s_mov_b32 s14, s12
-; GFX9-NEXT: s_mov_b32 s15, s13
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:15]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_u_5:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[8:13]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s8, s12
+; GFX900-NEXT: s_mov_b32 s9, s13
+; GFX900-NEXT: s_mov_b32 s10, s12
+; GFX900-NEXT: s_mov_b32 s11, s13
+; GFX900-NEXT: s_mov_b32 s14, s12
+; GFX900-NEXT: s_mov_b32 s15, s13
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_u_5:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[8:13]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s8, s12
+; GFX90A-NEXT: s_mov_b32 s9, s13
+; GFX90A-NEXT: s_mov_b32 s10, s12
+; GFX90A-NEXT: s_mov_b32 s11, s13
+; GFX90A-NEXT: s_mov_b32 s14, s12
+; GFX90A-NEXT: s_mov_b32 s15, s13
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_u_5:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s8, s4
+; GFX940-NEXT: s_mov_b32 s9, s5
+; GFX940-NEXT: s_mov_b32 s10, s4
+; GFX940-NEXT: s_mov_b32 s11, s5
+; GFX940-NEXT: s_mov_b32 s14, s4
+; GFX940-NEXT: s_mov_b32 s15, s5
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=s"()
%vec1 = call <3 x ptr> asm "; def $0", "=s"()
%shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 5, i32 5, i32 poison, i32 5>
diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-fadd.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-fadd.ll
index 4a036a7868c1a9..95ff0d9a3a9c60 100644
--- a/llvm/test/CodeGen/PowerPC/vector-reduce-fadd.ll
+++ b/llvm/test/CodeGen/PowerPC/vector-reduce-fadd.ll
@@ -3628,15 +3628,15 @@ define dso_local ppc_fp128 @v2ppcf128_fast(<2 x ppc_fp128> %a) local_unnamed_add
; PWR9LE-LABEL: v2ppcf128_fast:
; PWR9LE: # %bb.0: # %entry
; PWR9LE-NEXT: mflr r0
-; PWR9LE-NEXT: stdu r1, -64(r1)
-; PWR9LE-NEXT: std r0, 80(r1)
+; PWR9LE-NEXT: stdu r1, -48(r1)
+; PWR9LE-NEXT: std r0, 64(r1)
; PWR9LE-NEXT: bl __gcc_qadd
; PWR9LE-NEXT: nop
; PWR9LE-NEXT: stfd f2, 40(r1)
; PWR9LE-NEXT: stfd f1, 32(r1)
; PWR9LE-NEXT: lxv vs1, 32(r1)
; PWR9LE-NEXT: xxswapd vs2, vs1
-; PWR9LE-NEXT: addi r1, r1, 64
+; PWR9LE-NEXT: addi r1, r1, 48
; PWR9LE-NEXT: ld r0, 16(r1)
; PWR9LE-NEXT: mtlr r0
; PWR9LE-NEXT: blr
@@ -3644,15 +3644,15 @@ define dso_local ppc_fp128 @v2ppcf128_fast(<2 x ppc_fp128> %a) local_unnamed_add
; PWR9BE-LABEL: v2ppcf128_fast:
; PWR9BE: # %bb.0: # %entry
; PWR9BE-NEXT: mflr r0
-; PWR9BE-NEXT: stdu r1, -144(r1)
-; PWR9BE-NEXT: std r0, 160(r1)
+; PWR9BE-NEXT: stdu r1, -128(r1)
+; PWR9BE-NEXT: std r0, 144(r1)
; PWR9BE-NEXT: bl __gcc_qadd
; PWR9BE-NEXT: nop
; PWR9BE-NEXT: stfd f2, 120(r1)
; PWR9BE-NEXT: stfd f1, 112(r1)
; PWR9BE-NEXT: lxv vs1, 112(r1)
; PWR9BE-NEXT: xxswapd vs2, vs1
-; PWR9BE-NEXT: addi r1, r1, 144
+; PWR9BE-NEXT: addi r1, r1, 128
; PWR9BE-NEXT: ld r0, 16(r1)
; PWR9BE-NEXT: mtlr r0
; PWR9BE-NEXT: blr
@@ -3661,13 +3661,13 @@ define dso_local ppc_fp128 @v2ppcf128_fast(<2 x ppc_fp128> %a) local_unnamed_add
; PWR10LE: # %bb.0: # %entry
; PWR10LE-NEXT: mflr r0
; PWR10LE-NEXT: std r0, 16(r1)
-; PWR10LE-NEXT: stdu r1, -64(r1)
+; PWR10LE-NEXT: stdu r1, -48(r1)
; PWR10LE-NEXT: bl __gcc_qadd@notoc
; PWR10LE-NEXT: stfd f2, 40(r1)
; PWR10LE-NEXT: stfd f1, 32(r1)
; PWR10LE-NEXT: lxv vs1, 32(r1)
; PWR10LE-NEXT: xxswapd vs2, vs1
-; PWR10LE-NEXT: addi r1, r1, 64
+; PWR10LE-NEXT: addi r1, r1, 48
; PWR10LE-NEXT: ld r0, 16(r1)
; PWR10LE-NEXT: mtlr r0
; PWR10LE-NEXT: blr
@@ -3676,14 +3676,14 @@ define dso_local ppc_fp128 @v2ppcf128_fast(<2 x ppc_fp128> %a) local_unnamed_add
; PWR10BE: # %bb.0: # %entry
; PWR10BE-NEXT: mflr r0
; PWR10BE-NEXT: std r0, 16(r1)
-; PWR10BE-NEXT: stdu r1, -144(r1)
+; PWR10BE-NEXT: stdu r1, -128(r1)
; PWR10BE-NEXT: bl __gcc_qadd
; PWR10BE-NEXT: nop
; PWR10BE-NEXT: stfd f2, 120(r1)
; PWR10BE-NEXT: stfd f1, 112(r1)
; PWR10BE-NEXT: lxv vs1, 112(r1)
; PWR10BE-NEXT: xxswapd vs2, vs1
-; PWR10BE-NEXT: addi r1, r1, 144
+; PWR10BE-NEXT: addi r1, r1, 128
; PWR10BE-NEXT: ld r0, 16(r1)
; PWR10BE-NEXT: mtlr r0
; PWR10BE-NEXT: blr
diff --git a/llvm/test/CodeGen/WebAssembly/simd.ll b/llvm/test/CodeGen/WebAssembly/simd.ll
index 5ec9f6a2a321b3..7228d5335a33f6 100644
--- a/llvm/test/CodeGen/WebAssembly/simd.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd.ll
@@ -481,21 +481,6 @@ define <16 x i8> @shuffle_undef_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: shuffle_undef_v16i8:
; NO-SIMD128: .functype shuffle_undef_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.store8 15($0), $2
-; NO-SIMD128-NEXT: i32.store8 14($0), $2
-; NO-SIMD128-NEXT: i32.store8 13($0), $2
-; NO-SIMD128-NEXT: i32.store8 12($0), $2
-; NO-SIMD128-NEXT: i32.store8 11($0), $2
-; NO-SIMD128-NEXT: i32.store8 10($0), $2
-; NO-SIMD128-NEXT: i32.store8 9($0), $2
-; NO-SIMD128-NEXT: i32.store8 8($0), $2
-; NO-SIMD128-NEXT: i32.store8 7($0), $2
-; NO-SIMD128-NEXT: i32.store8 6($0), $2
-; NO-SIMD128-NEXT: i32.store8 5($0), $2
-; NO-SIMD128-NEXT: i32.store8 4($0), $2
-; NO-SIMD128-NEXT: i32.store8 3($0), $2
-; NO-SIMD128-NEXT: i32.store8 2($0), $2
-; NO-SIMD128-NEXT: i32.store8 1($0), $2
; NO-SIMD128-NEXT: i32.store8 0($0), $2
; NO-SIMD128-NEXT: return
%res = shufflevector <16 x i8> %x, <16 x i8> %y,
@@ -994,13 +979,6 @@ define <8 x i16> @shuffle_undef_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: shuffle_undef_v8i16:
; NO-SIMD128: .functype shuffle_undef_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.store16 14($0), $2
-; NO-SIMD128-NEXT: i32.store16 12($0), $2
-; NO-SIMD128-NEXT: i32.store16 10($0), $2
-; NO-SIMD128-NEXT: i32.store16 8($0), $2
-; NO-SIMD128-NEXT: i32.store16 6($0), $2
-; NO-SIMD128-NEXT: i32.store16 4($0), $2
-; NO-SIMD128-NEXT: i32.store16 2($0), $2
; NO-SIMD128-NEXT: i32.store16 0($0), $2
; NO-SIMD128-NEXT: return
%res = shufflevector <8 x i16> %x, <8 x i16> %y,
@@ -1288,9 +1266,6 @@ define <4 x i32> @shuffle_undef_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: shuffle_undef_v4i32:
; NO-SIMD128: .functype shuffle_undef_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.store 12($0), $2
-; NO-SIMD128-NEXT: i32.store 8($0), $2
-; NO-SIMD128-NEXT: i32.store 4($0), $2
; NO-SIMD128-NEXT: i32.store 0($0), $2
; NO-SIMD128-NEXT: return
%res = shufflevector <4 x i32> %x, <4 x i32> %y,
@@ -1550,7 +1525,6 @@ define <2 x i64> @shuffle_undef_v2i64(<2 x i64> %x, <2 x i64> %y) {
; NO-SIMD128-LABEL: shuffle_undef_v2i64:
; NO-SIMD128: .functype shuffle_undef_v2i64 (i32, i64, i64, i64, i64) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i64.store 8($0), $2
; NO-SIMD128-NEXT: i64.store 0($0), $2
; NO-SIMD128-NEXT: return
%res = shufflevector <2 x i64> %x, <2 x i64> %y,
@@ -1819,9 +1793,6 @@ define <4 x float> @shuffle_undef_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: shuffle_undef_v4f32:
; NO-SIMD128: .functype shuffle_undef_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f32.store 12($0), $2
-; NO-SIMD128-NEXT: f32.store 8($0), $2
-; NO-SIMD128-NEXT: f32.store 4($0), $2
; NO-SIMD128-NEXT: f32.store 0($0), $2
; NO-SIMD128-NEXT: return
%res = shufflevector <4 x float> %x, <4 x float> %y,
@@ -2082,7 +2053,6 @@ define <2 x double> @shuffle_undef_v2f64(<2 x double> %x, <2 x double> %y) {
; NO-SIMD128-LABEL: shuffle_undef_v2f64:
; NO-SIMD128: .functype shuffle_undef_v2f64 (i32, f64, f64, f64, f64) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f64.store 8($0), $2
; NO-SIMD128-NEXT: f64.store 0($0), $2
; NO-SIMD128-NEXT: return
%res = shufflevector <2 x double> %x, <2 x double> %y,
diff --git a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
index 28ac4496acb9be..97cc1f8a156943 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
@@ -141,8 +141,10 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: movl $1463, %eax # imm = 0x5B7
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: pmuludq %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2047,2047,2047,2047]
; SSE2-NEXT: movdqa %xmm0, %xmm3
diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll
index 7e081310c35be5..49cb7c707a14f3 100644
--- a/llvm/test/CodeGen/X86/vec_smulo.ll
+++ b/llvm/test/CodeGen/X86/vec_smulo.ll
@@ -474,8 +474,6 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSE2-NEXT: pcmpgtd %xmm3, %xmm6
; SSE2-NEXT: pand %xmm7, %xmm6
; SSE2-NEXT: paddd %xmm8, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
@@ -548,8 +546,6 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6
; SSSE3-NEXT: pand %xmm7, %xmm6
; SSSE3-NEXT: paddd %xmm8, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSSE3-NEXT: pmuludq %xmm2, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
@@ -578,25 +574,23 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pmuldq %xmm2, %xmm0
; SSE41-NEXT: pinsrd $3, %r8d, %xmm2
-; SSE41-NEXT: movl {{[0-9]+}}(%rsp), %edx
+; SSE41-NEXT: movl {{[0-9]+}}(%rsp), %ecx
; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE41-NEXT: movd %r9d, %xmm4
; SSE41-NEXT: movdqa %xmm4, %xmm5
; SSE41-NEXT: pmuldq %xmm3, %xmm4
-; SSE41-NEXT: pinsrd $1, %edx, %xmm3
-; SSE41-NEXT: movl {{[0-9]+}}(%rsp), %esi
-; SSE41-NEXT: pinsrd $1, %esi, %xmm5
+; SSE41-NEXT: pinsrd $1, %ecx, %xmm3
+; SSE41-NEXT: movl {{[0-9]+}}(%rsp), %edx
+; SSE41-NEXT: pinsrd $1, %edx, %xmm5
; SSE41-NEXT: pmulld %xmm3, %xmm5
; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm1
-; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE41-NEXT: movd %edx, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
-; SSE41-NEXT: movd %esi, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0]
+; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; SSE41-NEXT: movd %ecx, %xmm3
+; SSE41-NEXT: movd %edx, %xmm6
; SSE41-NEXT: pmuldq %xmm3, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5],xmm6[6,7]
-; SSE41-NEXT: movq %xmm5, 16(%rcx)
+; SSE41-NEXT: movq %xmm5, 16(%rsi)
; SSE41-NEXT: psrad $31, %xmm5
; SSE41-NEXT: pcmpeqd %xmm3, %xmm5
; SSE41-NEXT: pcmpeqd %xmm3, %xmm3
@@ -607,7 +601,7 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
; SSE41-NEXT: pmulld %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, (%rcx)
+; SSE41-NEXT: movdqa %xmm1, (%rsi)
; SSE41-NEXT: psrad $31, %xmm1
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
; SSE41-NEXT: pxor %xmm3, %xmm1
diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll
index 68c6ca93576b76..62db6d234d3019 100644
--- a/llvm/test/CodeGen/X86/vec_umulo.ll
+++ b/llvm/test/CodeGen/X86/vec_umulo.ll
@@ -394,8 +394,8 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = mem[0,0,0,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = mem[0,0,0,0]
+; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero
; SSE2-NEXT: pmuludq %xmm2, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,3,2,3]
@@ -444,8 +444,8 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = mem[0,0,0,0]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = mem[0,0,0,0]
+; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero
; SSSE3-NEXT: pmuludq %xmm2, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,3,2,3]
@@ -492,9 +492,7 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSE41-NEXT: pcmpeqd %xmm6, %xmm6
; SSE41-NEXT: pxor %xmm6, %xmm3
; SSE41-NEXT: movd %edi, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,0,0]
; SSE41-NEXT: movd %r9d, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,0,0]
; SSE41-NEXT: pmuludq %xmm7, %xmm8
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3],xmm1[4,5],xmm8[6,7]
diff --git a/llvm/test/CodeGen/X86/widen_shuffle-1.ll b/llvm/test/CodeGen/X86/widen_shuffle-1.ll
index 3257936f62e3b7..3d34205096afe0 100644
--- a/llvm/test/CodeGen/X86/widen_shuffle-1.ll
+++ b/llvm/test/CodeGen/X86/widen_shuffle-1.ll
@@ -105,14 +105,13 @@ define void @shuf5(ptr %p) nounwind {
; X86-LABEL: shuf5:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movsd {{.*#+}} xmm0 = [33,33,33,33,33,33,33,33,0,0,0,0,0,0,0,0]
+; X86-NEXT: movsd {{.*#+}} xmm0 = [33,33,u,u,u,u,u,u,0,0,u,u,u,u,u,u]
; X86-NEXT: movsd %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: shuf5:
; X64: # %bb.0:
-; X64-NEXT: movabsq $2387225703656530209, %rax # imm = 0x2121212121212121
-; X64-NEXT: movq %rax, (%rdi)
+; X64-NEXT: movq $8481, (%rdi) # imm = 0x2121
; X64-NEXT: retq
%v = shufflevector <2 x i8> <i8 4, i8 33>, <2 x i8> poison, <8 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
store <8 x i8> %v, ptr %p, align 8
More information about the llvm-commits
mailing list