[llvm] c757780 - [X86] lowerShuffleAsDecomposedShuffleMerge - try to match unpck(permute(x),permute(y)) for v4i32/v2i64 shuffles
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 25 08:25:13 PST 2022
Author: Simon Pilgrim
Date: 2022-11-25T16:24:56Z
New Revision: c757780c62abaf8826b1af200f47b20cdb3d7984
URL: https://github.com/llvm/llvm-project/commit/c757780c62abaf8826b1af200f47b20cdb3d7984
DIFF: https://github.com/llvm/llvm-project/commit/c757780c62abaf8826b1af200f47b20cdb3d7984.diff
LOG: [X86] lowerShuffleAsDecomposedShuffleMerge - try to match unpck(permute(x),permute(y)) for v4i32/v2i64 shuffles
We're using lowerShuffleAsPermuteAndUnpack, which can probably be improved to handle 256/512-bit types pretty easily.
A first step towards addressing the poor vector-shuffle-sse4a.ll pre-SSSE3 codegen mentioned on D127115.
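
To make the change concrete, here is a rough before/after sketch based on the shuffle_v4i32_0451 test updated below (the shufflevector mask is inferred from the test's naming convention; the IR body itself is not shown in the diff hunk):

  define <4 x i32> @shuffle_v4i32_0451(<4 x i32> %a, <4 x i32> %b) {
    %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1>
    ret <4 x i32> %shuffle
  }

Before this patch, the SSE41/AVX1/AVX2 paths lowered this as two permutes plus a blend (pshufd+pshufd+pblendw, or vpermilps+vpermilps+vblendps). With lowerShuffleAsPermuteAndUnpack now tried from lowerShuffleAsDecomposedShuffleMerge for 32/64-bit elements, they produce the same unpack+permute form the SSE2 path already used, as reflected in the updated CHECK lines:

  punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
  pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
  retq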
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/oddshuffles.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1ce6207c8979..415fa0a0198e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -13181,14 +13181,12 @@ static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- assert(!VT.isFloatingPoint() &&
- "This routine only supports integer vectors.");
- assert(VT.is128BitVector() && "This routine only works on 128-bit vectors.");
- assert(!V2.isUndef() &&
- "This routine should only be used when blending two inputs.");
+ int Size = Mask.size();
assert(Mask.size() >= 2 && "Single element masks are invalid.");
- int Size = Mask.size();
+ // This routine only supports 128-bit integer dual input vectors.
+ if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
+ return SDValue();
int NumLoInputs =
count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
@@ -13466,6 +13464,10 @@ static SDValue lowerShuffleAsDecomposedShuffleMerge(
if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
DAG))
return BlendPerm;
+ if (VT.getScalarSizeInBits() >= 32)
+ if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
+ DL, VT, V1, V2, Mask, Subtarget, DAG))
+ return PermUnpack;
}
// If the final mask is an alternating blend of vXi8/vXi16, convert to an
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index 4e7b011d11f5..c28f30f2a834 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -121,37 +121,19 @@ define void @v5i32(<4 x i32> %a, <4 x i32> %b, ptr %p) nounwind {
;
; SSE42-LABEL: v5i32:
; SSE42: # %bb.0:
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2]
-; SSE42-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
-; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,2,2,3]
; SSE42-NEXT: pextrd $3, %xmm0, 16(%rdi)
-; SSE42-NEXT: movdqa %xmm2, (%rdi)
+; SSE42-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE42-NEXT: movdqa %xmm0, (%rdi)
; SSE42-NEXT: retq
;
-; AVX1-LABEL: v5i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,2]
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; AVX1-NEXT: vpextrd $3, %xmm0, 16(%rdi)
-; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: v5i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,2]
-; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; AVX2-NEXT: vpextrd $3, %xmm0, 16(%rdi)
-; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
-; AVX2-NEXT: retq
-;
-; XOP-LABEL: v5i32:
-; XOP: # %bb.0:
-; XOP-NEXT: vpperm {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7],xmm0[4,5,6,7],xmm1[8,9,10,11]
-; XOP-NEXT: vpextrd $3, %xmm0, 16(%rdi)
-; XOP-NEXT: vmovdqa %xmm1, (%rdi)
-; XOP-NEXT: retq
+; AVX-LABEL: v5i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,2,2,3]
+; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT: vextractps $3, %xmm0, 16(%rdi)
+; AVX-NEXT: vmovaps %xmm1, (%rdi)
+; AVX-NEXT: retq
%r = shufflevector <4 x i32> %a, <4 x i32> %b, <5 x i32> <i32 0, i32 5, i32 1, i32 6, i32 3>
store <5 x i32> %r, ptr %p
ret void
@@ -509,20 +491,20 @@ define void @v12i32(<8 x i32> %a, <8 x i32> %b, ptr %p) nounwind {
;
; SSE42-LABEL: v12i32:
; SSE42: # %bb.0:
-; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
-; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,1,0,1]
-; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
-; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5],xmm4[6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,2,2]
-; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm0[4,5],xmm4[6,7]
-; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3],xmm4[4,5,6,7]
+; SSE42-NEXT: movdqa %xmm0, %xmm3
+; SSE42-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,2]
+; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,1,0,1]
+; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm3[0,1,2,3],xmm4[4,5],xmm3[6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,2]
+; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4,5],xmm3[6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5,6,7]
; SSE42-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3]
; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
; SSE42-NEXT: movdqa %xmm1, 32(%rdi)
-; SSE42-NEXT: movdqa %xmm4, 16(%rdi)
-; SSE42-NEXT: movdqa %xmm3, (%rdi)
+; SSE42-NEXT: movdqa %xmm3, 16(%rdi)
+; SSE42-NEXT: movdqa %xmm4, (%rdi)
; SSE42-NEXT: retq
;
; AVX1-LABEL: v12i32:
@@ -1691,37 +1673,37 @@ define void @interleave_24i32_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; SSE42-LABEL: interleave_24i32_in:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqu (%rsi), %xmm0
-; SSE42-NEXT: movdqu 16(%rsi), %xmm4
-; SSE42-NEXT: movdqu (%rdx), %xmm2
-; SSE42-NEXT: movdqu 16(%rdx), %xmm5
-; SSE42-NEXT: movdqu (%rcx), %xmm3
+; SSE42-NEXT: movdqu 16(%rsi), %xmm2
+; SSE42-NEXT: movdqu (%rdx), %xmm3
+; SSE42-NEXT: movdqu 16(%rdx), %xmm4
+; SSE42-NEXT: movdqu (%rcx), %xmm5
; SSE42-NEXT: movdqu 16(%rcx), %xmm6
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1]
-; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,1,0,1]
-; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,3],xmm7[4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,0,1]
+; SSE42-NEXT: movdqa %xmm0, %xmm1
+; SSE42-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,1,2,2]
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,0,1]
; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm1[4,5],xmm7[6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,2,2]
-; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm4[4,5],xmm7[6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,2,2]
+; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4,5],xmm7[6,7]
; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,0,1,1]
-; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,1,0,1]
-; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm9[0,1],xmm8[2,3],xmm9[4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,1,0,1]
-; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4,5],xmm9[6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,2,2]
-; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm0[4,5],xmm9[6,7]
-; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm9[0,1],xmm3[2,3],xmm9[4,5,6,7]
-; SSE42-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm5[3,3]
-; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,3,2,3]
-; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3,4,5],xmm5[6,7]
-; SSE42-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3]
-; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
+; SSE42-NEXT: movdqa %xmm2, %xmm8
+; SSE42-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,2]
+; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,1,0,1]
+; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm8[0,1,2,3],xmm9[4,5],xmm8[6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,2,2]
+; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm0[4,5],xmm8[6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,3],xmm8[4,5,6,7]
+; SSE42-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3]
+; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,2,3]
+; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3,4,5],xmm4[6,7]
+; SSE42-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3]
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3]
; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3,4,5],xmm2[6,7]
; SSE42-NEXT: movdqu %xmm2, 32(%rdi)
-; SSE42-NEXT: movdqu %xmm5, 80(%rdi)
-; SSE42-NEXT: movdqu %xmm9, 16(%rdi)
-; SSE42-NEXT: movdqu %xmm8, 48(%rdi)
+; SSE42-NEXT: movdqu %xmm4, 80(%rdi)
+; SSE42-NEXT: movdqu %xmm8, 16(%rdi)
+; SSE42-NEXT: movdqu %xmm9, 48(%rdi)
; SSE42-NEXT: movdqu %xmm7, 64(%rdi)
; SSE42-NEXT: movdqu %xmm1, (%rdi)
; SSE42-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
index 56d8e32ad449..de449bac13c7 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
@@ -493,10 +493,10 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm11[2],xmm9[2],xmm11[3],xmm9[3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm5
; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm7
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[0,1,2,2]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[1,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm8
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[1,1,2,3]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2],xmm10[3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[1,2,2,3]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,2,1]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5],ymm10[6,7]
; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm10
@@ -574,10 +574,10 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-ALL-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm6
; AVX2-FAST-ALL-NEXT: vmovdqa (%rcx), %xmm8
-; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[0,1,2,2]
+; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[1,2,2,3]
; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %xmm9
-; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm13 = xmm9[1,1,2,3]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2],xmm12[3]
+; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm13 = xmm9[1,2,2,3]
+; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,2,1]
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5],ymm12[6,7]
; AVX2-FAST-ALL-NEXT: vmovdqa (%r8), %xmm12
@@ -653,10 +653,10 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm11[2],xmm9[2],xmm11[3],xmm9[3]
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm5
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm7
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[1,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm8
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[1,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2],xmm10[3]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[1,2,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5],ymm10[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm10
@@ -1116,13 +1116,13 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm3
; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,2]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm7
; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm4
; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7]
; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm10
@@ -1135,9 +1135,9 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,1,2,2]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,3]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,2,2,3]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7]
; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm10[0],zero,xmm10[1],zero
@@ -1296,12 +1296,12 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-ALL-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rcx), %xmm3
; AVX2-FAST-ALL-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,2]
+; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3]
; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %xmm6
; AVX2-FAST-ALL-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdx), %xmm8
-; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[1,1,2,3]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
+; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[1,2,2,3]
+; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7]
; AVX2-FAST-ALL-NEXT: vmovdqa (%r8), %xmm3
@@ -1314,9 +1314,9 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-ALL-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-FAST-ALL-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,1,2,2]
-; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[1,1,2,3]
-; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3]
+; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[1,2,2,3]
+; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[1,2,2,3]
+; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1]
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5],ymm4[6,7]
; AVX2-FAST-ALL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero
@@ -1470,13 +1470,13 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm7
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm4
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm10
@@ -1489,9 +1489,9 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,1,2,2]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,2,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,2,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7]
; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm10[0],zero,xmm10[1],zero
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
index 4730f5ea724b..3d081f107fca 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -531,44 +531,17 @@ define <4 x i32> @shuffle_v4i32_0145(<4 x i32> %a, <4 x i32> %b) {
ret <4 x i32> %shuffle
}
define <4 x i32> @shuffle_v4i32_0451(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: shuffle_v4i32_0451:
-; SSE2: # %bb.0:
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
-; SSE2-NEXT: retq
-;
-; SSE3-LABEL: shuffle_v4i32_0451:
-; SSE3: # %bb.0:
-; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
-; SSE3-NEXT: retq
-;
-; SSSE3-LABEL: shuffle_v4i32_0451:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: shuffle_v4i32_0451:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: shuffle_v4i32_0451:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
-; AVX1-NEXT: retq
+; SSE-LABEL: shuffle_v4i32_0451:
+; SSE: # %bb.0:
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; SSE-NEXT: retq
;
-; AVX2-LABEL: shuffle_v4i32_0451:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
-; AVX2-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v4i32_0451:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i32_0451:
; AVX512VL: # %bb.0:
@@ -593,44 +566,17 @@ define <4 x i32> @shuffle_v4i32_4501(<4 x i32> %a, <4 x i32> %b) {
ret <4 x i32> %shuffle
}
define <4 x i32> @shuffle_v4i32_4015(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: shuffle_v4i32_4015:
-; SSE2: # %bb.0:
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
-; SSE2-NEXT: retq
-;
-; SSE3-LABEL: shuffle_v4i32_4015:
-; SSE3: # %bb.0:
-; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
-; SSE3-NEXT: retq
-;
-; SSSE3-LABEL: shuffle_v4i32_4015:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: shuffle_v4i32_4015:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: shuffle_v4i32_4015:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
-; AVX1-NEXT: retq
+; SSE-LABEL: shuffle_v4i32_4015:
+; SSE: # %bb.0:
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; SSE-NEXT: retq
;
-; AVX2-LABEL: shuffle_v4i32_4015:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
-; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
-; AVX2-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v4i32_4015:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i32_4015:
; AVX512VL: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
index 8c196a377da6..21cd0ca1380e 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -3802,8 +3802,7 @@ define <8 x i32> @lowhalf_v8i32(<8 x i32> %x, <8 x i32> %y) {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,2,2,2]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: lowhalf_v8i32: