[llvm] 32162cf - [X86] lowerV4I64Shuffle - try harder to lower to PERMQ(BLENDD(V1,V2)) pattern
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sun May 15 06:58:17 PDT 2022
Author: Simon Pilgrim
Date: 2022-05-15T14:57:58+01:00
New Revision: 32162cf291d40b8ead01061ea68bcdbc79ba9573
URL: https://github.com/llvm/llvm-project/commit/32162cf291d40b8ead01061ea68bcdbc79ba9573
DIFF: https://github.com/llvm/llvm-project/commit/32162cf291d40b8ead01061ea68bcdbc79ba9573.diff
LOG: [X86] lowerV4I64Shuffle - try harder to lower to PERMQ(BLENDD(V1,V2)) pattern
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll
llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f80886acce22b..1806e633e989e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -17725,6 +17725,11 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
     return V;
 
+  // Try to lower to PERMQ(BLENDD(V1,V2)).
+  if (SDValue V =
+          lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
+    return V;
+
   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   // shuffle. However, if we have AVX2 and either inputs are already in place,
   // we will be able to shuffle even across lanes the other input in a single
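A minimal sketch of what the new lowering catches, taken from the shuffle_v4i64_1234 case in vector-shuffle-256-v4.ll below (no new helpers are assumed beyond the existing lowerShuffleAsBlendAndPermute): a two-input v4i64 shuffle whose elements can first be gathered per 64-bit slot with an immediate blend and then rearranged with a single lane-crossing permute.

    define <4 x i64> @shuffle_v4i64_1234(<4 x i64> %a, <4 x i64> %b) {
      ; mask <1,2,3,4>: three elements from %a, one from %b
      %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b,
                               <4 x i32> <i32 1, i32 2, i32 3, i32 4>
      ret <4 x i64> %shuffle
    }

Per the updated AVX2 check lines in the test diff, this now lowers to a blend followed by one permute (shuffle comments as printed by llc) instead of vperm2i128 + vpalignr:

    vblendps ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]   ; BLENDD(V1,V2)
    vpermpd  ymm0 = ymm0[1,2,3,0]                 ; PERMQ
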
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll
index f9452dbeb1d4b..279f72873e881 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll
@@ -121,22 +121,22 @@ define void @load_i64_stride3_vf4(<12 x i64>* %in.vec, <4 x i64>* %out.vec0, <4
;
; AVX2-LABEL: load_i64_stride3_vf4:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
-; AVX2-NEXT: vmovdqa (%rdi), %ymm1
-; AVX2-NEXT: vinserti128 $1, 64(%rdi), %ymm0, %ymm2
-; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,3,2,3]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
-; AVX2-NEXT: vpbroadcastq 80(%rdi), %ymm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT: vmovaps 32(%rdi), %ymm0
+; AVX2-NEXT: vmovaps (%rdi), %ymm1
+; AVX2-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm2
+; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[0,3,2,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX2-NEXT: vbroadcastsd 80(%rdi), %ymm1
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovaps 16(%rdi), %xmm1
; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = mem[0,1,0,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-NEXT: vmovdqa %ymm2, (%rsi)
-; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
+; AVX2-NEXT: vmovaps %ymm2, (%rsi)
+; AVX2-NEXT: vmovaps %ymm0, (%rdx)
; AVX2-NEXT: vmovaps %ymm1, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -251,26 +251,26 @@ define void @load_i64_stride3_vf8(<24 x i64>* %in.vec, <8 x i64>* %out.vec0, <8
;
; AVX2-LABEL: load_i64_stride3_vf8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
-; AVX2-NEXT: vmovdqa (%rdi), %ymm1
-; AVX2-NEXT: vmovdqa 128(%rdi), %ymm2
-; AVX2-NEXT: vmovdqa 96(%rdi), %ymm3
-; AVX2-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm4
-; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm3[0,3,2,3]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm2[4,5],ymm5[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-NEXT: vinserti128 $1, 64(%rdi), %ymm0, %ymm5
-; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,3,2,3]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5],ymm6[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
-; AVX2-NEXT: vpbroadcastq 80(%rdi), %ymm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
-; AVX2-NEXT: vpbroadcastq 176(%rdi), %ymm2
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-NEXT: vmovaps 32(%rdi), %ymm0
+; AVX2-NEXT: vmovaps (%rdi), %ymm1
+; AVX2-NEXT: vmovaps 128(%rdi), %ymm2
+; AVX2-NEXT: vmovaps 96(%rdi), %ymm3
+; AVX2-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm4
+; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm3[0,3,2,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm2[4,5],ymm5[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm5
+; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm1[0,3,2,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5],ymm6[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX2-NEXT: vbroadcastsd 80(%rdi), %ymm1
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5]
+; AVX2-NEXT: vbroadcastsd 176(%rdi), %ymm2
+; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX2-NEXT: vmovaps 16(%rdi), %xmm2
; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = mem[0,1,0,3]
@@ -279,10 +279,10 @@ define void @load_i64_stride3_vf8(<24 x i64>* %in.vec, <8 x i64>* %out.vec0, <8
; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = mem[0,1,0,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-NEXT: vmovdqa %ymm5, (%rsi)
-; AVX2-NEXT: vmovdqa %ymm4, 32(%rsi)
-; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
-; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
+; AVX2-NEXT: vmovaps %ymm5, (%rsi)
+; AVX2-NEXT: vmovaps %ymm4, 32(%rsi)
+; AVX2-NEXT: vmovaps %ymm1, 32(%rdx)
+; AVX2-NEXT: vmovaps %ymm0, (%rdx)
; AVX2-NEXT: vmovaps %ymm3, 32(%rcx)
; AVX2-NEXT: vmovaps %ymm2, (%rcx)
; AVX2-NEXT: vzeroupper
@@ -497,46 +497,46 @@ define void @load_i64_stride3_vf16(<48 x i64>* %in.vec, <16 x i64>* %out.vec0, <
;
; AVX2-LABEL: load_i64_stride3_vf16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa 224(%rdi), %ymm3
-; AVX2-NEXT: vmovdqa 192(%rdi), %ymm6
-; AVX2-NEXT: vmovdqa 320(%rdi), %ymm5
-; AVX2-NEXT: vmovdqa 288(%rdi), %ymm7
-; AVX2-NEXT: vmovdqa 32(%rdi), %ymm8
-; AVX2-NEXT: vmovdqa (%rdi), %ymm9
-; AVX2-NEXT: vmovdqa 128(%rdi), %ymm10
-; AVX2-NEXT: vmovdqa 96(%rdi), %ymm11
-; AVX2-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm11[0,3,2,3]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5],ymm1[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-NEXT: vinserti128 $1, 64(%rdi), %ymm0, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm9[0,3,2,3]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5],ymm2[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-NEXT: vinserti128 $1, 352(%rdi), %ymm0, %ymm2
-; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm7[0,3,2,3]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm15 = ymm4[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-NEXT: vinserti128 $1, 256(%rdi), %ymm0, %ymm4
-; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm6[0,3,2,3]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm3[4,5],ymm12[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm6[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
-; AVX2-NEXT: vpbroadcastq 272(%rdi), %ymm6
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm10[4,5,6,7]
-; AVX2-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
-; AVX2-NEXT: vpbroadcastq 176(%rdi), %ymm10
-; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-NEXT: vpalignr {{.*#+}} ymm8 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
-; AVX2-NEXT: vpbroadcastq 80(%rdi), %ymm9
-; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = ymm7[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
-; AVX2-NEXT: vpbroadcastq 368(%rdi), %ymm7
-; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7]
+; AVX2-NEXT: vmovaps 224(%rdi), %ymm3
+; AVX2-NEXT: vmovaps 192(%rdi), %ymm5
+; AVX2-NEXT: vmovaps 320(%rdi), %ymm6
+; AVX2-NEXT: vmovaps 288(%rdi), %ymm7
+; AVX2-NEXT: vmovaps 32(%rdi), %ymm8
+; AVX2-NEXT: vmovaps (%rdi), %ymm9
+; AVX2-NEXT: vmovaps 128(%rdi), %ymm10
+; AVX2-NEXT: vmovaps 96(%rdi), %ymm11
+; AVX2-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm11[0,3,2,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5],ymm1[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm1
+; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm9[0,3,2,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5],ymm2[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm2
+; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm7[0,3,2,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm4[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm4
+; AVX2-NEXT: vpermpd {{.*#+}} ymm12 = ymm5[0,3,2,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm3[4,5],ymm12[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5]
+; AVX2-NEXT: vbroadcastsd 272(%rdi), %ymm5
+; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5]
+; AVX2-NEXT: vbroadcastsd 176(%rdi), %ymm10
+; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5]
+; AVX2-NEXT: vbroadcastsd 80(%rdi), %ymm9
+; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[2,3,0,1,6,7,4,5]
+; AVX2-NEXT: vbroadcastsd 368(%rdi), %ymm7
+; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7]
; AVX2-NEXT: vmovaps 112(%rdi), %xmm7
; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = mem[0,1,0,3]
@@ -553,14 +553,14 @@ define void @load_i64_stride3_vf16(<48 x i64>* %in.vec, <16 x i64>* %out.vec0, <
; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = mem[0,1,0,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-NEXT: vmovdqa %ymm4, 64(%rsi)
-; AVX2-NEXT: vmovdqa %ymm15, 96(%rsi)
-; AVX2-NEXT: vmovdqa %ymm14, (%rsi)
-; AVX2-NEXT: vmovdqa %ymm13, 32(%rsi)
-; AVX2-NEXT: vmovdqa %ymm5, 96(%rdx)
-; AVX2-NEXT: vmovdqa %ymm8, (%rdx)
-; AVX2-NEXT: vmovdqa %ymm6, 32(%rdx)
-; AVX2-NEXT: vmovdqa %ymm3, 64(%rdx)
+; AVX2-NEXT: vmovaps %ymm4, 64(%rsi)
+; AVX2-NEXT: vmovaps %ymm15, 96(%rsi)
+; AVX2-NEXT: vmovaps %ymm14, (%rsi)
+; AVX2-NEXT: vmovaps %ymm13, 32(%rsi)
+; AVX2-NEXT: vmovaps %ymm6, 96(%rdx)
+; AVX2-NEXT: vmovaps %ymm8, (%rdx)
+; AVX2-NEXT: vmovaps %ymm5, 32(%rdx)
+; AVX2-NEXT: vmovaps %ymm3, 64(%rdx)
; AVX2-NEXT: vmovaps %ymm2, 64(%rcx)
; AVX2-NEXT: vmovaps %ymm1, 96(%rcx)
; AVX2-NEXT: vmovaps %ymm0, (%rcx)
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
index 5bbc6ae159088..80ba405026715 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -1267,14 +1267,14 @@ define <4 x i64> @shuffle_v4i64_3254(<4 x i64> %a, <4 x i64> %b) {
;
; AVX2-LABEL: shuffle_v4i64_3254:
; AVX2: # %bb.0:
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0]
; AVX2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v4i64_3254:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-ALL-LABEL: shuffle_v4i64_3254:
@@ -1285,8 +1285,8 @@ define <4 x i64> @shuffle_v4i64_3254(<4 x i64> %a, <4 x i64> %b) {
;
; AVX512VL-FAST-PERLANE-LABEL: shuffle_v4i64_3254:
; AVX512VL-FAST-PERLANE: # %bb.0:
-; AVX512VL-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; AVX512VL-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX512VL-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0]
; AVX512VL-FAST-PERLANE-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 2, i32 5, i32 4>
ret <4 x i64> %shuffle
@@ -1379,18 +1379,36 @@ define <4 x i64> @shuffle_v4i64_0415(<4 x i64> %a, <4 x i64> %b) {
}
define <4 x i64> @shuffle_v4i64_2741(<4 x i64> %a, <4 x i64> %b) {
-; AVX1OR2-LABEL: shuffle_v4i64_2741:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[0,1]
-; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
-; AVX1OR2-NEXT: retq
+; AVX1-LABEL: shuffle_v4i64_2741:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[0,1]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
+; AVX1-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v4i64_2741:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [2,7,4,1]
-; AVX512VL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v4i64_2741:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v4i64_2741:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-ALL-LABEL: shuffle_v4i64_2741:
+; AVX512VL-FAST-ALL: # %bb.0:
+; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [2,7,4,1]
+; AVX512VL-FAST-ALL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-ALL-NEXT: retq
+;
+; AVX512VL-FAST-PERLANE-LABEL: shuffle_v4i64_2741:
+; AVX512VL-FAST-PERLANE: # %bb.0:
+; AVX512VL-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512VL-FAST-PERLANE-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 7, i32 4, i32 1>
ret <4 x i64> %shuffle
}
@@ -1767,8 +1785,8 @@ define <4 x i64> @shuffle_v4i64_1234(<4 x i64> %a, <4 x i64> %b) {
;
; AVX2-LABEL: shuffle_v4i64_1234:
; AVX2: # %bb.0:
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1]
-; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,3,0]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_1234: