[llvm] r359400 - [X86][AVX] Combine non-lane crossing binary shuffles using X86ISD::VPERMV3
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sun Apr 28 07:31:01 PDT 2019

Author: rksimon
Date: Sun Apr 28 07:31:01 2019
New Revision: 359400
URL: http://llvm.org/viewvc/llvm-project?rev=359400&view=rev
Log:
[X86][AVX] Combine non-lane crossing binary shuffles using X86ISD::VPERMV3
Some of the combines might be further improved if we lower more shuffles with X86ISD::VPERMV3 directly, instead of waiting to combine the results.
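
For illustration (a minimal sketch mirroring the shuffle_v16i16_to_v8i16 test updated below; the function name is hypothetical), on an AVX512BW+VL target an even-element extraction such as

  define <8 x i16> @even_elts(<16 x i16>* %p) {
    %v = load <16 x i16>, <16 x i16>* %p
    %s = shufflevector <16 x i16> %v, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
    ret <8 x i16> %s
  }

previously lowered to two vpshufb shuffles plus a vpunpcklqdq; treating the two 128-bit halves of the load as a dual shuffle input lets it become a single vpermi2w with the constant index vector [0,2,4,6,8,10,12,14].
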
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
llvm/trunk/test/CodeGen/X86/insertelement-ones.ll
llvm/trunk/test/CodeGen/X86/shuffle-strided-with-offset-256.ll
llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll
llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-256.ll
llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll
llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-512.ll
llvm/trunk/test/CodeGen/X86/vec_smulo.ll
llvm/trunk/test/CodeGen/X86/vec_umulo.ll
llvm/trunk/test/CodeGen/X86/vector-fshl-256.ll
llvm/trunk/test/CodeGen/X86/vector-fshr-256.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v16.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v32.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v64.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll
llvm/trunk/test/CodeGen/X86/vector-trunc-widen.ll
llvm/trunk/test/CodeGen/X86/vector-trunc.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=359400&r1=359399&r2=359400&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Sun Apr 28 07:31:01 2019
@@ -31874,6 +31874,28 @@ static SDValue combineX86ShuffleChain(Ar
}
}
+ // If we have a dual input shuffle then lower to VPERMV3.
+ if (!UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
+ ((Subtarget.hasAVX512() &&
+ (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
+ MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
+ (Subtarget.hasVLX() &&
+ (MaskVT == MVT::v2f64 || MaskVT == MVT::v2i64 || MaskVT == MVT::v4f64 ||
+ MaskVT == MVT::v4i64 || MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 ||
+ MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
+ (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
+ (Subtarget.hasBWI() && Subtarget.hasVLX() &&
+ (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16)) ||
+ (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
+ (Subtarget.hasVBMI() && Subtarget.hasVLX() &&
+ (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8)))) {
+ SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
+ V1 = DAG.getBitcast(MaskVT, V1);
+ V2 = DAG.getBitcast(MaskVT, V2);
+ Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
+ return DAG.getBitcast(RootVT, Res);
+ }
+
// Failed to find any combines.
return SDValue();
}
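
For reference, VPERMV3 selects each result lane from the concatenation of its two inputs (a sketch of the semantics, not normative pseudocode):

  Res[i] = (Idx[i] < N) ? V1[Idx[i]] : V2[Idx[i] - N]   (N = elements per input)

so any index vector over 0..2N-1 can be encoded directly once the subtarget supports a variable shuffle at the mask type, as guarded above: AVX512 for the 512-bit i32/f32/i64/f64 types, VLX for their 128/256-bit variants, BWI for i16 and VBMI for i8 (each also requiring VLX below 512 bits).
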
Modified: llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll?rev=359400&r1=359399&r2=359400&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll Sun Apr 28 07:31:01 2019
@@ -150,11 +150,9 @@ define <8 x i16> @test_masked_z_16xi16_t
define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp) {
; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %xmm0
-; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1
-; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,10,11,6,7,8,9,10,11,0,1,2,3]
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
-; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7]
+; CHECK-NEXT: vmovdqa (%rdi), %xmm1
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [0,7,13,3,5,13,3,9]
+; CHECK-NEXT: vpermi2w 16(%rdi), %xmm1, %xmm0
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
%res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
@@ -164,12 +162,10 @@ define <8 x i16> @test_masked_16xi16_to_
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm2
-; CHECK-NEXT: vmovdqa 16(%rdi), %xmm3
-; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,10,11,10,11,6,7,8,9,10,11,0,1,2,3]
-; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
-; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6],xmm3[7]
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [0,7,13,3,5,13,3,9]
+; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm3
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
@@ -181,13 +177,11 @@ define <8 x i16> @test_masked_16xi16_to_
define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %xmm1
-; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
-; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,10,11,10,11,6,7,8,9,10,11,0,1,2,3]
-; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
-; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6],xmm2[7]
+; CHECK-NEXT: vmovdqa (%rdi), %xmm2
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [0,7,13,3,5,13,3,9]
; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
-; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
@@ -200,12 +194,10 @@ define <8 x i16> @test_masked_16xi16_to_
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm2
-; CHECK-NEXT: vmovdqa 16(%rdi), %xmm3
-; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,14,15,8,9,14,15,0,1,2,3,0,1,12,13]
-; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,7,14,15,4,5,14,15,2,3,10,11,0,1,2,3]
-; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3,4,5],xmm3[6,7]
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [3,15,12,7,1,5,8,14]
+; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm3
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 3, i32 15, i32 12, i32 7, i32 1, i32 5, i32 8, i32 14>
@@ -217,13 +209,11 @@ define <8 x i16> @test_masked_16xi16_to_
define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask1(<16 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %xmm1
-; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
-; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,14,15,8,9,14,15,0,1,2,3,0,1,12,13]
-; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,14,15,2,3,10,11,0,1,2,3]
-; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3,4,5],xmm2[6,7]
+; CHECK-NEXT: vmovdqa (%rdi), %xmm2
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [3,15,12,7,1,5,8,14]
; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
-; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 3, i32 15, i32 12, i32 7, i32 1, i32 5, i32 8, i32 14>
@@ -237,10 +227,10 @@ define <8 x i16> @test_masked_16xi16_to_
; CHECK: # %bb.0:
; CHECK-NEXT: vpbroadcastw 2(%rdi), %xmm2
; CHECK-NEXT: vmovdqa 16(%rdi), %xmm3
-; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,0,1,6,7,0,1,10,11,0,1,14,15,2,3]
-; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3,4,5,6,7]
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm4 = [8,0,3,0,5,0,7,1]
+; CHECK-NEXT: vpermi2w %xmm2, %xmm3, %xmm4
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqu16 %xmm4, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9>
@@ -252,12 +242,12 @@ define <8 x i16> @test_masked_16xi16_to_
define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpbroadcastw 2(%rdi), %xmm1
-; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
-; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,0,1,6,7,0,1,10,11,0,1,14,15,2,3]
-; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
+; CHECK-NEXT: vpbroadcastw 2(%rdi), %xmm2
+; CHECK-NEXT: vmovdqa 16(%rdi), %xmm3
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [8,0,3,0,5,0,7,1]
; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
-; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpermi2w %xmm2, %xmm3, %xmm1 {%k1} {z}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9>
@@ -269,9 +259,9 @@ define <8 x i16> @test_masked_z_16xi16_t
define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp) {
; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %xmm0
-; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,14,15,2,3,12,13,2,3,8,9,6,7,4,5]
+; CHECK-NEXT: vmovdqa (%rdi), %xmm1
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [9,7,9,6,9,4,3,2]
+; CHECK-NEXT: vpermi2w 16(%rdi), %xmm1, %xmm0
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
%res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
@@ -281,10 +271,10 @@ define <8 x i16> @test_masked_16xi16_to_
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm2
-; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = mem[0],xmm2[1,2,3]
-; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,14,15,2,3,12,13,2,3,8,9,6,7,4,5]
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [9,7,9,6,9,4,3,2]
+; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm3
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
@@ -296,11 +286,11 @@ define <8 x i16> @test_masked_16xi16_to_
define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %xmm1
-; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = mem[0],xmm1[1,2,3]
-; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,14,15,2,3,12,13,2,3,8,9,6,7,4,5]
+; CHECK-NEXT: vmovdqa (%rdi), %xmm2
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [9,7,9,6,9,4,3,2]
; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
-; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
@@ -1102,9 +1092,10 @@ define <4 x i32> @test_masked_8xi32_to_4
; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm2
-; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],mem[1],xmm2[2,3]
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [5,0,0,3]
+; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm3
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = xmm2[1,0,0,3]
+; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 0, i32 0, i32 3>
@@ -1116,10 +1107,11 @@ define <4 x i32> @test_masked_8xi32_to_4
define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask1(<8 x i32>* %vp, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %xmm1
-; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],mem[1],xmm1[2,3]
+; CHECK-NEXT: vmovdqa (%rdi), %xmm2
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [5,0,0,3]
; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm1[1,0,0,3]
+; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm1 {%k1} {z}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 0, i32 0, i32 3>
@@ -1132,9 +1124,10 @@ define <4 x i32> @test_masked_8xi32_to_4
; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
-; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [0,7,7,0]
+; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm3
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = xmm2[0,3,3,0]
+; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 3, i32 3, i32 4>
@@ -1146,10 +1139,11 @@ define <4 x i32> @test_masked_8xi32_to_4
define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask2(<8 x i32>* %vp, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1
-; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [0,7,7,0]
; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm1[0,3,3,0]
+; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm1 {%k1} {z}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 3, i32 3, i32 4>
@@ -1161,9 +1155,9 @@ define <4 x i32> @test_masked_z_8xi32_to
define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp) {
; CHECK-LABEL: test_8xi32_to_4xi32_perm_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,1,2,3]
-; CHECK-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
-; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
+; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm1
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [5,1,2,7]
+; CHECK-NEXT: vpermi2d 16(%rdi), %xmm1, %xmm0
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
%res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
@@ -1172,11 +1166,11 @@ define <4 x i32> @test_8xi32_to_4xi32_pe
define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = mem[1,1,2,3]
-; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm3
-; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
+; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm2
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [5,1,2,7]
+; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm3
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
@@ -1188,11 +1182,11 @@ define <4 x i32> @test_masked_8xi32_to_4
define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = mem[1,1,2,3]
; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm2
-; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3]
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [5,1,2,7]
; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm1 {%k1} {z}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
@@ -1811,12 +1805,12 @@ define <4 x i32> @test_masked_z_16xi32_t
define <4 x i32> @test_16xi32_to_4xi32_perm_mask9(<16 x i32> %vec) {
; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask9:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4,1,0,2]
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vpermps %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
-; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [4,1,0,2]
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vpermd %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,3]
+; CHECK-NEXT: vpermi2d %xmm2, %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 12, i32 9, i32 4, i32 10>
@@ -2785,10 +2779,9 @@ define <4 x float> @test_masked_z_8xfloa
define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp) {
; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %xmm0
; CHECK-NEXT: vmovaps 16(%rdi), %xmm1
-; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,0]
-; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,1]
+; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [2,6,0,1]
+; CHECK-NEXT: vpermi2ps (%rdi), %xmm1, %xmm0
; CHECK-NEXT: retq
%vec = load <8 x float>, <8 x float>* %vp
%res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
@@ -2797,12 +2790,12 @@ define <4 x float> @test_8xfloat_to_4xfl
define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %xmm2
-; CHECK-NEXT: vmovaps 16(%rdi), %xmm3
-; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[2,0]
-; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vcmpeqps %xmm4, %xmm1, %k1
-; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm2[2,0],xmm3[0,1]
+; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
+; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [2,6,0,1]
+; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm3
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x float>, <8 x float>* %vp
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
@@ -2814,12 +2807,12 @@ define <4 x float> @test_masked_8xfloat_
define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %xmm1
; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
-; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,0]
+; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [2,6,0,1]
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
-; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm1[2,0],xmm2[0,1]
+; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z}
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: retq
%vec = load <8 x float>, <8 x float>* %vp
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
@@ -2832,10 +2825,11 @@ define <4 x float> @test_masked_8xfloat_
; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
-; CHECK-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],mem[3]
-; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} = xmm2[2,3,3,2]
+; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [2,7,7,2]
+; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm3
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x float>, <8 x float>* %vp
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 3, i32 3, i32 6>
@@ -2847,11 +2841,12 @@ define <4 x float> @test_masked_8xfloat_
define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1(<8 x float>* %vp, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps 16(%rdi), %xmm1
-; CHECK-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3]
-; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} {z} = xmm1[2,3,3,2]
+; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
+; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [2,7,7,2]
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
+; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z}
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: retq
%vec = load <8 x float>, <8 x float>* %vp
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 3, i32 3, i32 6>
@@ -2864,11 +2859,11 @@ define <4 x float> @test_masked_8xfloat_
; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %xmm2
-; CHECK-NEXT: vmovaps 16(%rdi), %xmm3
-; CHECK-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,0]
-; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vcmpeqps %xmm4, %xmm1, %k1
-; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm2[3,1],xmm3[2,0]
+; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [3,1,3,7]
+; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x float>, <8 x float>* %vp
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 7>
@@ -2880,12 +2875,12 @@ define <4 x float> @test_masked_8xfloat_
define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2(<8 x float>* %vp, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %xmm1
-; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
-; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,0],xmm1[3,0]
+; CHECK-NEXT: vmovaps (%rdi), %xmm2
+; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,1,3,7]
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
-; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm1[3,1],xmm2[2,0]
+; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z}
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: retq
%vec = load <8 x float>, <8 x float>* %vp
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 7>
@@ -2897,10 +2892,9 @@ define <4 x float> @test_masked_z_8xfloa
define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp) {
; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %xmm0
-; CHECK-NEXT: vmovaps 16(%rdi), %xmm1
-; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[3,0]
-; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2]
+; CHECK-NEXT: vmovaps (%rdi), %xmm1
+; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [1,3,5,3]
+; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm1, %xmm0
; CHECK-NEXT: retq
%vec = load <8 x float>, <8 x float>* %vp
%res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
@@ -2910,11 +2904,11 @@ define <4 x float> @test_masked_8xfloat_
; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %xmm2
-; CHECK-NEXT: vmovaps 16(%rdi), %xmm3
-; CHECK-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,0]
-; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vcmpeqps %xmm4, %xmm1, %k1
-; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm2[1,3],xmm3[0,2]
+; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [1,3,5,3]
+; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x float>, <8 x float>* %vp
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
@@ -2926,12 +2920,12 @@ define <4 x float> @test_masked_8xfloat_
define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %xmm1
-; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
-; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[3,0]
+; CHECK-NEXT: vmovaps (%rdi), %xmm2
+; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [1,3,5,3]
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
-; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3],xmm2[0,2]
+; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z}
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: retq
%vec = load <8 x float>, <8 x float>* %vp
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
@@ -3136,11 +3130,11 @@ define <4 x float> @test_masked_16xfloat
; CHECK: # %bb.0:
; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm3
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,2]
-; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2],xmm0[3]
-; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
-; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vmovaps {{.*#+}} xmm4 = [0,6,2,6]
+; CHECK-NEXT: vpermi2ps %xmm0, %xmm3, %xmm4
+; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vcmpeqps %xmm0, %xmm2, %k1
+; CHECK-NEXT: vblendmps %xmm4, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6>
@@ -3153,12 +3147,11 @@ define <4 x float> @test_masked_z_16xflo
; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm2
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,2]
-; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
-; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
-; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [0,6,2,6]
+; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %xmm4, %xmm1, %k1
+; CHECK-NEXT: vpermi2ps %xmm3, %xmm2, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6>
@@ -3203,9 +3196,9 @@ define <4 x float> @test_16xfloat_to_4xf
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm1
-; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm0
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
-; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm2
+; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [2,5,3,7]
+; CHECK-NEXT: vpermi2ps %xmm1, %xmm2, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6>
@@ -3217,11 +3210,11 @@ define <4 x float> @test_masked_16xfloat
; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,2,4,6,4,6,6,7]
; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm3
; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm0
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
-; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3]
-; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
-; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vmovaps {{.*#+}} xmm4 = [2,5,3,7]
+; CHECK-NEXT: vpermi2ps %xmm3, %xmm0, %xmm4
+; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vcmpeqps %xmm0, %xmm2, %k1
+; CHECK-NEXT: vblendmps %xmm4, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6>
@@ -3235,12 +3228,11 @@ define <4 x float> @test_masked_z_16xflo
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm2
-; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm0
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
-; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
-; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm3
+; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [2,5,3,7]
+; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %xmm4, %xmm1, %k1
+; CHECK-NEXT: vpermi2ps %xmm2, %xmm3, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6>
@@ -3409,9 +3401,9 @@ define <8 x float> @test_masked_z_16xflo
define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp) {
; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,3,3]
; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = mem[3,1,2,3]
-; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
+; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [0,6,7,3]
+; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <16 x float>, <16 x float>* %vp
@@ -3421,12 +3413,12 @@ define <4 x float> @test_16xfloat_to_4xf
define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} xmm2 = mem[0,2,3,3]
-; CHECK-NEXT: vpermpd {{.*#+}} ymm3 = mem[3,1,2,3]
-; CHECK-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3]
-; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
-; CHECK-NEXT: vmovaps %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: vpermpd {{.*#+}} ymm2 = mem[3,1,2,3]
+; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [0,6,7,3]
+; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <16 x float>, <16 x float>* %vp
@@ -3439,12 +3431,12 @@ define <4 x float> @test_masked_16xfloat
define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = mem[0,2,3,3]
; CHECK-NEXT: vpermpd {{.*#+}} ymm2 = mem[3,1,2,3]
-; CHECK-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3]
-; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1
-; CHECK-NEXT: vmovaps %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [0,6,7,3]
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
+; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z}
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <16 x float>, <16 x float>* %vp
Modified: llvm/trunk/test/CodeGen/X86/insertelement-ones.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/insertelement-ones.ll?rev=359400&r1=359399&r2=359400&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/insertelement-ones.ll (original)
+++ llvm/trunk/test/CodeGen/X86/insertelement-ones.ll Sun Apr 28 07:31:01 2019
@@ -296,13 +296,20 @@ define <16 x i16> @insert_v16i16_x12345x
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: retq
;
-; AVX512-LABEL: insert_v16i16_x12345x789ABCDEx:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10,11,12,13],ymm1[14],ymm0[15]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT: retq
+; AVX512F-LABEL: insert_v16i16_x12345x789ABCDEx:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10,11,12,13],ymm1[14],ymm0[15]
+; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: insert_v16i16_x12345x789ABCDEx:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [16,1,2,3,4,5,22,7,8,9,10,11,12,13,14,31]
+; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
+; AVX512VL-NEXT: retq
%1 = insertelement <16 x i16> %a, i16 -1, i32 0
%2 = insertelement <16 x i16> %1, i16 -1, i32 6
%3 = insertelement <16 x i16> %2, i16 -1, i32 15
Modified: llvm/trunk/test/CodeGen/X86/shuffle-strided-with-offset-256.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/shuffle-strided-with-offset-256.ll?rev=359400&r1=359399&r2=359400&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/shuffle-strided-with-offset-256.ll (original)
+++ llvm/trunk/test/CodeGen/X86/shuffle-strided-with-offset-256.ll Sun Apr 28 07:31:01 2019
@@ -47,16 +47,46 @@ define void @shuffle_v16i16_to_v8i16_1(<
; AVX-NEXT: vmovdqa %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512-LABEL: shuffle_v16i16_to_v8i16_1:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
-; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v16i16_to_v8i16_1:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i16_to_v8i16_1:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v16i16_to_v8i16_1:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16_1:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,3,5,7,9,11,13,15]
+; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
+; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi)
+; AVX512BWVL-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %L
%strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
store <8 x i16> %strided.vec, <8 x i16>* %S
@@ -192,12 +222,9 @@ define void @shuffle_v32i8_to_v8i8_2(<32
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,3,5,7,9,11,13,15]
+; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
+; AVX512BWVL-NEXT: vpmovwb %xmm1, (%rsi)
; AVX512BWVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
@@ -333,12 +360,9 @@ define void @shuffle_v16i16_to_v4i16_1(<
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15]
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,5,5,9,9,13,13]
+; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
+; AVX512BWVL-NEXT: vpmovdw %xmm1, (%rsi)
; AVX512BWVL-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %L
%strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
@@ -485,12 +509,9 @@ define void @shuffle_v16i16_to_v4i16_3(<
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15]
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [3,7,7,3,11,15,15,11]
+; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
+; AVX512BWVL-NEXT: vpmovdw %xmm1, (%rsi)
; AVX512BWVL-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %L
%strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
@@ -607,12 +628,9 @@ define void @shuffle_v32i8_to_v4i8_2(<32
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15]
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,5,5,9,9,13,13]
+; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
+; AVX512BWVL-NEXT: vpmovdb %xmm1, (%rsi)
; AVX512BWVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 2, i32 10, i32 18, i32 26>
@@ -843,12 +861,9 @@ define void @shuffle_v32i8_to_v4i8_6(<32
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15]
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [3,7,7,3,11,15,15,11]
+; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
+; AVX512BWVL-NEXT: vpmovdb %xmm1, (%rsi)
; AVX512BWVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 6, i32 14, i32 22, i32 30>
Modified: llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll?rev=359400&r1=359399&r2=359400&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll (original)
+++ llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll Sun Apr 28 07:31:01 2019
@@ -24,16 +24,57 @@ define void @shuffle_v32i8_to_v16i8(<32
; AVX-NEXT: vmovdqa %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512-LABEL: shuffle_v32i8_to_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v32i8_to_v16i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v32i8_to_v16i8:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v32i8_to_v16i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v32i8_to_v16i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BWVL-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
+;
+; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v16i8:
+; AVX512VBMIVL: # %bb.0:
+; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
+; AVX512VBMIVL-NEXT: vpermi2b 16(%rdi), %xmm0, %xmm1
+; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi)
+; AVX512VBMIVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
store <16 x i8> %strided.vec, <16 x i8>* %S
@@ -115,16 +156,54 @@ define void @shuffle_v16i16_to_v8i16(<16
; AVX-NEXT: vmovdqa %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512-LABEL: shuffle_v16i16_to_v8i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v16i16_to_v8i16:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i16_to_v8i16:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v16i16_to_v8i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14]
+; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
+; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi)
+; AVX512BWVL-NEXT: retq
+;
+; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v8i16:
+; AVX512VBMIVL: # %bb.0:
+; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14]
+; AVX512VBMIVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
+; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi)
+; AVX512VBMIVL-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %L
%strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
store <8 x i16> %strided.vec, <8 x i16>* %S
@@ -293,16 +372,57 @@ define void @shuffle_v32i8_to_v8i8(<32 x
; AVX-NEXT: vmovq %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512-LABEL: shuffle_v32i8_to_v8i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512-NEXT: vmovq %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v32i8_to_v8i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v32i8_to_v8i8:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v32i8_to_v8i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
+;
+; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v8i8:
+; AVX512VBMIVL: # %bb.0:
+; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2024390091656922112,2024390091656922112]
+; AVX512VBMIVL-NEXT: vpermi2b 16(%rdi), %xmm0, %xmm1
+; AVX512VBMIVL-NEXT: vmovq %xmm1, (%rsi)
+; AVX512VBMIVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
store <8 x i8> %strided.vec, <8 x i8>* %S
@@ -1038,23 +1158,17 @@ define void @shuffle_v16i16_to_v4i16(<16
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12,4,5,12,13]
+; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
+; AVX512BWVL-NEXT: vmovq %xmm1, (%rsi)
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v4i16:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VBMIVL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX512VBMIVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VBMIVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VBMIVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512VBMIVL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12,4,5,12,13]
+; AVX512VBMIVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
+; AVX512VBMIVL-NEXT: vmovq %xmm1, (%rsi)
; AVX512VBMIVL-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %L
%strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
@@ -1149,16 +1263,57 @@ define void @shuffle_v32i8_to_v4i8(<32 x
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512-LABEL: shuffle_v32i8_to_v4i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-NEXT: vmovd %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v32i8_to_v4i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512F-NEXT: vmovd %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v32i8_to_v4i8:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-NEXT: vmovd %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v32i8_to_v4i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BWVL-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
+;
+; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v4i8:
+; AVX512VBMIVL: # %bb.0:
+; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VBMIVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [403703808,403703808,403703808,403703808]
+; AVX512VBMIVL-NEXT: vpermi2b 16(%rdi), %xmm0, %xmm1
+; AVX512VBMIVL-NEXT: vmovd %xmm1, (%rsi)
+; AVX512VBMIVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
store <4 x i8> %strided.vec, <4 x i8>* %S
Modified: llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-256.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-256.ll?rev=359400&r1=359399&r2=359400&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-256.ll (original)
+++ llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-256.ll Sun Apr 28 07:31:01 2019
@@ -24,16 +24,57 @@ define void @shuffle_v32i8_to_v16i8(<32
; AVX-NEXT: vmovdqa %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512-LABEL: shuffle_v32i8_to_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v32i8_to_v16i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v32i8_to_v16i8:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v32i8_to_v16i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v32i8_to_v16i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BWVL-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
+;
+; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v16i8:
+; AVX512VBMIVL: # %bb.0:
+; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
+; AVX512VBMIVL-NEXT: vpermi2b 16(%rdi), %xmm0, %xmm1
+; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi)
+; AVX512VBMIVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
store <16 x i8> %strided.vec, <16 x i8>* %S
@@ -115,16 +156,54 @@ define void @shuffle_v16i16_to_v8i16(<16
; AVX-NEXT: vmovdqa %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512-LABEL: shuffle_v16i16_to_v8i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v16i16_to_v8i16:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i16_to_v8i16:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v16i16_to_v8i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14]
+; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
+; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi)
+; AVX512BWVL-NEXT: retq
+;
+; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v8i16:
+; AVX512VBMIVL: # %bb.0:
+; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14]
+; AVX512VBMIVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
+; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi)
+; AVX512VBMIVL-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %L
%strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
store <8 x i16> %strided.vec, <8 x i16>* %S
@@ -329,23 +408,17 @@ define void @shuffle_v32i8_to_v8i8(<32 x
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14]
+; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
+; AVX512BWVL-NEXT: vpmovwb %xmm1, (%rsi)
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VBMIVL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512VBMIVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VBMIVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VBMIVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VBMIVL-NEXT: vpmovwb %xmm0, (%rsi)
+; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14]
+; AVX512VBMIVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
+; AVX512VBMIVL-NEXT: vpmovwb %xmm1, (%rsi)
; AVX512VBMIVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
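
For reference, the VPERMV3 family (vpermi2b/vpermi2w/vpermi2d and the vpermt2* twins) reads each mask element as an index into the concatenation of the two table operands: for an N-element permute, indices 0..N-1 select from the first table and N..2N-1 from the second. The shuffle_v32i8_to_v8i8 hunks above also show the combine working at the widened word granularity - the byte-strided extract becomes a v8i16 vpermi2w followed by a vpmovwb truncate. A standalone reproducer, reconstructed from the hunk above (the function wrapper and llc flags are assumed, not part of this commit):

; llc -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl
define void @strided4(<32 x i8>* %L, <8 x i8>* %S) {
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef,
                 <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}
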
Modified: llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll?rev=359400&r1=359399&r2=359400&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll (original)
+++ llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll Sun Apr 28 07:31:01 2019
@@ -855,11 +855,11 @@ define <4 x double> @PR34175(<32 x i16>*
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqu (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqu 32(%rdi), %xmm1
-; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4,2,3]
+; AVX512VL-NEXT: vpermi2d %xmm1, %xmm0, %xmm2
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX512VL-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512VL-NEXT: retq
;
Modified: llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-512.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-512.ll?rev=359400&r1=359399&r2=359400&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-512.ll (original)
+++ llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-512.ll Sun Apr 28 07:31:01 2019
@@ -846,11 +846,11 @@ define <4 x double> @PR34175(<32 x i16>*
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqu (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqu 32(%rdi), %xmm1
-; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4,2,3]
+; AVX512VL-NEXT: vpermi2d %xmm1, %xmm0, %xmm2
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX512VL-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512VL-NEXT: retq
;
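
In both PR34175 hunks the index vector [0,4,2,3] lets a single vpermi2d merge dword 0 of the second input into lane 1 of the first, replacing the old vpbroadcastd + vpblendd pair. Distilled to a minimal sketch (this exact function is not in the tests; under -mattr=+avx512vl the same vpermt2d/vpermi2d lowering is expected):

define <4 x i32> @merge_b0_into_lane1(<4 x i32> %a, <4 x i32> %b) {
  %r = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
  ret <4 x i32> %r
}
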
Modified: llvm/trunk/test/CodeGen/X86/vec_smulo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_smulo.ll?rev=359400&r1=359399&r2=359400&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_smulo.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_smulo.ll Sun Apr 28 07:31:01 2019
@@ -407,15 +407,15 @@ define <3 x i32> @smulo_v3i32(<3 x i32>
;
; AVX512-LABEL: smulo_v3i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX512-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; AVX512-NEXT: vpmuldq %xmm1, %xmm0, %xmm3
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
+; AVX512-NEXT: vpmuldq %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX512-NEXT: vpmuldq %xmm3, %xmm4, %xmm3
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7]
+; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4
; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpsrad $31, %xmm1, %xmm0
-; AVX512-NEXT: vpcmpneqd %xmm0, %xmm2, %k1
+; AVX512-NEXT: vpcmpneqd %xmm0, %xmm4, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: vpextrd $2, %xmm1, 8(%rdi)
@@ -537,15 +537,15 @@ define <4 x i32> @smulo_v4i32(<4 x i32>
;
; AVX512-LABEL: smulo_v4i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX512-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; AVX512-NEXT: vpmuldq %xmm1, %xmm0, %xmm3
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
+; AVX512-NEXT: vpmuldq %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX512-NEXT: vpmuldq %xmm3, %xmm4, %xmm3
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7]
+; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4
; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpsrad $31, %xmm1, %xmm0
-; AVX512-NEXT: vpcmpneqd %xmm0, %xmm2, %k1
+; AVX512-NEXT: vpcmpneqd %xmm0, %xmm4, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
@@ -796,15 +796,15 @@ define <6 x i32> @smulo_v6i32(<6 x i32>
;
; AVX512-LABEL: smulo_v6i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
-; AVX512-NEXT: vpmuldq %ymm2, %ymm3, %ymm2
-; AVX512-NEXT: vpmuldq %ymm1, %ymm0, %ymm3
-; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,3,3,5,5,7,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
+; AVX512-NEXT: vpmuldq %ymm1, %ymm0, %ymm2
+; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7]
+; AVX512-NEXT: vpmuldq %ymm3, %ymm4, %ymm3
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15]
+; AVX512-NEXT: vpermi2d %ymm3, %ymm2, %ymm4
; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm1
; AVX512-NEXT: vpsrad $31, %ymm1, %ymm0
-; AVX512-NEXT: vpcmpneqd %ymm0, %ymm2, %k1
+; AVX512-NEXT: vpcmpneqd %ymm0, %ymm4, %k1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
@@ -995,15 +995,15 @@ define <8 x i32> @smulo_v8i32(<8 x i32>
;
; AVX512-LABEL: smulo_v8i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
-; AVX512-NEXT: vpmuldq %ymm2, %ymm3, %ymm2
-; AVX512-NEXT: vpmuldq %ymm1, %ymm0, %ymm3
-; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,3,3,5,5,7,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
+; AVX512-NEXT: vpmuldq %ymm1, %ymm0, %ymm2
+; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7]
+; AVX512-NEXT: vpmuldq %ymm3, %ymm4, %ymm3
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15]
+; AVX512-NEXT: vpermi2d %ymm3, %ymm2, %ymm4
; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm1
; AVX512-NEXT: vpsrad $31, %ymm1, %ymm0
-; AVX512-NEXT: vpcmpneqd %ymm0, %ymm2, %k1
+; AVX512-NEXT: vpcmpneqd %ymm0, %ymm4, %k1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT: vmovdqa %ymm1, (%rdi)
@@ -2103,19 +2103,19 @@ define <4 x i32> @smulo_v4i24(<4 x i24>
;
; AVX512-LABEL: smulo_v4i24:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpslld $8, %xmm0, %xmm0
-; AVX512-NEXT: vpsrad $8, %xmm0, %xmm0
; AVX512-NEXT: vpslld $8, %xmm1, %xmm1
; AVX512-NEXT: vpsrad $8, %xmm1, %xmm1
-; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX512-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; AVX512-NEXT: vpmuldq %xmm1, %xmm0, %xmm3
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
+; AVX512-NEXT: vpslld $8, %xmm0, %xmm0
+; AVX512-NEXT: vpsrad $8, %xmm0, %xmm0
+; AVX512-NEXT: vpmuldq %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX512-NEXT: vpmuldq %xmm3, %xmm4, %xmm3
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7]
+; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4
; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpsrad $31, %xmm1, %xmm0
-; AVX512-NEXT: vpcmpneqd %xmm0, %xmm2, %k0
+; AVX512-NEXT: vpcmpneqd %xmm0, %xmm4, %k0
; AVX512-NEXT: vpslld $8, %xmm1, %xmm0
; AVX512-NEXT: vpsrad $8, %xmm0, %xmm0
; AVX512-NEXT: vpcmpneqd %xmm1, %xmm0, %k1
Modified: llvm/trunk/test/CodeGen/X86/vec_umulo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_umulo.ll?rev=359400&r1=359399&r2=359400&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_umulo.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_umulo.ll Sun Apr 28 07:31:01 2019
@@ -367,13 +367,13 @@ define <3 x i32> @umulo_v3i32(<3 x i32>
;
; AVX512-LABEL: umulo_v3i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX512-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
-; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm3
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
-; AVX512-NEXT: vptestmd %xmm2, %xmm2, %k1
+; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX512-NEXT: vpmuludq %xmm3, %xmm4, %xmm3
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7]
+; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4
+; AVX512-NEXT: vptestmd %xmm4, %xmm4, %k1
; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
@@ -483,13 +483,13 @@ define <4 x i32> @umulo_v4i32(<4 x i32>
;
; AVX512-LABEL: umulo_v4i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX512-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
-; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm3
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
-; AVX512-NEXT: vptestmd %xmm2, %xmm2, %k1
+; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX512-NEXT: vpmuludq %xmm3, %xmm4, %xmm3
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7]
+; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4
+; AVX512-NEXT: vptestmd %xmm4, %xmm4, %k1
; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
@@ -703,13 +703,13 @@ define <6 x i32> @umulo_v6i32(<6 x i32>
;
; AVX512-LABEL: umulo_v6i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
-; AVX512-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
-; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm3
-; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,3,3,5,5,7,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
-; AVX512-NEXT: vptestmd %ymm2, %ymm2, %k1
+; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
+; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7]
+; AVX512-NEXT: vpmuludq %ymm3, %ymm4, %ymm3
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15]
+; AVX512-NEXT: vpermi2d %ymm3, %ymm2, %ymm4
+; AVX512-NEXT: vptestmd %ymm4, %ymm4, %k1
; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
@@ -873,13 +873,13 @@ define <8 x i32> @umulo_v8i32(<8 x i32>
;
; AVX512-LABEL: umulo_v8i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
-; AVX512-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
-; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm3
-; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,3,3,5,5,7,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
-; AVX512-NEXT: vptestmd %ymm2, %ymm2, %k1
+; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
+; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7]
+; AVX512-NEXT: vpmuludq %ymm3, %ymm4, %ymm3
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15]
+; AVX512-NEXT: vpermi2d %ymm3, %ymm2, %ymm4
+; AVX512-NEXT: vptestmd %ymm4, %ymm4, %k1
; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
@@ -1878,17 +1878,17 @@ define <4 x i32> @umulo_v4i24(<4 x i24>
; AVX512-LABEL: umulo_v4i24:
; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215]
-; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX512-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
-; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm3
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
+; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX512-NEXT: vpmuludq %xmm3, %xmm4, %xmm3
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7]
+; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4
; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpsrld $24, %xmm1, %xmm0
-; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpor %xmm4, %xmm0, %xmm0
; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
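
In the vec_smulo.ll and vec_umulo.ll hunks, vpmuldq/vpmuludq leave the 64-bit products of the even and odd elements in two separate registers, and the overflow test only needs each product's high 32 bits, which sit in dwords 1 and 3 (and 5 and 7 for ymm) of those registers. The masks [1,5,3,7] and [1,9,3,11,5,13,7,15] therefore gather every high half with one vpermi2d, replacing the old vpshufd + vpblendd pair. A minimal sketch of the pattern these tests exercise (function name assumed):

declare { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32>, <4 x i32>)

define <4 x i1> @mulo_flags(<4 x i32> %a, <4 x i32> %b) {
  %t = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> %a, <4 x i32> %b)
  %o = extractvalue { <4 x i32>, <4 x i1> } %t, 1
  ret <4 x i1> %o
}
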
Modified: llvm/trunk/test/CodeGen/X86/vector-fshl-256.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-fshl-256.ll?rev=359400&r1=359399&r2=359400&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-fshl-256.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-fshl-256.ll Sun Apr 28 07:31:01 2019
@@ -1961,9 +1961,10 @@ define <16 x i16> @constant_funnnel_v16i
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm2
-; AVX512VLBW-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512VLBW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
-; AVX512VLBW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLBW-NEXT: vpor %ymm1, %ymm2, %ymm2
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512VLBW-NEXT: vpermi2w %ymm0, %ymm2, %ymm1
+; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: constant_funnnel_v16i16:
Modified: llvm/trunk/test/CodeGen/X86/vector-fshr-256.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-fshr-256.ll?rev=359400&r1=359399&r2=359400&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-fshr-256.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-fshr-256.ll Sun Apr 28 07:31:01 2019
@@ -1971,9 +1971,9 @@ define <16 x i16> @constant_funnnel_v16i
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %ymm1, %ymm2
; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512VLBW-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
-; AVX512VLBW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512VLBW-NEXT: vpor %ymm2, %ymm0, %ymm2
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm0 = [16,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512VLBW-NEXT: vpermi2w %ymm1, %ymm2, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: constant_funnnel_v16i16:
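
In both funnel-shift hunks the element-0 shift amount is zero, so that element must pass through unshifted; mask element 16 addresses element 0 of the second table, letting one vpermi2w reinsert it in place of the old vpblendw + vpblendd pair. A minimal sketch of the shape being tested (the constant amounts are assumed to match the constant_funnnel tests):

declare <16 x i16> @llvm.fshl.v16i16(<16 x i16>, <16 x i16>, <16 x i16>)

define <16 x i16> @cst_fshl(<16 x i16> %a, <16 x i16> %b) {
  %r = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a, <16 x i16> %b,
            <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7,
                        i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>)
  ret <16 x i16> %r
}
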
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll?rev=359400&r1=359399&r2=359400&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll Sun Apr 28 07:31:01 2019
@@ -286,11 +286,24 @@ define <16 x i8> @shuffle_v16i8_16_00_16
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX2-NEXT: retq
+;
+; AVX512VLBW-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [0,16,0,17,0,18,0,19,0,20,0,21,0,22,0,23]
+; AVX512VLVBMI-NEXT: vpermi2b %xmm0, %xmm1, %xmm2
+; AVX512VLVBMI-NEXT: vmovdqa %xmm2, %xmm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 0, i32 16, i32 1, i32 16, i32 2, i32 16, i32 3, i32 16, i32 4, i32 16, i32 5, i32 16, i32 6, i32 16, i32 7>
ret <16 x i8> %shuffle
}
@@ -352,11 +365,23 @@ define <16 x i8> @shuffle_v16i8_03_02_01
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
; SSE41-NEXT: retq
;
-; AVX-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
-; AVX: # %bb.0:
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
-; AVX-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VLBW-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,19,18,17,16,23,22,21,20]
+; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20>
ret <16 x i8> %shuffle
}
@@ -394,12 +419,25 @@ define <16 x i8> @shuffle_v16i8_03_02_01
; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE41-NEXT: retq
;
-; AVX-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u]
+; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u]
+; AVX1OR2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VLBW-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u]
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u]
+; AVX512VLBW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [3,2,1,0,31,30,29,28,11,10,9,8,23,22,21,20]
+; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 31, i32 30, i32 29, i32 28, i32 11, i32 10, i32 9, i32 8, i32 23, i32 22, i32 21, i32 20>
ret <16 x i8> %shuffle
}
@@ -1136,11 +1174,23 @@ define <16 x i8> @shuffle_v16i8_00_16_01
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT: retq
;
-; AVX-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
-; AVX: # %bb.0:
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; AVX-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VLBW-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512VLBW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [0,16,1,17,4,20,5,21,2,18,3,19,6,22,7,23]
+; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <16 x i8> %val1, <16 x i8> %val2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23>
ret <16 x i8> %shuffle
}
@@ -1381,12 +1431,25 @@ define <16 x i8> @shuffe_v16i8_shift_00_
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: shuffe_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1OR2-LABEL: shuffe_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1OR2-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX1OR2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1OR2-NEXT: retq
+;
+; AVX512VLBW-LABEL: shuffe_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffe_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
+; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0
+; AVX512VLVBMI-NEXT: retq
%1 = lshr <8 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
%2 = lshr <8 x i16> %a1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
%3 = bitcast <8 x i16> %1 to <16 x i8>
@@ -1647,13 +1710,27 @@ define <16 x i8> @PR12412(<16 x i8> %inv
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT: retq
;
-; AVX-LABEL: PR12412:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT: retq
+; AVX1OR2-LABEL: PR12412:
+; AVX1OR2: # %bb.0: # %entry
+; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1OR2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1OR2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1OR2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VLBW-LABEL: PR12412:
+; AVX512VLBW: # %bb.0: # %entry
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX512VLBW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: PR12412:
+; AVX512VLVBMI: # %bb.0: # %entry
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
+; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0
+; AVX512VLVBMI-NEXT: retq
entry:
%0 = shufflevector <16 x i8> %inval1, <16 x i8> %inval2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
ret <16 x i8> %0
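
The v16i8 cases above split their check prefixes because byte permutes need VBMI: AVX512VLBW keeps the old vpshufb sequences while AVX512VLVBMI collapses each one to a single instruction. Note the two destination conventions: vpermt2b overwrites its first table operand, so when the low mask indices come from %a the result lands straight in xmm0 with no extra move, whereas a case like shuffle_v16i8_16_00_... (element 0 taken from %b) gets the vpermi2b form plus a trailing vmovdqa. The T-form-friendly shape, restated as a standalone sketch using the mask from the tests above:

; Expected to become a single vpermt2b under -mattr=+avx512vl,+avx512vbmi.
define <16 x i8> @rev_quads(<16 x i8> %a, <16 x i8> %b) {
  %r = shufflevector <16 x i8> %a, <16 x i8> %b,
       <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4,
                   i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20>
  ret <16 x i8> %r
}
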
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll?rev=359400&r1=359399&r2=359400&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll Sun Apr 28 07:31:01 2019
@@ -360,11 +360,17 @@ define <4 x i32> @shuffle_v4i32_0124(<4
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v4i32_0124:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vbroadcastss %xmm1, %xmm1
-; AVX2OR512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v4i32_0124:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v4i32_0124:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,4]
+; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
ret <4 x i32> %shuffle
}
@@ -401,12 +407,18 @@ define <4 x i32> @shuffle_v4i32_0142(<4
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v4i32_0142:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vbroadcastss %xmm1, %xmm1
-; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,2]
-; AVX2OR512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v4i32_0142:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
+; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,2]
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v4i32_0142:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,2]
+; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
ret <4 x i32> %shuffle
}
@@ -446,12 +458,18 @@ define <4 x i32> @shuffle_v4i32_0412(<4
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v4i32_0412:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vbroadcastss %xmm1, %xmm1
-; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2]
-; AVX2OR512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v4i32_0412:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
+; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2]
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v4i32_0412:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4,1,2]
+; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2>
ret <4 x i32> %shuffle
}
@@ -483,11 +501,17 @@ define <4 x i32> @shuffle_v4i32_4012(<4
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; SSE41-NEXT: retq
;
-; AVX-LABEL: shuffle_v4i32_4012:
-; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,2]
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; AVX-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v4i32_4012:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,2]
+; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v4i32_4012:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [4,0,1,2]
+; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
ret <4 x i32> %shuffle
}
@@ -537,12 +561,18 @@ define <4 x i32> @shuffle_v4i32_0451(<4
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v4i32_0451:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; AVX2OR512VL-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; AVX2OR512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v4i32_0451:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v4i32_0451:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4,5,1]
+; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1>
ret <4 x i32> %shuffle
}
@@ -593,12 +623,18 @@ define <4 x i32> @shuffle_v4i32_4015(<4
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v4i32_4015:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
-; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX2OR512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v4i32_4015:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v4i32_4015:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [4,0,1,5]
+; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5>
ret <4 x i32> %shuffle
}
@@ -1841,16 +1877,10 @@ define <4 x float> @shuffle_v4f32_bitcas
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX1OR2-LABEL: shuffle_v4f32_bitcast_4401:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,0],xmm0[0,1]
-; AVX1OR2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v4f32_bitcast_4401:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vbroadcastss %xmm1, %xmm1
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT: retq
+; AVX-LABEL: shuffle_v4f32_bitcast_4401:
+; AVX: # %bb.0:
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,0],xmm0[0,1]
+; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
%2 = bitcast <4 x i32> %1 to <2 x double>
%3 = bitcast <4 x float> %a to <2 x double>
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll?rev=359400&r1=359399&r2=359400&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll Sun Apr 28 07:31:01 2019
@@ -974,11 +974,23 @@ define <8 x i16> @shuffle_v8i16_0c1d2e3f
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: retq
;
-; AVX-LABEL: shuffle_v8i16_0c1d2e3f:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v8i16_0c1d2e3f:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1OR2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i16_0c1d2e3f:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX512VL-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i16_0c1d2e3f:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,12,1,13,2,14,3,15]
+; AVX512VL-FAST-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 12, i32 1, i32 13, i32 2, i32 14, i32 3, i32 15>
ret <8 x i16> %shuffle
}
@@ -1004,11 +1016,23 @@ define <8 x i16> @shuffle_v8i16_48596a7b
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: retq
;
-; AVX-LABEL: shuffle_v8i16_48596a7b:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v8i16_48596a7b:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1OR2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i16_48596a7b:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512VL-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i16_48596a7b:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,8,5,9,6,10,7,11]
+; AVX512VL-FAST-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 8, i32 5, i32 9, i32 6, i32 10, i32 7, i32 11>
ret <8 x i16> %shuffle
}
@@ -1021,12 +1045,18 @@ define <8 x i16> @shuffle_v8i16_08196e7f
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: retq
;
-; AVX-LABEL: shuffle_v8i16_08196e7f:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v8i16_08196e7f:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
+; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; AVX1OR2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v8i16_08196e7f:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,8,1,9,6,14,7,15]
+; AVX512VL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 6, i32 14, i32 7, i32 15>
ret <8 x i16> %shuffle
}
@@ -1039,12 +1069,18 @@ define <8 x i16> @shuffle_v8i16_0c1d6879
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: retq
;
-; AVX-LABEL: shuffle_v8i16_0c1d6879:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,0,2,3]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v8i16_0c1d6879:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,0,2,3]
+; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; AVX1OR2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v8i16_0c1d6879:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,12,1,13,6,8,7,9]
+; AVX512VL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 12, i32 1, i32 13, i32 6, i32 8, i32 7, i32 9>
ret <8 x i16> %shuffle
}
@@ -1077,18 +1113,11 @@ define <8 x i16> @shuffle_v8i16_109832ba
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,6,7,2,3,12,13,8,9,14,15,10,11]
; AVX2-FAST-NEXT: retq
;
-; AVX512VL-SLOW-LABEL: shuffle_v8i16_109832ba:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5]
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v8i16_109832ba:
-; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,6,7,2,3,12,13,8,9,14,15,10,11]
-; AVX512VL-FAST-NEXT: retq
+; AVX512VL-LABEL: shuffle_v8i16_109832ba:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,9,8,3,2,11,10]
+; AVX512VL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 0, i32 9, i32 8, i32 3, i32 2, i32 11, i32 10>
ret <8 x i16> %shuffle
}
@@ -1165,19 +1194,11 @@ define <8 x i16> @shuffle_v8i16_0213cedf
; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FAST-NEXT: retq
;
-; AVX512VL-SLOW-LABEL: shuffle_v8i16_0213cedf:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,7]
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v8i16_0213cedf:
-; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,12,13,10,11,14,15,8,9,12,13,10,11,14,15]
-; AVX512VL-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
-; AVX512VL-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-FAST-NEXT: retq
+; AVX512VL-LABEL: shuffle_v8i16_0213cedf:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2,1,3,12,14,13,15]
+; AVX512VL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 1, i32 3, i32 12, i32 14, i32 13, i32 15>
ret <8 x i16> %shuffle
}
@@ -1227,18 +1248,11 @@ define <8 x i16> @shuffle_v8i16_443aXXXX
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,6,7,4,5,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: retq
;
-; AVX512VL-SLOW-LABEL: shuffle_v8i16_443aXXXX:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v8i16_443aXXXX:
-; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
-; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,6,7,4,5,8,9,10,11,12,13,14,15]
-; AVX512VL-FAST-NEXT: retq
+; AVX512VL-LABEL: shuffle_v8i16_443aXXXX:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [4,4,3,10,4,5,6,7]
+; AVX512VL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 3, i32 10, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
@@ -1272,11 +1286,17 @@ define <8 x i16> @shuffle_v8i16_032dXXXX
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,10,11,0,1,10,11,0,1,2,3]
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v8i16_032dXXXX:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,10,11,0,1,10,11,0,1,2,3]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v8i16_032dXXXX:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,10,11,0,1,10,11,0,1,2,3]
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v8i16_032dXXXX:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,3,2,13,0,13,0,1]
+; AVX512VL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 3, i32 2, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
@@ -1317,11 +1337,23 @@ define <8 x i16> @shuffle_v8i16_012dXXXX
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
-; AVX-LABEL: shuffle_v8i16_012dXXXX:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v8i16_012dXXXX:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i16_012dXXXX:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX512VL-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i16_012dXXXX:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,13,4,5,6,7]
+; AVX512VL-FAST-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
@@ -1356,11 +1388,24 @@ define <8 x i16> @shuffle_v8i16_XXXXcde3
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v8i16_XXXXcde3:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpbroadcastq %xmm0, %xmm0
-; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v8i16_XXXXcde3:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
+; AVX2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i16_XXXXcde3:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX512VL-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i16_XXXXcde3:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,11]
+; AVX512VL-FAST-NEXT: vpermi2w %xmm0, %xmm1, %xmm2
+; AVX512VL-FAST-NEXT: vmovdqa %xmm2, %xmm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 3>
ret <8 x i16> %shuffle
}
@@ -1389,11 +1434,24 @@ define <8 x i16> @shuffle_v8i16_cde3XXXX
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
-; AVX-LABEL: shuffle_v8i16_cde3XXXX:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
-; AVX-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v8i16_cde3XXXX:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i16_cde3XXXX:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX512VL-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i16_cde3XXXX:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,6,11,0,1,2,3]
+; AVX512VL-FAST-NEXT: vpermi2w %xmm0, %xmm1, %xmm2
+; AVX512VL-FAST-NEXT: vmovdqa %xmm2, %xmm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 12, i32 13, i32 14, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
@@ -1430,11 +1488,17 @@ define <8 x i16> @shuffle_v8i16_012dcde3
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7]
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v8i16_012dcde3:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v8i16_012dcde3:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7]
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v8i16_012dcde3:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,13,12,13,14,3]
+; AVX512VL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 13, i32 12, i32 13, i32 14, i32 3>
ret <8 x i16> %shuffle
}
@@ -1520,20 +1584,11 @@ define <8 x i16> @shuffle_v8i16_XXX1X579
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
; AVX2-FAST-NEXT: retq
;
-; AVX512VL-SLOW-LABEL: shuffle_v8i16_XXX1X579:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpbroadcastd %xmm1, %xmm1
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
-; AVX512VL-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v8i16_XXX1X579:
-; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vpbroadcastd %xmm1, %xmm1
-; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,2,3,8,9,10,11,14,15,14,15]
-; AVX512VL-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
-; AVX512VL-FAST-NEXT: retq
+; AVX512VL-LABEL: shuffle_v8i16_XXX1X579:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,1,4,5,7,9]
+; AVX512VL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 5, i32 7, i32 9>
ret <8 x i16> %shuffle
}
@@ -1568,12 +1623,18 @@ define <8 x i16> @shuffle_v8i16_XX4X8acX
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v8i16_XX4X8acX:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5]
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v8i16_XX4X8acX:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v8i16_XX4X8acX:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,4,5,8,10,12,10]
+; AVX512VL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 undef>
ret <8 x i16> %shuffle
}
@@ -2449,12 +2510,19 @@ define <8 x i16> @shuffle_v8i16_fu3ucc5u
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: shuffle_v8i16_fu3ucc5u:
-; AVX: # %bb.0:
-; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,4]
-; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v8i16_fu3ucc5u:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX1OR2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,4]
+; AVX1OR2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v8i16_fu3ucc5u:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [7,5,11,12,4,4,13,14]
+; AVX512VL-NEXT: vpermi2w %xmm0, %xmm1, %xmm2
+; AVX512VL-NEXT: vmovdqa %xmm2, %xmm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 15, i32 undef, i32 3, i32 undef, i32 12, i32 12, i32 5, i32 undef>
ret <8 x i16> %shuffle
}
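
Here the result is built with vpermi2w rather than vpermt2w: the t form overwrites its first data operand, while the i form overwrites the index register, which is why the index constant is materialized in %xmm2 and a trailing vmovdqa copies the result into the return register. Both forms read the same two tables; a scalar reference for the 8-lane, two-source word permute (a sketch; 'first' and 'second' follow the instruction's operand order, so for the vpermi2w above first is %b and second is %a):

  #include <cstdint>

  void permute2w_ref(const uint16_t first[8], const uint16_t second[8],
                     const uint16_t idx[8], uint16_t dst[8]) {
    for (int i = 0; i < 8; ++i) {
      unsigned k = idx[i] & 15;                    // 4 significant index bits
      dst[i] = (k < 8) ? first[k] : second[k - 8];
    }
  }
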
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v16.ll?rev=359400&r1=359399&r2=359400&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v16.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v16.ll Sun Apr 28 07:31:01 2019
@@ -1005,11 +1005,17 @@ define <16 x i16> @shuffle_v16i16_00_01_
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,31]
+; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 31>
ret <16 x i16> %shuffle
}
@@ -1023,11 +1029,17 @@ define <16 x i16> @shuffle_v16i16_16_01_
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [16,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i16> %shuffle
}
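
Inserting a single element of %b into %a previously took a word blend plus a dword blend, since vpblendw applies its immediate per 128-bit lane and so cannot touch just one word of a ymm register. With AVX512BW+VL one cross-lane vpermt2w suffices; a sketch via the matching intrinsic (function name is illustrative):

  #include <immintrin.h>

  __m256i insert_b0_into_lane0(__m256i a, __m256i b) {
    // Indices 0-15 select from a, 16-31 from b -- only lane 0 reads b here.
    const __m256i idx = _mm256_setr_epi16(16, 1, 2, 3, 4, 5, 6, 7,
                                          8, 9, 10, 11, 12, 13, 14, 15);
    return _mm256_permutex2var_epi16(a, idx, b); // one vpermt2w
  }
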
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v32.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v32.ll?rev=359400&r1=359399&r2=359400&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v32.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v32.ll Sun Apr 28 07:31:01 2019
@@ -2978,12 +2978,25 @@ define <32 x i8> @shuffle_v32i8_15_15_15
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,15,15,15,15,14,14,15,15]
-; AVX2OR512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,15,15,15,15,14,14,15,15]
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: retq
+;
+; AVX512VLBW-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,15,15,15,15,14,14,15,15]
+; AVX512VLBW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,16,16,16,16,16,16,16,16]
+; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <32 x i8> %shuffle
}
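
The v32i8 case now splits into AVX512VLBW and AVX512VLVBMI prefixes because byte-granularity two-source permutes (vpermt2b/vpermi2b) only exist with AVX512VBMI; plain AVX512BW stops at word granularity. A hedged sketch of guarding that at compile time (names are illustrative; the fallback mirrors the pshufb/unpack sequence in the non-VBMI checks):

  #include <immintrin.h>

  __m128i splat_a15_then_b0(__m128i a, __m128i b) {
  #if defined(__AVX512VBMI__) && defined(__AVX512VL__)
    // One vpermt2b: lanes 0-7 take byte 15 of a, lanes 8-15 byte 0 of b.
    const __m128i idx = _mm_setr_epi8(15, 15, 15, 15, 15, 15, 15, 15,
                                      16, 16, 16, 16, 16, 16, 16, 16);
    return _mm_permutex2var_epi8(a, idx, b);
  #else
    __m128i a15 = _mm_shuffle_epi8(a, _mm_set1_epi8(15)); // broadcast a[15]
    __m128i b0  = _mm_shuffle_epi8(b, _mm_set1_epi8(0));  // broadcast b[0]
    return _mm_unpacklo_epi64(a15, b0);
  #endif
  }
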
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll?rev=359400&r1=359399&r2=359400&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll Sun Apr 28 07:31:01 2019
@@ -318,22 +318,40 @@ define <4 x double> @shuffle_v4f64_0213(
}
define <4 x double> @shuffle_v4f64_0423(<4 x double> %a, <4 x double> %b) {
-; ALL-LABEL: shuffle_v4f64_0423:
-; ALL: # %bb.0:
-; ALL-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
-; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
-; ALL-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v4f64_0423:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v4f64_0423:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v4f64_0423:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [0,4,2,3]
+; AVX512VL-FAST-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
ret <4 x double> %shuffle
}
define <4 x double> @shuffle_v4f64_0462(<4 x double> %a, <4 x double> %b) {
-; ALL-LABEL: shuffle_v4f64_0462:
-; ALL: # %bb.0:
-; ALL-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; ALL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
-; ALL-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v4f64_0462:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX1OR2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v4f64_0462:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovapd {{.*#+}} ymm2 = [0,4,6,2]
+; AVX512VL-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 6, i32 2>
ret <4 x double> %shuffle
}
@@ -483,11 +501,23 @@ define <4 x double> @shuffle_v4f64_3276(
}
define <4 x double> @shuffle_v4f64_1076(<4 x double> %a, <4 x double> %b) {
-; ALL-LABEL: shuffle_v4f64_1076:
-; ALL: # %bb.0:
-; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
-; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
-; ALL-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v4f64_1076:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX1OR2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v4f64_1076:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX512VL-SLOW-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v4f64_1076:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [1,0,7,6]
+; AVX512VL-FAST-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 7, i32 6>
ret <4 x double> %shuffle
}
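
The -SLOW/-FAST split on these f64 tests reflects the fast-variable-shuffle tuning used by the -FAST RUN lines: the VPERMV3 form needs its index constant loaded from memory, so it is only preferred on targets where variable shuffles are cheap. The -FAST expectation, sketched with the corresponding intrinsic (AVX512F+VL; the function name is illustrative):

  #include <immintrin.h>

  __m256d shuffle_1076(__m256d a, __m256d b) {
    // Indices 0-3 select from a, 4-7 from b: result = <a1, a0, b3, b2>.
    const __m256i idx = _mm256_setr_epi64x(1, 0, 7, 6);
    return _mm256_permutex2var_pd(a, idx, b); // one vpermt2pd
  }
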
@@ -906,12 +936,19 @@ define <4 x i64> @shuffle_v4i64_0142(<4
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v4i64_0142:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,2]
-; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX512VL-NEXT: retq
+; AVX512VL-SLOW-LABEL: shuffle_v4i64_0142:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,2]
+; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v4i64_0142:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,6,2]
+; AVX512VL-FAST-NEXT: vpermt2q %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
ret <4 x i64> %shuffle
}
@@ -1185,11 +1222,17 @@ define <4 x i64> @shuffle_v4i64_1076(<4
; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v4i64_1076:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
-; AVX512VL-NEXT: retq
+; AVX512VL-SLOW-LABEL: shuffle_v4i64_1076:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v4i64_1076:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,7,6]
+; AVX512VL-FAST-NEXT: vpermt2q %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 7, i32 6>
ret <4 x i64> %shuffle
}
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll?rev=359400&r1=359399&r2=359400&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll Sun Apr 28 07:31:01 2019
@@ -283,11 +283,23 @@ define <8 x float> @shuffle_v8f32_080808
}
define <8 x float> @shuffle_v8f32_08084c4c(<8 x float> %a, <8 x float> %b) {
-; ALL-LABEL: shuffle_v8f32_08084c4c:
-; ALL: # %bb.0:
-; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
-; ALL-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v8f32_08084c4c:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
+; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8f32_08084c4c:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
+; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8f32_08084c4c:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,8,0,8,4,12,4,12]
+; AVX512VL-FAST-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12>
ret <8 x float> %shuffle
}
@@ -933,11 +945,23 @@ define <8 x float> @shuffle_v8f32_3210ba
}
define <8 x float> @shuffle_v8f32_3210fedc(<8 x float> %a, <8 x float> %b) {
-; ALL-LABEL: shuffle_v8f32_3210fedc:
-; ALL: # %bb.0:
-; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; ALL-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v8f32_3210fedc:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8f32_3210fedc:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8f32_3210fedc:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [3,2,1,0,15,14,13,12]
+; AVX512VL-FAST-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12>
ret <8 x float> %shuffle
}
@@ -1017,11 +1041,24 @@ define <8 x float> @PR21138(<8 x float>
}
define <8 x float> @shuffle_v8f32_ba987654(<8 x float> %a, <8 x float> %b) {
-; ALL-LABEL: shuffle_v8f32_ba987654:
-; ALL: # %bb.0:
-; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; ALL-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v8f32_ba987654:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8f32_ba987654:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8f32_ba987654:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [3,2,1,0,15,14,13,12]
+; AVX512VL-FAST-NEXT: vpermi2ps %ymm0, %ymm1, %ymm2
+; AVX512VL-FAST-NEXT: vmovaps %ymm2, %ymm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
ret <8 x float> %shuffle
}
@@ -1468,11 +1505,23 @@ define <8 x i32> @shuffle_v8i32_08084c4c
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v8i32_08084c4c:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v8i32_08084c4c:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; AVX2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i32_08084c4c:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i32_08084c4c:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,8,0,8,4,12,4,12]
+; AVX512VL-FAST-NEXT: vpermt2d %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12>
ret <8 x i32> %shuffle
}
@@ -2188,11 +2237,23 @@ define <8 x i32> @shuffle_v8i32_3210ba98
}
define <8 x i32> @shuffle_v8i32_3210fedc(<8 x i32> %a, <8 x i32> %b) {
-; ALL-LABEL: shuffle_v8i32_3210fedc:
-; ALL: # %bb.0:
-; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; ALL-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v8i32_3210fedc:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i32_3210fedc:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i32_3210fedc:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,15,14,13,12]
+; AVX512VL-FAST-NEXT: vpermt2d %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12>
ret <8 x i32> %shuffle
}
@@ -2243,21 +2304,47 @@ define <8 x i32> @shuffle_v8i32_fedc7654
}
define <8 x i32> @shuffle_v8i32_ba987654(<8 x i32> %a, <8 x i32> %b) {
-; ALL-LABEL: shuffle_v8i32_ba987654:
-; ALL: # %bb.0:
-; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; ALL-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v8i32_ba987654:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i32_ba987654:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i32_ba987654:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,15,14,13,12]
+; AVX512VL-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2
+; AVX512VL-FAST-NEXT: vmovdqa %ymm2, %ymm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
ret <8 x i32> %shuffle
}
define <8 x i32> @shuffle_v8i32_ba983210(<8 x i32> %a, <8 x i32> %b) {
-; ALL-LABEL: shuffle_v8i32_ba983210:
-; ALL: # %bb.0:
-; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; ALL-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v8i32_ba983210:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i32_ba983210:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i32_ba983210:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,15,14,13,12]
+; AVX512VL-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2
+; AVX512VL-FAST-NEXT: vmovdqa %ymm2, %ymm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
ret <8 x i32> %shuffle
}
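
In these two tests the backend has commuted the operands (vpermi2d reads %ymm1, i.e. %b, as its first table), so each original index into concat(a, b) simply has its source-select bit flipped to index concat(b, a). The arithmetic, as a sketch:

  #include <cstdio>

  int main() {
    const int orig[8] = {11, 10, 9, 8, 7, 6, 5, 4}; // indices into concat(a, b)
    for (int i = 0; i < 8; ++i)
      printf("%d ", orig[i] ^ 8); // flip bit 3: indices into concat(b, a)
    printf("\n");                 // prints: 3 2 1 0 15 14 13 12
    return 0;
  }

The output matches the vmovdqa constant in the checks above.
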
@@ -2903,14 +2990,32 @@ define <8 x float> @broadcast_concat_cra
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: broadcast_concat_crash:
-; AVX2OR512VL: # %bb.0: # %entry
-; AVX2OR512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,1,2,3]
-; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2OR512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
-; AVX2OR512VL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: broadcast_concat_crash:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,1,2,3]
+; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: broadcast_concat_crash:
+; AVX512VL-SLOW: # %bb.0: # %entry
+; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,1,2,3]
+; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX512VL-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: broadcast_concat_crash:
+; AVX512VL-FAST: # %bb.0: # %entry
+; AVX512VL-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,1,2,3]
+; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} xmm1 = [1,4,3,3]
+; AVX512VL-FAST-NEXT: vpermi2ps %xmm2, %xmm0, %xmm1
+; AVX512VL-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-FAST-NEXT: retq
entry:
%tmp = shufflevector <4 x float> %x, <4 x float> %y, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%bc = bitcast <8 x float> %tmp to <4 x i64>
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v64.ll?rev=359400&r1=359399&r2=359400&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v64.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v64.ll Sun Apr 28 07:31:01 2019
@@ -595,9 +595,8 @@ define <64 x i8> @shuffle_v64i8_shift_00
;
; AVX512VBMI-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_76_78_16_18_20_22_24_26_28_30_80_82_84_86_88_90_92_94_32_34_36_38_40_42_44_46_96_98_100_102_104_106_108_110_48_50_52_54_56_58_60_62_112_114_116_118_120_122_124_126:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vpsrlw $8, %zmm0, %zmm0
-; AVX512VBMI-NEXT: vpsrlw $8, %zmm1, %zmm1
-; AVX512VBMI-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,3,5,7,9,11,13,15,65,67,69,71,73,75,77,79,17,19,21,23,25,27,29,31,81,83,85,87,89,91,93,95,33,35,37,39,41,43,45,47,97,99,101,103,105,107,109,111,49,51,53,55,57,59,61,63,113,115,117,119,121,123,125,127]
+; AVX512VBMI-NEXT: vpermt2b %zmm1, %zmm2, %zmm0
; AVX512VBMI-NEXT: retq
%1 = lshr <32 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
%2 = lshr <32 x i16> %a1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll?rev=359400&r1=359399&r2=359400&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll Sun Apr 28 07:31:01 2019
@@ -84,10 +84,8 @@ define <64 x i8> @combine_vpermi2var_64i
define <16 x i8> @combine_vpermt2var_vpermi2var_16i8_as_vperm2(<16 x i8> %x0, <16 x i8> %x1) {
; CHECK-LABEL: combine_vpermt2var_vpermi2var_16i8_as_vperm2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [0,31,2,29,4,27,6,25,8,23,10,21,12,19,14,17]
-; CHECK-NEXT: vpermi2b %xmm1, %xmm0, %xmm2
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [0,17,2,18,4,19,6,21,8,23,10,25,12,27,14,29]
-; CHECK-NEXT: vpermi2b %xmm2, %xmm2, %xmm0
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19]
+; CHECK-NEXT: vpermt2b %xmm1, %xmm2, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%res0 = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> <i8 0, i8 31, i8 2, i8 29, i8 4, i8 27, i8 6, i8 25, i8 8, i8 23, i8 10, i8 21, i8 12, i8 19, i8 14, i8 17>, <16 x i8> %x1, i16 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> <i8 0, i8 17, i8 2, i8 18, i8 4, i8 19, i8 6, i8 21, i8 8, i8 23, i8 10, i8 25, i8 12, i8 27, i8 14, i8 29>, <16 x i8> %res0, <16 x i8> %res0, i16 -1)
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll?rev=359400&r1=359399&r2=359400&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll Sun Apr 28 07:31:01 2019
@@ -769,9 +769,9 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_
; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k1
; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
-; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [9,1,2,3,4,5,6,7]
+; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
+; AVX512VL-NEXT: vptestmd %ymm2, %ymm2, %k0
; AVX512VL-NEXT: kmovw %k0, %eax
; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
; AVX512VL-NEXT: vzeroupper
@@ -782,10 +782,10 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_
; VL_BW_DQ-NEXT: vpsllw $15, %xmm0, %xmm0
; VL_BW_DQ-NEXT: vpmovw2m %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
-; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; VL_BW_DQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
-; VL_BW_DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
-; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
+; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [9,1,2,3,4,5,6,7]
+; VL_BW_DQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; VL_BW_DQ-NEXT: vpermt2d %ymm0, %ymm1, %ymm2
+; VL_BW_DQ-NEXT: vpmovd2m %ymm2, %k0
; VL_BW_DQ-NEXT: kmovd %k0, %eax
; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax
; VL_BW_DQ-NEXT: vzeroupper
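
For the i1-vector tests the permute happens on the widened mask: vpmovm2d materializes the predicate as dword lanes, vpermt2d mixes them with an all-ones vector, and vpmovd2m narrows back. A rough intrinsic equivalent of the VL_BW_DQ sequence (AVX512DQ+VL; the function name is illustrative):

  #include <immintrin.h>

  __mmask8 lane0_from_bit1_rest_true(__mmask8 m) {
    __m256i v    = _mm256_movm_epi32(m);    // vpmovm2d
    __m256i ones = _mm256_set1_epi32(-1);   // vpcmpeqd
    const __m256i idx = _mm256_setr_epi32(9, 1, 2, 3, 4, 5, 6, 7);
    // Index 9 reads lane 1 of the second table (the mask vector); the
    // remaining indices read the all-ones first table.
    v = _mm256_permutex2var_epi32(ones, idx, v); // vpermt2d
    return _mm256_movepi32_mask(v);              // vpmovd2m
  }
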
Modified: llvm/trunk/test/CodeGen/X86/vector-trunc-widen.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-trunc-widen.ll?rev=359400&r1=359399&r2=359400&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-trunc-widen.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-trunc-widen.ll Sun Apr 28 07:31:01 2019
@@ -1649,13 +1649,35 @@ define <8 x i16> @trunc2x4i32_8i16(<4 x
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
;
-; AVX512-LABEL: trunc2x4i32_8i16:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512-NEXT: retq
+; AVX512F-LABEL: trunc2x4i32_8i16:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc2x4i32_8i16:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc2x4i32_8i16:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc2x4i32_8i16:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14]
+; AVX512BWVL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
+; AVX512BWVL-NEXT: retq
entry:
%0 = trunc <4 x i32> %a to <4 x i16>
%1 = trunc <4 x i32> %b to <4 x i16>
Modified: llvm/trunk/test/CodeGen/X86/vector-trunc.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-trunc.ll?rev=359400&r1=359399&r2=359400&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-trunc.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-trunc.ll Sun Apr 28 07:31:01 2019
@@ -1560,12 +1560,10 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x
;
; AVX512BWVL-LABEL: trunc2x4i64_8i16:
; AVX512BWVL: # %bb.0: # %entry
-; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0
+; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm2
; AVX512BWVL-NEXT: vpmovqd %ymm1, %xmm1
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,2,4,6,8,10,12,14]
+; AVX512BWVL-NEXT: vpermi2w %xmm1, %xmm2, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
entry:
@@ -1657,13 +1655,35 @@ define <8 x i16> @trunc2x4i32_8i16(<4 x
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
;
-; AVX512-LABEL: trunc2x4i32_8i16:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512-NEXT: retq
+; AVX512F-LABEL: trunc2x4i32_8i16:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc2x4i32_8i16:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc2x4i32_8i16:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc2x4i32_8i16:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14]
+; AVX512BWVL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
+; AVX512BWVL-NEXT: retq
entry:
%0 = trunc <4 x i32> %a to <4 x i16>
%1 = trunc <4 x i32> %b to <4 x i16>
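
The AVX512BWVL improvement here treats the double truncation itself as a two-source permute: the low half of each i32 is an even-numbered word of its source register, so indices 0,2,...,14 across concat(a, b) perform both truncations and the concatenation in one vpermt2w. A sketch via the intrinsic (AVX512BW+VL; the function name is illustrative):

  #include <immintrin.h>

  __m128i trunc2x4i32_8i16(__m128i a, __m128i b) {
    // Even word indices pick the low 16 bits of each 32-bit element.
    const __m128i idx = _mm_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14);
    return _mm_permutex2var_epi16(a, idx, b); // one vpermt2w
  }
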