[llvm] 18c1944 - [X86][AVX] combineX86ShuffleChain - combine binary shuffles to X86ISD::VPERM2X128
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 10 03:48:01 PDT 2020
Author: Simon Pilgrim
Date: 2020-03-10T10:44:28Z
New Revision: 18c19441d10512e2b37ce829743443eed009147f
URL: https://github.com/llvm/llvm-project/commit/18c19441d10512e2b37ce829743443eed009147f
DIFF: https://github.com/llvm/llvm-project/commit/18c19441d10512e2b37ce829743443eed009147f.diff
LOG: [X86][AVX] combineX86ShuffleChain - combine binary shuffles to X86ISD::VPERM2X128
For pre-AVX512 targets, combine binary shuffles to X86ISD::VPERM2X128 if possible. This mainly helps optimize the blend(extract_subvector(x,1),y) pattern.
At some point soon we're going to have to make a decision about when to combine AVX512 shuffles more aggressively - currently we bail out if there is any change in element size (to protect predicate mask merging), which means we miss out on a lot of optimizations.
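For readers unfamiliar with the instruction this targets: vperm2f128/vperm2i128 (X86ISD::VPERM2X128) builds each 128-bit lane of the result independently from an 8-bit immediate - bits [1:0] select the source lane for the low half and bits [5:4] for the high half (0/1 = first source lo/hi, 2/3 = second source lo/hi), while bits 3 and 7 zero the corresponding half (the "zeroing feature" referred to in the code comment below). The following standalone C++ model is only an illustrative sketch of that selection logic, not LLVM code; every name in it is made up for the example.

#include <array>
#include <cstdint>

using Lane = std::array<uint8_t, 16>; // one 128-bit lane
using Vec256 = std::array<Lane, 2>;   // [0] = low lane, [1] = high lane

// Sel 0/1 -> Src1 lo/hi, Sel 2/3 -> Src2 lo/hi.
static Lane selectLane(const Vec256 &Src1, const Vec256 &Src2, unsigned Sel) {
  return (Sel & 2) ? Src2[Sel & 1] : Src1[Sel & 1];
}

Vec256 vperm2x128(const Vec256 &Src1, const Vec256 &Src2, uint8_t Imm) {
  Vec256 Res;
  // Bits 3 and 7 zero the low/high result lane; otherwise bits [1:0]
  // and [5:4] pick the source lane, as described above.
  Res[0] = (Imm & 0x08) ? Lane{} : selectLane(Src1, Src2, Imm & 3);
  Res[1] = (Imm & 0x80) ? Lane{} : selectLane(Src1, Src2, (Imm >> 4) & 3);
  return Res;
}

In this model the blend(extract_subvector(x,1),y) pattern appears to correspond to a single vperm2x128 with immediate 0x31, producing { x.hi, y.hi } - the "ymm0[2,3],ymm1[2,3]" form that replaces vextracti128+vpblendd pairs throughout the updated test checks below.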
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/avg.ll
llvm/test/CodeGen/X86/masked_store_trunc.ll
llvm/test/CodeGen/X86/pr34592.ll
llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
llvm/test/CodeGen/X86/vector-trunc.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ab677a097db4..e4533e0d37d0 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -33868,24 +33868,46 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
// Handle 128-bit lane shuffles of 256-bit vectors.
- // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
- // we need to use the zeroing feature.
- // TODO - this should support binary shuffles.
- if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
- !(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) &&
- !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
+ if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
return SDValue(); // Nothing to do!
MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
- unsigned PermMask = 0;
- PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
- PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
-
- Res = DAG.getBitcast(ShuffleVT, V1);
- Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
- DAG.getUNDEF(ShuffleVT),
- DAG.getTargetConstant(PermMask, DL, MVT::i8));
- return DAG.getBitcast(RootVT, Res);
+
+ // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
+ // we need to use the zeroing feature.
+ if (UnaryShuffle &&
+ !(Subtarget.hasAVX2() && isUndefOrInRange(BaseMask, 0, 2)) &&
+ !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
+ unsigned PermMask = 0;
+ PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
+ PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
+
+ Res = DAG.getBitcast(ShuffleVT, V1);
+ Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
+ DAG.getUNDEF(ShuffleVT),
+ DAG.getTargetConstant(PermMask, DL, MVT::i8));
+ return DAG.getBitcast(RootVT, Res);
+ }
+
+ // TODO - handle AVX512VL cases with X86ISD::SHUF128.
+ if (!UnaryShuffle && !IsEVEXShuffle) {
+ assert(llvm::all_of(BaseMask, [](int M) { return 0 <= M && M < 4; }) &&
+ "Unexpected shuffle sentinel value");
+ // Prefer blends to X86ISD::VPERM2X128.
+ if (!((BaseMask[0] == 0 && BaseMask[1] == 3) ||
+ (BaseMask[0] == 2 && BaseMask[1] == 1))) {
+ unsigned PermMask = 0;
+ PermMask |= ((BaseMask[0] & 3) << 0);
+ PermMask |= ((BaseMask[1] & 3) << 4);
+
+ Res = DAG.getNode(
+ X86ISD::VPERM2X128, DL, ShuffleVT,
+ DAG.getBitcast(ShuffleVT, isInRange(BaseMask[0], 0, 2) ? V1 : V2),
+ DAG.getBitcast(ShuffleVT, isInRange(BaseMask[1], 0, 2) ? V1 : V2),
+ DAG.getTargetConstant(PermMask, DL, MVT::i8));
+ return DAG.getBitcast(RootVT, Res);
+ }
+ }
}
// For masks that have been widened to 128-bit elements or more,
diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll
index d2e8f6d743fe..06401418f223 100644
--- a/llvm/test/CodeGen/X86/avg.ll
+++ b/llvm/test/CodeGen/X86/avg.ll
@@ -377,21 +377,21 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
;
; AVX2-LABEL: avg_v48i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq 8(%rdi), %xmm0
+; AVX2-NEXT: vpbroadcastq 24(%rdi), %xmm0
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpbroadcastq 24(%rdi), %xmm2
+; AVX2-NEXT: vpbroadcastq 8(%rdi), %xmm2
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpbroadcastq 40(%rdi), %xmm4
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpbroadcastq 8(%rsi), %xmm6
+; AVX2-NEXT: vpbroadcastq 24(%rsi), %xmm6
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
; AVX2-NEXT: vpaddd %ymm6, %ymm0, %ymm0
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpaddd %ymm6, %ymm1, %ymm1
-; AVX2-NEXT: vpbroadcastq 24(%rsi), %xmm6
+; AVX2-NEXT: vpbroadcastq 8(%rsi), %xmm6
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
@@ -423,10 +423,9 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
; AVX2-NEXT: vpackusdw %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm2
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm3
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-NEXT: vpackuswb %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm2[2,3],ymm0[2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
+; AVX2-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm5[2,3],ymm4[2,3]
; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm3
; AVX2-NEXT: vpackusdw %ymm2, %ymm3, %ymm2
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll
index 5b2bf3b177cf..5c8c40ed4bad 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll
@@ -5146,12 +5146,11 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, <32 x i8>* %p, <32 x i8> %ma
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT: vpackuswb %ymm0, %ymm4, %ymm0
+; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm1
; AVX2-NEXT: vpmovmskb %ymm1, %eax
; AVX2-NEXT: notl %eax
diff --git a/llvm/test/CodeGen/X86/pr34592.ll b/llvm/test/CodeGen/X86/pr34592.ll
index 9249450a3d2a..0f73036a4c6c 100644
--- a/llvm/test/CodeGen/X86/pr34592.ll
+++ b/llvm/test/CodeGen/X86/pr34592.ll
@@ -10,7 +10,7 @@ define <16 x i64> @pluto(<16 x i64> %arg, <16 x i64> %arg1, <16 x i64> %arg2, <1
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: .cfi_def_cfa_register %rbp
; CHECK-NEXT: andq $-32, %rsp
-; CHECK-NEXT: subq $128, %rsp
+; CHECK-NEXT: subq $192, %rsp
; CHECK-NEXT: vmovaps 240(%rbp), %ymm8
; CHECK-NEXT: vmovaps 208(%rbp), %ymm9
; CHECK-NEXT: vmovaps 176(%rbp), %ymm10
@@ -20,36 +20,37 @@ define <16 x i64> @pluto(<16 x i64> %arg, <16 x i64> %arg1, <16 x i64> %arg2, <1
; CHECK-NEXT: vmovaps 48(%rbp), %ymm14
; CHECK-NEXT: vmovaps 16(%rbp), %ymm15
; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7]
-; CHECK-NEXT: vmovaps %xmm9, %xmm6
+; CHECK-NEXT: vmovaps %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: # implicit-def: $ymm0
-; CHECK-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0
-; CHECK-NEXT: vpalignr {{.*#+}} ymm11 = ymm2[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
-; CHECK-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,0]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7]
-; CHECK-NEXT: # kill: def $xmm2 killed $xmm2 killed $ymm2
-; CHECK-NEXT: # implicit-def: $ymm11
-; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm11, %ymm11
-; CHECK-NEXT: vextracti128 $1, %ymm7, %xmm2
-; CHECK-NEXT: vmovq {{.*#+}} xmm2 = xmm2[0],zero
-; CHECK-NEXT: # implicit-def: $ymm6
-; CHECK-NEXT: vmovaps %xmm2, %xmm6
-; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm11[4,5,6,7]
-; CHECK-NEXT: vmovaps %xmm7, %xmm6
-; CHECK-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5,6,7]
-; CHECK-NEXT: # implicit-def: $ymm11
-; CHECK-NEXT: vmovaps %xmm6, %xmm11
-; CHECK-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
+; CHECK-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm0
+; CHECK-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,0]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7]
+; CHECK-NEXT: vmovaps %xmm7, %xmm9
+; CHECK-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,xmm9[0,1,2,3,4,5,6,7]
+; CHECK-NEXT: # implicit-def: $ymm2
+; CHECK-NEXT: vmovaps %xmm9, %xmm2
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; CHECK-NEXT: vpalignr {{.*#+}} ymm9 = ymm11[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
; CHECK-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,3]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7]
-; CHECK-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,1,3]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7]
+; CHECK-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,1,3]
; CHECK-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,1,0,1,4,5,4,5]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
+; CHECK-NEXT: vextracti128 $1, %ymm7, %xmm7
+; CHECK-NEXT: vmovq {{.*#+}} xmm7 = xmm7[0],zero
+; CHECK-NEXT: # implicit-def: $ymm8
+; CHECK-NEXT: vmovaps %xmm7, %xmm8
+; CHECK-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm8[0,1],ymm6[0,1]
; CHECK-NEXT: vmovaps %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: vmovaps %ymm5, %ymm1
+; CHECK-NEXT: vmovaps %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vmovaps %ymm6, %ymm2
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; CHECK-NEXT: vmovaps %ymm3, (%rsp) # 32-byte Spill
-; CHECK-NEXT: vmovaps %ymm9, %ymm3
+; CHECK-NEXT: vmovaps %ymm5, %ymm3
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: .cfi_def_cfa %rsp, 8
diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
index 37bd440e7e7c..a18e47cd9984 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
@@ -745,12 +745,11 @@ define i1 @trunc_v32i16_v32i1(<32 x i16>) {
; AVX2-LABEL: trunc_v32i16_v32i1:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT: vpackuswb %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX2-NEXT: vpmovmskb %ymm0, %eax
; AVX2-NEXT: cmpl $-1, %eax
diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
index 07079ece2fad..a9d4f50502a7 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
@@ -739,12 +739,11 @@ define i1 @trunc_v32i16_v32i1(<32 x i16>) {
; AVX2-LABEL: trunc_v32i16_v32i1:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT: vpackuswb %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX2-NEXT: vpmovmskb %ymm0, %eax
; AVX2-NEXT: testl %eax, %eax
diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
index 1dc839734e0e..fb2b99543e36 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
@@ -831,12 +831,11 @@ define i1 @trunc_v32i16_v32i1(<32 x i16>) {
; AVX2-LABEL: trunc_v32i16_v32i1:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT: vpackuswb %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX2-NEXT: vpmovmskb %ymm0, %eax
; AVX2-NEXT: movl %eax, %ecx
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
index 4cdddb4391e4..c8136f4a18e7 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -439,9 +439,9 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_0
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; AVX2-NEXT: retq
;
@@ -462,9 +462,9 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_0
;
; XOPAVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; XOPAVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1]
; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 9, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 1e1bf04258cf..1d7711cc53ef 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -915,9 +915,9 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
@@ -945,9 +945,9 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_
;
; XOPAVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; XOPAVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1]
; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 18, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -967,9 +967,9 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
@@ -997,9 +997,9 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_
;
; XOPAVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; XOPAVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1]
; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 19, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll
index f3bbc15c3696..9a1199f1ac59 100644
--- a/llvm/test/CodeGen/X86/vector-trunc.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc.ll
@@ -1305,12 +1305,11 @@ define void @trunc32i16_32i8(<32 x i16> %a) {
; AVX2-LABEL: trunc32i16_32i8:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT: vpackuswb %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq