[llvm] 9b645eb - [X86][AVX] Use lowerShuffleWithPERMV in shuffle combining to support non-VLX targets
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 7 05:00:51 PDT 2020
Author: Simon Pilgrim
Date: 2020-09-07T12:50:50+01:00
New Revision: 9b645ebfff168fcf3cf29b21f49762a04d8ceb37
URL: https://github.com/llvm/llvm-project/commit/9b645ebfff168fcf3cf29b21f49762a04d8ceb37
DIFF: https://github.com/llvm/llvm-project/commit/9b645ebfff168fcf3cf29b21f49762a04d8ceb37.diff
LOG: [X86][AVX] Use lowerShuffleWithPERMV in shuffle combining to support non-VLX targets
lowerShuffleWithPERMV allows us to use the 512-bit (ZMM) VPERMV/VPERMV3 variants for 128/256-bit variable shuffles on non-VLX AVX512 targets.
This is another step towards shuffle combining across different vector widths - we still end up with an annoying regression (combine_vpermilvar_vperm2f128_zero_8f32) but we're going in the right direction.
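For illustration only (not part of the original commit message), here is a minimal IR sketch of the kind of pattern this affects; it paraphrases shuffle_v16i16_to_v8i16_1 from the shuffle-strided-with-offset-256.ll test updated below. The function name is made up, and the exact instruction choice depends on the rest of the DAG, but per the test diff an AVX512BW target without VLX can now combine this to a single vpermt2w using zmm registers instead of two vpshufb plus a vpunpcklqdq:

; Sketch, assuming an AVX512BW (no VLX) target,
; e.g. llc -mtriple=x86_64-- -mattr=+avx512bw
define void @strided_odd_elts_v16i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
  ; Load 16 x i16, keep the odd-indexed elements (stride 2, offset 1).
  %v = load <16 x i16>, <16 x i16>* %L
  %s = shufflevector <16 x i16> %v, <16 x i16> undef,
       <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  store <8 x i16> %s, <8 x i16>* %S
  ret void
}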
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/insertelement-ones.ll
llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll
llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll
llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
llvm/test/CodeGen/X86/vector-zext.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 09855fd0eb92..a8a41d9a1bb7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -35240,44 +35240,48 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
// If we have a single input lane-crossing shuffle then lower to VPERMV.
- if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
- ((Subtarget.hasAVX2() &&
- (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
- (Subtarget.hasAVX512() &&
- (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
- MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
- (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
- (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
- (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
- (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
- SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
- Res = DAG.getBitcast(MaskVT, V1);
- Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
- return DAG.getBitcast(RootVT, Res);
+ if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros) {
+ if (Subtarget.hasAVX2() &&
+ (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
+ SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
+ Res = DAG.getBitcast(MaskVT, V1);
+ Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
+ return DAG.getBitcast(RootVT, Res);
+ }
+ // AVX512 variants (non-VLX will pad to 512-bit shuffles).
+ if ((Subtarget.hasAVX512() &&
+ (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
+ MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
+ (Subtarget.hasBWI() &&
+ (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
+ (Subtarget.hasVBMI() &&
+ (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
+ V1 = DAG.getBitcast(MaskVT, V1);
+ V2 = DAG.getUNDEF(MaskVT);
+ Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
+ return DAG.getBitcast(RootVT, Res);
+ }
}
// Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
- // vector as the second source.
+ // vector as the second source (non-VLX will pad to 512-bit shuffles).
if (UnaryShuffle && AllowVariableMask &&
((Subtarget.hasAVX512() &&
(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
+ MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
+ MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
- (Subtarget.hasVLX() &&
- (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
- MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
- (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
- (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
- (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
- (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
+ (Subtarget.hasBWI() &&
+ (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
+ (Subtarget.hasVBMI() &&
+ (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
// Adjust shuffle mask - replace SM_SentinelZero with second source index.
for (unsigned i = 0; i != NumMaskElts; ++i)
if (Mask[i] == SM_SentinelZero)
Mask[i] = NumMaskElts + i;
-
- SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
- Res = DAG.getBitcast(MaskVT, V1);
- SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
- Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
+ V1 = DAG.getBitcast(MaskVT, V1);
+ V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
+ Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
return DAG.getBitcast(RootVT, Res);
}
@@ -35288,22 +35292,21 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
DAG, Subtarget))
return WideShuffle;
- // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
+ // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
+ // (non-VLX will pad to 512-bit shuffles).
if (AllowVariableMask && !MaskContainsZeros &&
((Subtarget.hasAVX512() &&
(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
- MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
- (Subtarget.hasVLX() &&
- (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
+ MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
+ MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
- (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
- (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
- (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
- (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
- SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
+ (Subtarget.hasBWI() &&
+ (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
+ (Subtarget.hasVBMI() &&
+ (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
V1 = DAG.getBitcast(MaskVT, V1);
V2 = DAG.getBitcast(MaskVT, V2);
- Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
+ Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
return DAG.getBitcast(RootVT, Res);
}
return SDValue();
@@ -35460,25 +35463,22 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
DAG, Subtarget))
return WideShuffle;
- // If we have a dual input shuffle then lower to VPERMV3.
+ // If we have a dual input shuffle then lower to VPERMV3,
+ // (non-VLX will pad to 512-bit shuffles)
if (!UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
((Subtarget.hasAVX512() &&
- (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
- MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
- (Subtarget.hasVLX() &&
- (MaskVT == MVT::v2f64 || MaskVT == MVT::v2i64 || MaskVT == MVT::v4f64 ||
- MaskVT == MVT::v4i64 || MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 ||
- MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
- (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
- (Subtarget.hasBWI() && Subtarget.hasVLX() &&
- (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16)) ||
- (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
- (Subtarget.hasVBMI() && Subtarget.hasVLX() &&
- (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8)))) {
- SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
+ (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
+ MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
+ MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
+ MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
+ MaskVT == MVT::v16i32)) ||
+ (Subtarget.hasBWI() && (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
+ MaskVT == MVT::v32i16)) ||
+ (Subtarget.hasVBMI() && (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
+ MaskVT == MVT::v64i8)))) {
V1 = DAG.getBitcast(MaskVT, V1);
V2 = DAG.getBitcast(MaskVT, V2);
- Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
+ Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
return DAG.getBitcast(RootVT, Res);
}
diff --git a/llvm/test/CodeGen/X86/insertelement-ones.ll b/llvm/test/CodeGen/X86/insertelement-ones.ll
index 6a9a401264c5..67d8479cf736 100644
--- a/llvm/test/CodeGen/X86/insertelement-ones.ll
+++ b/llvm/test/CodeGen/X86/insertelement-ones.ll
@@ -299,11 +299,11 @@ define <16 x i16> @insert_v16i16_x12345x789ABCDEx(<16 x i16> %a) {
;
; AVX512F-LABEL: insert_v16i16_x12345x789ABCDEx:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5],xmm1[6],xmm0[7]
-; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7],ymm0[8,9,10,11,12,13,14],ymm2[15]
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [32,1,2,3,4,5,38,7,8,9,10,11,12,13,14,47]
+; AVX512F-NEXT: vpermt2w %zmm1, %zmm2, %zmm0
+; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: insert_v16i16_x12345x789ABCDEx:
diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll
index 51df4c0505b5..3f6b85c97c40 100644
--- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll
+++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll
@@ -71,13 +71,12 @@ define void @shuffle_v16i16_to_v8i16_1(<16 x i16>* %L, <8 x i16>* %S) nounwind {
;
; AVX512BW-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [1,3,5,7,33,35,37,39]
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm2
+; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
+; AVX512BW-NEXT: vmovdqa %xmm1, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16_1:
@@ -252,13 +251,12 @@ define void @shuffle_v16i16_to_v4i16_1(<16 x i16>* %L, <4 x i16>* %S) nounwind {
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [1,5,33,37,4,5,36,37]
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm2
+; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
+; AVX512BW-NEXT: vmovq %xmm1, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_1:
@@ -329,13 +327,12 @@ define void @shuffle_v16i16_to_v4i16_2(<16 x i16>* %L, <4 x i16>* %S) nounwind {
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [2,6,34,38,2,3,34,35]
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm2
+; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
+; AVX512BW-NEXT: vmovq %xmm1, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_2:
@@ -406,13 +403,12 @@ define void @shuffle_v16i16_to_v4i16_3(<16 x i16>* %L, <4 x i16>* %S) nounwind {
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [3,7,35,39,2,3,34,35]
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm2
+; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
+; AVX512BW-NEXT: vmovq %xmm1, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_3:
diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll
index e0994e5b58fa..d789f0e1d39f 100644
--- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll
+++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll
@@ -31,13 +31,13 @@ define void @shuffle_v64i8_to_v32i8_1(<64 x i8>* %L, <32 x i8>* %S) nounwind {
;
; AVX512BW-LABEL: shuffle_v64i8_to_v32i8_1:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,9,11]
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
+; AVX512BW-NEXT: vmovdqa %ymm1, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -129,10 +129,11 @@ define void @shuffle_v16i32_to_v8i32_1(<16 x i32>* %L, <8 x i32>* %S) nounwind {
;
; AVX512BW-LABEL: shuffle_v16i32_to_v8i32_1:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovaps (%rdi), %ymm0
-; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],mem[1,3],ymm0[5,7],mem[5,7]
-; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512BW-NEXT: vmovaps %ymm0, (%rsi)
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,17,19,21,23]
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2
+; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1
+; AVX512BW-NEXT: vmovdqa %ymm1, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
index 4fce1a38a754..7d3e8f66ed39 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -378,11 +378,10 @@ define <8 x float> @shuffle_v16f32_extract_256(float* %RET, float* %a) {
define <8 x float> @test_v16f32_0_1_2_3_4_6_7_10 (<16 x float> %v) {
; ALL-LABEL: test_v16f32_0_1_2_3_4_6_7_10:
; ALL: # %bb.0:
-; ALL-NEXT: vextractf32x4 $2, %zmm0, %xmm1
-; ALL-NEXT: vmovsldup {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,7,u]
-; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
+; ALL-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,4,6,7,10,0,1,2,3,4,6,7,10]
+; ALL-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
+; ALL-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; ALL-NEXT: retq
%res = shufflevector <16 x float> %v, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 7, i32 10>
ret <8 x float> %res
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
index cb2dd3ef7e86..ccf1476e6a65 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
@@ -35,11 +35,11 @@ define <8 x float> @expand1(<4 x float> %a ) {
;
; KNL-LABEL: expand1:
; KNL: # %bb.0:
-; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; KNL-NEXT: vmovaps {{.*#+}} ymm1 = <u,0,u,1,u,2,u,3>
-; KNL-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; KNL-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; KNL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; KNL-NEXT: vmovaps {{.*#+}} ymm1 = [16,0,18,1,20,2,22,3]
+; KNL-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; KNL-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0
+; KNL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; KNL-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
ret <8 x float> %res
@@ -268,10 +268,11 @@ define <8 x float> @expand14(<4 x float> %a) {
;
; KNL-LABEL: expand14:
; KNL: # %bb.0:
-; KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; KNL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
-; KNL-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; KNL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7]
+; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; KNL-NEXT: vmovaps {{.*#+}} ymm1 = [16,17,0,19,1,21,22,23]
+; KNL-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; KNL-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0
+; KNL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; KNL-NEXT: ret{{[l|q]}}
%addV = fadd <4 x float> <float 0.0,float 1.0,float 2.0,float 0.0> , <float 0.0,float 1.0,float 2.0,float 0.0>
%res = shufflevector <4 x float> %addV, <4 x float> %a, <8 x i32> <i32 3, i32 3, i32 4, i32 0, i32 5, i32 0, i32 0, i32 0>
@@ -476,9 +477,11 @@ define <8 x float> @test_masked_permps_v8f32(<8 x float>* %vp, <8 x float> %vec2
;
; KNL64-LABEL: test_masked_permps_v8f32:
; KNL64: # %bb.0:
-; KNL64-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,2,3,7,6,6,7]
-; KNL64-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,0,2,3]
-; KNL64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6,7]
+; KNL64-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL64-NEXT: vmovaps (%rdi), %ymm1
+; KNL64-NEXT: vmovaps {{.*#+}} ymm2 = [7,6,3,19,7,6,22,23]
+; KNL64-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
+; KNL64-NEXT: vmovaps %ymm1, %ymm0
; KNL64-NEXT: retq
;
; SKX32-LABEL: test_masked_permps_v8f32:
@@ -492,10 +495,12 @@ define <8 x float> @test_masked_permps_v8f32(<8 x float>* %vp, <8 x float> %vec2
;
; KNL32-LABEL: test_masked_permps_v8f32:
; KNL32: # %bb.0:
+; KNL32-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; KNL32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL32-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,2,3,7,6,6,7]
-; KNL32-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,0,2,3]
-; KNL32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6,7]
+; KNL32-NEXT: vmovaps (%eax), %ymm1
+; KNL32-NEXT: vmovaps {{.*#+}} ymm2 = [7,6,3,19,7,6,22,23]
+; KNL32-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
+; KNL32-NEXT: vmovaps %ymm1, %ymm0
; KNL32-NEXT: retl
%vec = load <8 x float>, <8 x float>* %vp
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 3, i32 0, i32 7, i32 6, i32 3, i32 0>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
index e744dbd10336..47c1e67e096a 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -132,10 +132,19 @@ define <8 x float> @combine_vpermilvar_vperm2f128_8f32(<8 x float> %a0) {
}
define <8 x float> @combine_vpermilvar_vperm2f128_zero_8f32(<8 x float> %a0) {
-; CHECK-LABEL: combine_vpermilvar_vperm2f128_zero_8f32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
-; CHECK-NEXT: ret{{[l|q]}}
+; AVX-LABEL: combine_vpermilvar_vperm2f128_zero_8f32:
+; AVX: # %bb.0:
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
+; AVX-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: combine_vpermilvar_vperm2f128_zero_8f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vmovaps {{.*#+}} ymm1 = [16,17,18,19,3,2,1,0]
+; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0
+; AVX512-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512-NEXT: ret{{[l|q]}}
%1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
%2 = shufflevector <8 x float> %1, <8 x float> zeroinitializer, <8 x i32> <i32 8, i32 8, i32 8, i32 8, i32 0, i32 1, i32 2, i32 3>
%3 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %2, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
index a4d7b26ef869..e7287162dfcb 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -817,15 +817,26 @@ define <32 x i8> @PR27320(<8 x i32> %a0) {
}
define internal fastcc <8 x float> @PR34577(<8 x float> %inp0, <8 x float> %inp1, <8 x float> %inp2) {
-; CHECK-LABEL: PR34577:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1]
-; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = <u,u,7,2,u,u,3,2>
-; CHECK-NEXT: vpermps %ymm1, %ymm2, %ymm1
-; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; CHECK-NEXT: ret{{[l|q]}}
+; AVX2-LABEL: PR34577:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1]
+; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <u,u,7,2,u,u,3,2>
+; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: PR34577:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1]
+; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
+; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = <23,18,7,2,20,u,3,2>
+; AVX512-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512-NEXT: ret{{[l|q]}}
entry:
%shuf0 = shufflevector <8 x float> %inp0, <8 x float> %inp2, <8 x i32> <i32 1, i32 10, i32 11, i32 13, i32 2, i32 13, i32 5, i32 0>
%sel = select <8 x i1> <i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x float> %shuf0, <8 x float> zeroinitializer
diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll
index d97b9a359b1a..b43510f7fd19 100644
--- a/llvm/test/CodeGen/X86/vector-zext.ll
+++ b/llvm/test/CodeGen/X86/vector-zext.ll
@@ -1905,11 +1905,20 @@ define <4 x i64> @shuf_zext_8i16_to_4i64_offset2(<8 x i16> %A) nounwind uwtable
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT: retq
;
-; AVX512-LABEL: shuf_zext_8i16_to_4i64_offset2:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3]
-; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuf_zext_8i16_to_4i64_offset2:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3]
+; AVX512F-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: shuf_zext_8i16_to_4i64_offset2:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,33,34,35,3,37,38,39,4,41,42,43,5,45,46,47]
+; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT: vpermt2w %zmm2, %zmm1, %zmm0
+; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512BW-NEXT: retq
entry:
%B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <16 x i32> <i32 2, i32 8, i32 8, i32 8, i32 3, i32 8, i32 8, i32 8, i32 4, i32 8, i32 8, i32 8, i32 5, i32 8, i32 8, i32 8>
%Z = bitcast <16 x i16> %B to <4 x i64>