[llvm] 618a890 - [X86] Increase the depth threshold required to form VPERMI2W/VPERMI2B in shuffle combining
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 29 18:38:23 PDT 2020
Author: Craig Topper
Date: 2020-09-29T18:37:23-07:00
New Revision: 618a890b72f874cbc41168737d03f724f58805fc
URL: https://github.com/llvm/llvm-project/commit/618a890b72f874cbc41168737d03f724f58805fc
DIFF: https://github.com/llvm/llvm-project/commit/618a890b72f874cbc41168737d03f724f58805fc.diff
LOG: [X86] Increase the depth threshold required to form VPERMI2W/VPERMI2B in shuffle combining
These instructions are implemented with two port 5 uops and one port 015 uop so they are more complicated than most shuffles.
This patch increases the depth threshold for when we form them during shuffle combining to try to limit increasing the number of uops especially on port 5.
Differential Revision: https://reviews.llvm.org/D88503
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/min-legal-vector-width.ll
llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
llvm/test/CodeGen/X86/vector-zext.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2a7f028d3789..4b3adc7dcfbc 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -35351,6 +35351,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// Depth threshold above which we can efficiently use variable mask shuffles.
int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 1 : 2;
AllowVariableMask &= (Depth >= VariableShuffleDepth) || HasVariableMask;
+ // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
+ // higher depth before combining them.
+ bool AllowBWIVPERMV3 = (Depth >= 2 || HasVariableMask);
bool MaskContainsZeros = isAnyZero(Mask);
@@ -35387,9 +35390,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
- (Subtarget.hasBWI() &&
+ (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
(MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
- (Subtarget.hasVBMI() &&
+ (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
(MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
// Adjust shuffle mask - replace SM_SentinelZero with second source index.
for (unsigned i = 0; i != NumMaskElts; ++i)
@@ -35416,9 +35419,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
- (Subtarget.hasBWI() &&
+ (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
(MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
- (Subtarget.hasVBMI() &&
+ (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
(MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
V1 = DAG.getBitcast(MaskVT, V1);
V2 = DAG.getBitcast(MaskVT, V2);
@@ -35588,10 +35591,10 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
MaskVT == MVT::v16i32)) ||
- (Subtarget.hasBWI() && (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
- MaskVT == MVT::v32i16)) ||
- (Subtarget.hasVBMI() && (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
- MaskVT == MVT::v64i8)))) {
+ (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
+ (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
+ (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
+ (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
V1 = DAG.getBitcast(MaskVT, V1);
V2 = DAG.getBitcast(MaskVT, V2);
Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
index e5240d5e246a..a39fbf878fd9 100644
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -857,10 +857,10 @@ define <8 x i16> @trunc_v8i64_v8i16(<8 x i64>* %x) nounwind "min-legal-vector-wi
define <8 x i32> @trunc_v8i64_v8i32_zeroes(<8 x i64>* %x) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: trunc_v8i64_v8i32_zeroes:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpsrlq $48, 32(%rdi), %ymm1
-; CHECK-NEXT: vpsrlq $48, (%rdi), %ymm2
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
-; CHECK-NEXT: vpermi2w %ymm1, %ymm2, %ymm0
+; CHECK-NEXT: vpsrlq $48, 32(%rdi), %ymm0
+; CHECK-NEXT: vpsrlq $48, (%rdi), %ymm1
+; CHECK-NEXT: vpackusdw %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; CHECK-NEXT: retq
%a = load <8 x i64>, <8 x i64>* %x
%b = lshr <8 x i64> %a, <i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48>
@@ -920,9 +920,10 @@ define <8 x i32> @trunc_v8i64_v8i32_sign(<8 x i64>* %x) nounwind "min-legal-vect
define <16 x i16> @trunc_v16i32_v16i16_sign(<16 x i32>* %x) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: trunc_v16i32_v16i16_sign:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm1
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
-; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
+; CHECK-NEXT: vpsrad $16, 32(%rdi), %ymm0
+; CHECK-NEXT: vpsrad $16, (%rdi), %ymm1
+; CHECK-NEXT: vpackssdw %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; CHECK-NEXT: retq
%a = load <16 x i32>, <16 x i32>* %x
%b = ashr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
@@ -931,20 +932,13 @@ define <16 x i16> @trunc_v16i32_v16i16_sign(<16 x i32>* %x) nounwind "min-legal-
}
define <32 x i8> @trunc_v32i16_v32i8_sign(<32 x i16>* %x) nounwind "min-legal-vector-width"="256" {
-; CHECK-AVX512-LABEL: trunc_v32i16_v32i8_sign:
-; CHECK-AVX512: # %bb.0:
-; CHECK-AVX512-NEXT: vpsraw $8, 32(%rdi), %ymm0
-; CHECK-AVX512-NEXT: vpsraw $8, (%rdi), %ymm1
-; CHECK-AVX512-NEXT: vpacksswb %ymm0, %ymm1, %ymm0
-; CHECK-AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; CHECK-AVX512-NEXT: retq
-;
-; CHECK-VBMI-LABEL: trunc_v32i16_v32i8_sign:
-; CHECK-VBMI: # %bb.0:
-; CHECK-VBMI-NEXT: vmovdqa (%rdi), %ymm1
-; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63]
-; CHECK-VBMI-NEXT: vpermi2b 32(%rdi), %ymm1, %ymm0
-; CHECK-VBMI-NEXT: retq
+; CHECK-LABEL: trunc_v32i16_v32i8_sign:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsraw $8, 32(%rdi), %ymm0
+; CHECK-NEXT: vpsraw $8, (%rdi), %ymm1
+; CHECK-NEXT: vpacksswb %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; CHECK-NEXT: retq
%a = load <32 x i16>, <32 x i16>* %x
%b = ashr <32 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
%c = trunc <32 x i16> %b to <32 x i8>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
index fb300a88b412..ee3cf43e8f2f 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -304,24 +304,11 @@ define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX2-NEXT: retq
-;
-; AVX512VLBW-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
-; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512VLBW-NEXT: retq
-;
-; AVX512VLVBMI-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
-; AVX512VLVBMI: # %bb.0:
-; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [0,16,0,17,0,18,0,19,0,20,0,21,0,22,0,23]
-; AVX512VLVBMI-NEXT: vpermi2b %xmm0, %xmm1, %xmm2
-; AVX512VLVBMI-NEXT: vmovdqa %xmm2, %xmm0
-; AVX512VLVBMI-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX2OR512VL-NEXT: retq
;
; XOPAVX1-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
; XOPAVX1: # %bb.0:
@@ -1335,23 +1322,11 @@ define <16 x i8> @shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23(
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT: retq
;
-; AVX1OR2-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; AVX1OR2-NEXT: retq
-;
-; AVX512VLBW-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
-; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512VLBW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; AVX512VLBW-NEXT: retq
-;
-; AVX512VLVBMI-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
-; AVX512VLVBMI: # %bb.0:
-; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [0,16,1,17,4,20,5,21,2,18,3,19,6,22,7,23]
-; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0
-; AVX512VLVBMI-NEXT: retq
+; AVX-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
+; AVX: # %bb.0:
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %val1, <16 x i8> %val2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23>
ret <16 x i8> %shuffle
}
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
index c72d736960f9..f7baebf7c4e4 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -1017,23 +1017,11 @@ define <8 x i16> @shuffle_v8i16_0c1d2e3f(<8 x i16> %a, <8 x i16> %b) {
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: retq
;
-; AVX1OR2-LABEL: shuffle_v8i16_0c1d2e3f:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX1OR2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1OR2-NEXT: retq
-;
-; AVX512VL-SLOW-LABEL: shuffle_v8i16_0c1d2e3f:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX512VL-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v8i16_0c1d2e3f:
-; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,12,1,13,2,14,3,15]
-; AVX512VL-FAST-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
-; AVX512VL-FAST-NEXT: retq
+; AVX-LABEL: shuffle_v8i16_0c1d2e3f:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 12, i32 1, i32 13, i32 2, i32 14, i32 3, i32 15>
ret <8 x i16> %shuffle
}
@@ -1059,23 +1047,11 @@ define <8 x i16> @shuffle_v8i16_48596a7b(<8 x i16> %a, <8 x i16> %b) {
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: retq
;
-; AVX1OR2-LABEL: shuffle_v8i16_48596a7b:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX1OR2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1OR2-NEXT: retq
-;
-; AVX512VL-SLOW-LABEL: shuffle_v8i16_48596a7b:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX512VL-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v8i16_48596a7b:
-; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,8,5,9,6,10,7,11]
-; AVX512VL-FAST-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
-; AVX512VL-FAST-NEXT: retq
+; AVX-LABEL: shuffle_v8i16_48596a7b:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 8, i32 5, i32 9, i32 6, i32 10, i32 7, i32 11>
ret <8 x i16> %shuffle
}
@@ -1424,23 +1400,11 @@ define <8 x i16> @shuffle_v8i16_012dXXXX(<8 x i16> %a, <8 x i16> %b) {
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
-; AVX1OR2-LABEL: shuffle_v8i16_012dXXXX:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX1OR2-NEXT: retq
-;
-; AVX512VL-SLOW-LABEL: shuffle_v8i16_012dXXXX:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; AVX512VL-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v8i16_012dXXXX:
-; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,13,4,5,6,7]
-; AVX512VL-FAST-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
-; AVX512VL-FAST-NEXT: retq
+; AVX-LABEL: shuffle_v8i16_012dXXXX:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
@@ -1475,24 +1439,11 @@ define <8 x i16> @shuffle_v8i16_XXXXcde3(<8 x i16> %a, <8 x i16> %b) {
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v8i16_XXXXcde3:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
-; AVX2-NEXT: retq
-;
-; AVX512VL-SLOW-LABEL: shuffle_v8i16_XXXXcde3:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpbroadcastq %xmm0, %xmm0
-; AVX512VL-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v8i16_XXXXcde3:
-; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,11]
-; AVX512VL-FAST-NEXT: vpermi2w %xmm0, %xmm1, %xmm2
-; AVX512VL-FAST-NEXT: vmovdqa %xmm2, %xmm0
-; AVX512VL-FAST-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v8i16_XXXXcde3:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
+; AVX2OR512VL-NEXT: retq
;
; XOPAVX1-LABEL: shuffle_v8i16_XXXXcde3:
; XOPAVX1: # %bb.0:
@@ -1533,24 +1484,11 @@ define <8 x i16> @shuffle_v8i16_cde3XXXX(<8 x i16> %a, <8 x i16> %b) {
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
-; AVX1OR2-LABEL: shuffle_v8i16_cde3XXXX:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
-; AVX1OR2-NEXT: retq
-;
-; AVX512VL-SLOW-LABEL: shuffle_v8i16_cde3XXXX:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX512VL-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v8i16_cde3XXXX:
-; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,6,11,4,5,6,7]
-; AVX512VL-FAST-NEXT: vpermi2w %xmm0, %xmm1, %xmm2
-; AVX512VL-FAST-NEXT: vmovdqa %xmm2, %xmm0
-; AVX512VL-FAST-NEXT: retq
+; AVX-LABEL: shuffle_v8i16_cde3XXXX:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 12, i32 13, i32 14, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 23bf91de6e7e..e3eed625dab3 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -4804,29 +4804,11 @@ define <4 x i64> @PR28136(<32 x i8> %a0, <32 x i8> %a1) {
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: PR28136:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: retq
-;
-; AVX512VLBW-LABEL: PR28136:
-; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512VLBW-NEXT: retq
-;
-; AVX512VLVBMI-SLOW-LABEL: PR28136:
-; AVX512VLVBMI-SLOW: # %bb.0:
-; AVX512VLVBMI-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512VLVBMI-SLOW-NEXT: retq
-;
-; AVX512VLVBMI-FAST-LABEL: PR28136:
-; AVX512VLVBMI-FAST: # %bb.0:
-; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,32,1,33,2,34,3,35,16,48,17,49,18,50,19,51,4,36,5,37,6,38,7,39,20,52,21,53,22,54,23,55]
-; AVX512VLVBMI-FAST-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
-; AVX512VLVBMI-FAST-NEXT: retq
+; AVX2OR512VL-LABEL: PR28136:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2OR512VL-NEXT: retq
;
; XOPAVX1-LABEL: PR28136:
; XOPAVX1: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll
index 2ad16f2e04c5..0132e901e6b3 100644
--- a/llvm/test/CodeGen/X86/vector-zext.ll
+++ b/llvm/test/CodeGen/X86/vector-zext.ll
@@ -1902,20 +1902,11 @@ define <4 x i64> @shuf_zext_8i16_to_4i64_offset2(<8 x i16> %A) nounwind uwtable
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: shuf_zext_8i16_to_4i64_offset2:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3]
-; AVX512F-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: shuf_zext_8i16_to_4i64_offset2:
-; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,33,34,35,3,37,38,39,4,41,42,43,5,45,46,47]
-; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT: vpermt2w %zmm2, %zmm1, %zmm0
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512BW-NEXT: retq
+; AVX512-LABEL: shuf_zext_8i16_to_4i64_offset2:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3]
+; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX512-NEXT: retq
entry:
%B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <16 x i32> <i32 2, i32 8, i32 8, i32 8, i32 3, i32 8, i32 8, i32 8, i32 4, i32 8, i32 8, i32 8, i32 5, i32 8, i32 8, i32 8>
%Z = bitcast <16 x i16> %B to <4 x i64>
More information about the llvm-commits
mailing list