[llvm] c1a36ba - [DAGCombine][X86][ARM] EXTRACT_SUBVECTOR(VECTOR_SHUFFLE(?,?,Mask)) -> VECTOR_SHUFFLE(EXTRACT_SUBVECTOR(?, ?), EXTRACT_SUBVECTOR(?, ?), Mask')
Roman Lebedev via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 13 09:06:14 PST 2021
Author: Roman Lebedev
Date: 2021-12-13T20:03:44+03:00
New Revision: c1a36ba002b85c59b1f760db10c06ecab11f2c94
URL: https://github.com/llvm/llvm-project/commit/c1a36ba002b85c59b1f760db10c06ecab11f2c94
DIFF: https://github.com/llvm/llvm-project/commit/c1a36ba002b85c59b1f760db10c06ecab11f2c94.diff
LOG: [DAGCombine][X86][ARM] EXTRACT_SUBVECTOR(VECTOR_SHUFFLE(?,?,Mask)) -> VECTOR_SHUFFLE(EXTRACT_SUBVECTOR(?, ?), EXTRACT_SUBVECTOR(?, ?), Mask')
In most test changes this allows us to drop some broadcasts/shuffles.
Reviewed By: RKSimon
Differential Revision: https://reviews.llvm.org/D104156
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/test/CodeGen/ARM/vext.ll
llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
llvm/test/CodeGen/X86/vector-rotate-256.ll
llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
llvm/test/CodeGen/X86/vector-shift-shl-256.ll
llvm/test/CodeGen/X86/vector-trunc.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index fc6329302c374..912b5985c0e4f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -20744,6 +20744,156 @@ static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) {
return NewLd;
}
+/// Given EXTRACT_SUBVECTOR(VECTOR_SHUFFLE(Op0, Op1, Mask)),
+/// try to produce VECTOR_SHUFFLE(EXTRACT_SUBVECTOR(Op?, ?),
+/// EXTRACT_SUBVECTOR(Op?, ?),
+/// Mask'))
+/// iff it is legal and profitable to do so. Notably, the trimmed mask
+/// (containing only the elements that are extracted)
+/// must reference at most two subvectors.
+static SDValue foldExtractSubvectorFromShuffleVector(SDNode *N,
+ SelectionDAG &DAG,
+ const TargetLowering &TLI,
+ bool LegalOperations) {
+ assert(N->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ "Must only be called on EXTRACT_SUBVECTOR's");
+
+ SDValue N0 = N->getOperand(0);
+
+ // Only deal with non-scalable vectors.
+ EVT NarrowVT = N->getValueType(0);
+ EVT WideVT = N0.getValueType();
+ if (!NarrowVT.isFixedLengthVector() || !WideVT.isFixedLengthVector())
+ return SDValue();
+
+ // The operand must be a shufflevector.
+ auto *WideShuffleVector = dyn_cast<ShuffleVectorSDNode>(N0);
+ if (!WideShuffleVector)
+ return SDValue();
+
+ // The old shuffleneeds to go away.
+ if (!WideShuffleVector->hasOneUse())
+ return SDValue();
+
+ // And the narrow shufflevector that we'll form must be legal.
+ if (LegalOperations &&
+ !TLI.isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, NarrowVT))
+ return SDValue();
+
+ uint64_t FirstExtractedEltIdx = N->getConstantOperandVal(1);
+ int NumEltsExtracted = NarrowVT.getVectorNumElements();
+ assert((FirstExtractedEltIdx % NumEltsExtracted) == 0 &&
+ "Extract index is not a multiple of the output vector length.");
+
+ int WideNumElts = WideVT.getVectorNumElements();
+
+ SmallVector<int, 16> NewMask;
+ NewMask.reserve(NumEltsExtracted);
+ SmallSetVector<std::pair<SDValue /*Op*/, int /*SubvectorIndex*/>, 2>
+ DemandedSubvectors;
+
+ // Try to decode the wide mask into narrow mask from at most two subvectors.
+ for (int M : WideShuffleVector->getMask().slice(FirstExtractedEltIdx,
+ NumEltsExtracted)) {
+ assert((M >= -1) && (M < (2 * WideNumElts)) &&
+ "Out-of-bounds shuffle mask?");
+
+ if (M < 0) {
+ // Does not depend on operands, does not require adjustment.
+ NewMask.emplace_back(M);
+ continue;
+ }
+
+ // From which operand of the shuffle does this shuffle mask element pick?
+ int WideShufOpIdx = M / WideNumElts;
+ // Which element of that operand is picked?
+ int OpEltIdx = M % WideNumElts;
+
+ assert((OpEltIdx + WideShufOpIdx * WideNumElts) == M &&
+ "Shuffle mask vector decomposition failure.");
+
+ // And which NumEltsExtracted-sized subvector of that operand is that?
+ int OpSubvecIdx = OpEltIdx / NumEltsExtracted;
+ // And which element within that subvector of that operand is that?
+ int OpEltIdxInSubvec = OpEltIdx % NumEltsExtracted;
+
+ assert((OpEltIdxInSubvec + OpSubvecIdx * NumEltsExtracted) == OpEltIdx &&
+ "Shuffle mask subvector decomposition failure.");
+
+ assert((OpEltIdxInSubvec + OpSubvecIdx * NumEltsExtracted +
+ WideShufOpIdx * WideNumElts) == M &&
+ "Shuffle mask full decomposition failure.");
+
+ SDValue Op = WideShuffleVector->getOperand(WideShufOpIdx);
+
+ if (Op.isUndef()) {
+ // Picking from an undef operand. Let's adjust mask instead.
+ NewMask.emplace_back(-1);
+ continue;
+ }
+
+ // Profitability check: only deal with extractions from the first subvector.
+ if (OpSubvecIdx != 0)
+ return SDValue();
+
+ const std::pair<SDValue, int> DemandedSubvector =
+ std::make_pair(Op, OpSubvecIdx);
+
+ if (DemandedSubvectors.insert(DemandedSubvector)) {
+ if (DemandedSubvectors.size() > 2)
+ return SDValue(); // We can't handle more than two subvectors.
+ // How many elements into the WideVT does this subvector start?
+ int Index = NumEltsExtracted * OpSubvecIdx;
+ // Bail out if the extraction isn't going to be cheap.
+ if (!TLI.isExtractSubvectorCheap(NarrowVT, WideVT, Index))
+ return SDValue();
+ }
+
+ // Ok, but from which operand of the new shuffle will this element pick?
+ int NewOpIdx =
+ getFirstIndexOf(DemandedSubvectors.getArrayRef(), DemandedSubvector);
+ assert((NewOpIdx == 0 || NewOpIdx == 1) && "Unexpected operand index.");
+
+ int AdjM = OpEltIdxInSubvec + NewOpIdx * NumEltsExtracted;
+ NewMask.emplace_back(AdjM);
+ }
+ assert(NewMask.size() == (unsigned)NumEltsExtracted && "Produced bad mask.");
+ assert(DemandedSubvectors.size() <= 2 &&
+ "Should have ended up demanding at most two subvectors.");
+
+ // Did we discover that the shuffle does not actually depend on operands?
+ if (DemandedSubvectors.empty())
+ return DAG.getUNDEF(NarrowVT);
+
+ // We still perform the exact same EXTRACT_SUBVECTOR, just on
diff erent
+ // operand[s]/index[es], so there is no point in checking for it's legality.
+
+ // Do not turn a legal shuffle into an illegal one.
+ if (TLI.isShuffleMaskLegal(WideShuffleVector->getMask(), WideVT) &&
+ !TLI.isShuffleMaskLegal(NewMask, NarrowVT))
+ return SDValue();
+
+ SDLoc DL(N);
+
+ SmallVector<SDValue, 2> NewOps;
+ for (const std::pair<SDValue /*Op*/, int /*SubvectorIndex*/>
+ &DemandedSubvector : DemandedSubvectors) {
+ // How many elements into the WideVT does this subvector start?
+ int Index = NumEltsExtracted * DemandedSubvector.second;
+ SDValue IndexC = DAG.getVectorIdxConstant(Index, DL);
+ NewOps.emplace_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowVT,
+ DemandedSubvector.first, IndexC));
+ }
+ assert((NewOps.size() == 1 || NewOps.size() == 2) &&
+ "Should end up with either one or two ops");
+
+ // If we ended up with only one operand, pad with an undef.
+ if (NewOps.size() == 1)
+ NewOps.emplace_back(DAG.getUNDEF(NarrowVT));
+
+ return DAG.getVectorShuffle(NarrowVT, DL, NewOps[0], NewOps[1], NewMask);
+}
+
SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
EVT NVT = N->getValueType(0);
SDValue V = N->getOperand(0);
@@ -20857,6 +21007,10 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
}
}
+ if (SDValue V =
+ foldExtractSubvectorFromShuffleVector(N, DAG, TLI, LegalOperations))
+ return V;
+
V = peekThroughBitcasts(V);
// If the input is a build vector. Try to make a smaller build vector.
diff --git a/llvm/test/CodeGen/ARM/vext.ll b/llvm/test/CodeGen/ARM/vext.ll
index 5ff04ef0dd196..3a47d3d95dfb7 100644
--- a/llvm/test/CodeGen/ARM/vext.ll
+++ b/llvm/test/CodeGen/ARM/vext.ll
@@ -222,7 +222,7 @@ define <4 x i16> @test_multisource(<32 x i16>* %B) nounwind {
; CHECK-NEXT: vorr d23, d20, d20
; CHECK-NEXT: vldr d22, [r0]
; CHECK-NEXT: vzip.16 d23, d22
-; CHECK-NEXT: vtrn.16 q8, q9
+; CHECK-NEXT: vtrn.16 d16, d18
; CHECK-NEXT: vext.16 d18, d20, d23, #2
; CHECK-NEXT: vext.16 d16, d18, d16, #2
; CHECK-NEXT: vext.16 d16, d16, d16, #2
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
index 7fc8812def9eb..79eee6798a58d 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
@@ -659,8 +659,8 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind
;
; XOPAVX1-LABEL: splatvar_funnnel_v4i64:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vprotq %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -668,8 +668,8 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind
;
; XOPAVX2-LABEL: splatvar_funnnel_v4i64:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
; XOPAVX2-NEXT: vprotq %xmm1, %xmm2, %xmm2
; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
@@ -754,8 +754,8 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind
;
; XOPAVX1-LABEL: splatvar_funnnel_v8i32:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; XOPAVX1-NEXT: vprotd %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -763,8 +763,8 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind
;
; XOPAVX2-LABEL: splatvar_funnnel_v8i32:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1
; XOPAVX2-NEXT: vprotd %xmm1, %xmm2, %xmm2
; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
@@ -868,9 +868,9 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
;
; XOPAVX1-LABEL: splatvar_funnnel_v16i16:
; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vprotw %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -878,8 +878,8 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
;
; XOPAVX2-LABEL: splatvar_funnnel_v16i16:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1
; XOPAVX2-NEXT: vprotw %xmm1, %xmm2, %xmm2
; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
@@ -1018,8 +1018,8 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind
;
; XOPAVX2-LABEL: splatvar_funnnel_v32i8:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; XOPAVX2-NEXT: vprotb %xmm1, %xmm2, %xmm2
; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
index 4f3e9100d7944..430e06a233a26 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
@@ -700,9 +700,9 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind
;
; XOPAVX1-LABEL: splatvar_funnnel_v4i64:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vprotq %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0
@@ -727,7 +727,6 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind
define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind {
; AVX1-LABEL: splatvar_funnnel_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
@@ -747,7 +746,6 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind
;
; AVX2-LABEL: splatvar_funnnel_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpsubd %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
@@ -832,8 +830,6 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind
define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind {
; AVX1-LABEL: splatvar_funnnel_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
@@ -853,7 +849,6 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
;
; AVX2-LABEL: splatvar_funnnel_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
@@ -868,7 +863,6 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
;
; AVX512F-LABEL: splatvar_funnnel_v16i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpsubw %xmm1, %xmm2, %xmm1
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
@@ -883,7 +877,6 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
;
; AVX512VL-LABEL: splatvar_funnnel_v16i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpsubw %xmm1, %xmm2, %xmm1
; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
@@ -898,7 +891,6 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
;
; AVX512BW-LABEL: splatvar_funnnel_v16i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT: vpsubw %xmm1, %xmm2, %xmm1
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
@@ -913,7 +905,6 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
;
; AVX512VLBW-LABEL: splatvar_funnnel_v16i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm2, %xmm1
; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
@@ -942,10 +933,10 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
;
; XOPAVX1-LABEL: splatvar_funnnel_v16i16:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vprotw %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
index ee1465877a4f2..4a20405790a23 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
@@ -452,7 +452,6 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %amt) nounw
define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpsubw %xmm1, %xmm2, %xmm1
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
@@ -472,7 +471,6 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw
;
; AVX512VL-LABEL: splatvar_funnnel_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpsubw %xmm1, %xmm2, %xmm1
; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll
index bc82f32007ddd..5cf1fe1c9e560 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-256.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll
@@ -634,8 +634,8 @@ define <4 x i64> @splatvar_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
;
; XOPAVX1-LABEL: splatvar_rotate_v4i64:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vprotq %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -643,8 +643,8 @@ define <4 x i64> @splatvar_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
;
; XOPAVX2-LABEL: splatvar_rotate_v4i64:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
; XOPAVX2-NEXT: vprotq %xmm1, %xmm2, %xmm2
; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
@@ -732,8 +732,8 @@ define <8 x i32> @splatvar_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
;
; XOPAVX1-LABEL: splatvar_rotate_v8i32:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; XOPAVX1-NEXT: vprotd %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -741,8 +741,8 @@ define <8 x i32> @splatvar_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
;
; XOPAVX2-LABEL: splatvar_rotate_v8i32:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1
; XOPAVX2-NEXT: vprotd %xmm1, %xmm2, %xmm2
; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
@@ -849,9 +849,9 @@ define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind
;
; XOPAVX1-LABEL: splatvar_rotate_v16i16:
; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vprotw %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -859,8 +859,8 @@ define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind
;
; XOPAVX2-LABEL: splatvar_rotate_v16i16:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1
; XOPAVX2-NEXT: vprotw %xmm1, %xmm2, %xmm2
; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
@@ -998,8 +998,8 @@ define <32 x i8> @splatvar_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
;
; XOPAVX2-LABEL: splatvar_rotate_v32i8:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; XOPAVX2-NEXT: vprotb %xmm1, %xmm2, %xmm2
; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
index a9bcc73f97fe1..9cb4ff8cffdaa 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
@@ -677,9 +677,9 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
;
; XOPAVX1-LABEL: splatvar_shift_v4i64:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpshaq %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshaq %xmm1, %xmm0, %xmm0
@@ -1033,9 +1033,9 @@ define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwi
; XOPAVX1-LABEL: splatvar_modulo_shift_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpshaq %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshaq %xmm1, %xmm0, %xmm0
@@ -1322,8 +1322,8 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
;
; AVX512BW-LABEL: splatvar_modulo_shift_v32i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
+; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsraw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
@@ -1346,8 +1346,8 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
;
; AVX512BWVL-LABEL: splatvar_modulo_shift_v32i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
+; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BWVL-NEXT: vpsraw %xmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
index 44256db3979df..6a0c479ec2399 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
@@ -1095,8 +1095,8 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
;
; AVX512BW-LABEL: splatvar_modulo_shift_v32i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
@@ -1116,8 +1116,8 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
;
; AVX512BWVL-LABEL: splatvar_modulo_shift_v32i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BWVL-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
index fd0ee8f051543..d6aac50ffdbc2 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
@@ -669,8 +669,8 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
;
; XOPAVX2-LABEL: splatvar_shift_v32i8:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; XOPAVX2-NEXT: vpshlb %xmm1, %xmm2, %xmm2
; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
@@ -995,9 +995,9 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
;
; XOPAVX2-LABEL: splatvar_modulo_shift_v32i8:
; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
-; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT: vpshlb %xmm1, %xmm2, %xmm2
; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
@@ -1016,8 +1016,8 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
;
; AVX512BW-LABEL: splatvar_modulo_shift_v32i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
@@ -1036,8 +1036,8 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
;
; AVX512BWVL-LABEL: splatvar_modulo_shift_v32i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BWVL-NEXT: vpsllw %xmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll
index c035d7d2073a7..649ff574fb33b 100644
--- a/llvm/test/CodeGen/X86/vector-trunc.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc.ll
@@ -4,8 +4,8 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST,AVX2-FAST-ALL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST,AVX2-FAST-PERLANE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-ALL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
@@ -1902,25 +1902,11 @@ define <8 x i16> @PR32160(<8 x i32> %x) {
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
; SSE-NEXT: retq
;
-; AVX1-LABEL: PR32160:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: PR32160:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
-; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: PR32160:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9]
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
+; AVX-LABEL: PR32160:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9]
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
;
; AVX512F-LABEL: PR32160:
; AVX512F: # %bb.0:
More information about the llvm-commits
mailing list