[llvm] 62e36b1 - [X86] canLowerByDroppingEvenElements - generalize to drop even or odd elements
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 18 07:07:50 PST 2022
Author: Simon Pilgrim
Date: 2022-01-18T15:07:24Z
New Revision: 62e36b1207497c4f7e1191a8d5407f4578c686e5
URL: https://github.com/llvm/llvm-project/commit/62e36b1207497c4f7e1191a8d5407f4578c686e5
DIFF: https://github.com/llvm/llvm-project/commit/62e36b1207497c4f7e1191a8d5407f4578c686e5.diff
LOG: [X86] canLowerByDroppingEvenElements - generalize to drop even or odd elements
This allows us to match shuffle<1,3,5,7,9,11,13,15>-style shift+trunc/pack patterns as well as the existing shuffle<0,2,4,6,8,10,12,14>-style trunc/pack patterns.
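For example, the two-input odd-element case (shuffle_v8i16_13579bdf in vector-shuffle-128-v8.ll, updated further down) previously needed two PSHUFBs plus an unpack on SSE4.1; with this change it becomes a shift-and-pack sequence. The IR is roughly:

  %r = shufflevector <8 x i16> %a, <8 x i16> %b,
                     <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>

and the new SSE4.1 lowering (taken from the updated CHECK lines) is:

  psrld    $16, %xmm1
  psrld    $16, %xmm0
  packusdw %xmm1, %xmm0

Pre-SSE4.1 targets get the equivalent signed form, psrad $16 + packssdw.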
In the future, interleaving patterns might benefit from an even more general implementation that handles higher strides.
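For reference, the stride test that this patch parameterizes can be modeled in isolation like this. This is only an illustrative sketch, simplified to the N = 1 (drop-once) case; the function and variable names here are hypothetical and not LLVM API:

  // Hypothetical standalone model of the generalized check, not LLVM code.
  #include <cassert>
  #include <cstddef>
  #include <cstdint>
  #include <vector>

  // Every defined lane i must equal i*2 (+1 when matching odd elements),
  // wrapped by the mask width (doubled for two-input shuffles). The real
  // canLowerByDroppingElements checks strides 2, 4 and 8 simultaneously
  // and returns N; subtracting Offset is the whole generalization.
  bool matchesDropOnce(const std::vector<int> &Mask, bool MatchEven,
                       bool IsSingleInput) {
    uint64_t ModMask = Mask.size() * (IsSingleInput ? 1 : 2) - 1;
    int Offset = MatchEven ? 0 : 1;
    for (std::size_t i = 0; i != Mask.size(); ++i) {
      if (Mask[i] < 0)
        continue; // undef lanes always match
      if ((uint64_t)(Mask[i] - Offset) != (((uint64_t)i << 1) & ModMask))
        return false;
    }
    return true;
  }

  int main() {
    std::vector<int> Odd = {1, 3, 5, 7, 9, 11, 13, 15};  // v8i16, two inputs
    std::vector<int> Even = {0, 2, 4, 6, 8, 10, 12, 14};
    assert(matchesDropOnce(Odd, /*MatchEven=*/false, /*IsSingleInput=*/false));
    assert(matchesDropOnce(Even, /*MatchEven=*/true, /*IsSingleInput=*/false));
    return 0;
  }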
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/haddsub-shuf.ll
llvm/test/CodeGen/X86/phaddsub.ll
llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
llvm/test/CodeGen/X86/x86-interleaved-access.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index bb48cd1d5362c..a0e1f4aa6ca47 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -12177,12 +12177,13 @@ static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
return SDValue();
}
-/// Check whether a compaction lowering can be done by dropping even
-/// elements and compute how many times even elements must be dropped.
+/// Check whether a compaction lowering can be done by dropping even/odd
+/// elements and compute how many times even/odd elements must be dropped.
///
/// This handles shuffles which take every Nth element where N is a power of
/// two. Example shuffle masks:
///
+/// (even)
/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
@@ -12190,16 +12191,20 @@ static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
///
+/// (odd)
+/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
+/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+///
/// Any of these lanes can of course be undef.
///
/// This routine only supports N <= 3.
/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
/// for larger N.
///
-/// \returns N above, or the number of times even elements must be dropped if
-/// there is such a number. Otherwise returns zero.
-static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
- bool IsSingleInput) {
+/// \returns N above, or the number of times even/odd elements must be dropped
+/// if there is such a number. Otherwise returns zero.
+static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
+ bool IsSingleInput) {
// The modulus for the shuffle vector entries is based on whether this is
// a single input or not.
int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
@@ -12207,6 +12212,7 @@ static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
"We should only be called with masks with a power-of-2 size!");
uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
+ int Offset = MatchEven ? 0 : 1;
// We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
// and 2^3 simultaneously. This is because we may have ambiguity with
@@ -12225,7 +12231,7 @@ static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
uint64_t N = j + 1;
// The shuffle mask must be equal to (i * 2^N) % M.
- if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
+ if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
IsAnyViable = true;
else
ViableForN[j] = false;
@@ -15739,7 +15745,7 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Attempt to lower using compaction, SSE41 is necessary for PACKUSDW.
// We could use SIGN_EXTEND_INREG+PACKSSDW for older targets but this seems to
// be slower than a PSHUFLW+PSHUFHW+PSHUFD chain.
- int NumEvenDrops = canLowerByDroppingEvenElements(Mask, false);
+ int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
if ((NumEvenDrops == 1 || NumEvenDrops == 2) && Subtarget.hasSSE41() &&
!Subtarget.hasVLX()) {
// Check if this is part of a 256-bit vector truncation.
@@ -15773,6 +15779,20 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Result;
}
+ // When compacting odd (upper) elements, use PACKSS pre-SSE41.
+ int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
+ if (NumOddDrops == 1) {
+ bool HasSSE41 = Subtarget.hasSSE41();
+ V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
+ DAG.getBitcast(MVT::v4i32, V1),
+ DAG.getTargetConstant(16, DL, MVT::i8));
+ V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
+ DAG.getBitcast(MVT::v4i32, V2),
+ DAG.getTargetConstant(16, DL, MVT::i8));
+ return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
+ MVT::v8i16, V1, V2);
+ }
+
// Try to lower by permuting the inputs into an unpack instruction.
if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
Mask, Subtarget, DAG))
@@ -16039,7 +16059,7 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Check for compaction patterns.
bool IsSingleInput = V2.isUndef();
- int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput);
+ int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
// Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
// with PSHUFB. It is important to do this before we attempt to generate any
@@ -16150,6 +16170,19 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Result;
}
+ int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
+ if (NumOddDrops == 1) {
+ V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
+ DAG.getBitcast(MVT::v8i16, V1),
+ DAG.getTargetConstant(8, DL, MVT::i8));
+ if (!IsSingleInput)
+ V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
+ DAG.getBitcast(MVT::v8i16, V2),
+ DAG.getTargetConstant(8, DL, MVT::i8));
+ return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
+ IsSingleInput ? V1 : V2);
+ }
+
// Handle multi-input cases by blending/unpacking single-input shuffles.
if (NumV2Elements > 0)
return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
diff --git a/llvm/test/CodeGen/X86/haddsub-shuf.ll b/llvm/test/CodeGen/X86/haddsub-shuf.ll
index 06f36146baae5..2298774142981 100644
--- a/llvm/test/CodeGen/X86/haddsub-shuf.ll
+++ b/llvm/test/CodeGen/X86/haddsub-shuf.ll
@@ -717,22 +717,17 @@ define <16 x i16> @hadd_v16i16a(<16 x i16> %a) {
; SSE3: # %bb.0:
; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7]
; SSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; SSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
+; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
+; SSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE3-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,2,2,3,4,5,6,7]
-; SSE3-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
-; SSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
-; SSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; SSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
-; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,0,3,2,4,5,6,7]
-; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
-; SSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[1,0,3,2,4,5,6,7]
-; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE3-NEXT: paddw %xmm3, %xmm1
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
+; SSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE3-NEXT: psrad $16, %xmm1
+; SSE3-NEXT: psrad $16, %xmm0
+; SSE3-NEXT: packssdw %xmm1, %xmm0
+; SSE3-NEXT: paddw %xmm0, %xmm2
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
+; SSE3-NEXT: movdqa %xmm2, %xmm1
; SSE3-NEXT: retq
;
; SSSE3_SLOW-LABEL: hadd_v16i16a:
@@ -867,15 +862,9 @@ define <16 x i16> @hsub_v16i16a(<16 x i16> %a) {
; SSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; SSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
-; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
-; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
-; SSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
-; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE3-NEXT: psrad $16, %xmm1
+; SSE3-NEXT: psrad $16, %xmm0
+; SSE3-NEXT: packssdw %xmm1, %xmm0
; SSE3-NEXT: psubw %xmm0, %xmm2
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
; SSE3-NEXT: movdqa %xmm2, %xmm1
@@ -1380,39 +1369,27 @@ define <16 x i16> @hadd_16i16_16i16_shuffle(<16 x i16> %a0, <16 x i16> %a1) {
; SSE3: # %bb.0:
; SSE3-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,2,2,3,4,5,6,7]
; SSE3-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
+; SSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,2,2,3]
+; SSE3-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,2,2,3,4,5,6,7]
+; SSE3-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE3-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[0,2,2,3,4,5,6,7]
+; SSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; SSE3-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[0,2,2,3,4,5,6,7]
; SSE3-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7]
; SSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
-; SSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
-; SSE3-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,2,2,3,4,5,6,7]
-; SSE3-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
-; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE3-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7]
; SSE3-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7]
; SSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
-; SSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm4[0]
-; SSE3-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
-; SSE3-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7]
-; SSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE3-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7]
-; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
-; SSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7]
-; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
-; SSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE3-NEXT: paddw %xmm5, %xmm2
-; SSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; SSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
-; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
-; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
-; SSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
-; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm5[0]
+; SSE3-NEXT: psrad $16, %xmm3
+; SSE3-NEXT: psrad $16, %xmm2
+; SSE3-NEXT: packssdw %xmm3, %xmm2
+; SSE3-NEXT: paddw %xmm2, %xmm4
+; SSE3-NEXT: psrad $16, %xmm1
+; SSE3-NEXT: psrad $16, %xmm0
+; SSE3-NEXT: packssdw %xmm1, %xmm0
; SSE3-NEXT: paddw %xmm6, %xmm0
-; SSE3-NEXT: movdqa %xmm2, %xmm1
+; SSE3-NEXT: movdqa %xmm4, %xmm1
; SSE3-NEXT: retq
;
; SSSE3-LABEL: hadd_16i16_16i16_shuffle:
diff --git a/llvm/test/CodeGen/X86/phaddsub.ll b/llvm/test/CodeGen/X86/phaddsub.ll
index 139e5c8ee8421..12c7a092d6c18 100644
--- a/llvm/test/CodeGen/X86/phaddsub.ll
+++ b/llvm/test/CodeGen/X86/phaddsub.ll
@@ -271,12 +271,11 @@ define <4 x i32> @phsubd4(<4 x i32> %x) {
define <8 x i16> @phsubw1_reverse(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phsubw1_reverse:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
-; SSSE3-NEXT: movdqa %xmm1, %xmm4
-; SSSE3-NEXT: pshufb %xmm3, %xmm4
+; SSSE3-NEXT: movdqa %xmm1, %xmm3
+; SSSE3-NEXT: psrad $16, %xmm3
; SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSSE3-NEXT: pshufb %xmm3, %xmm2
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; SSSE3-NEXT: psrad $16, %xmm2
+; SSSE3-NEXT: packssdw %xmm3, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT: pshufb %xmm3, %xmm1
; SSSE3-NEXT: pshufb %xmm3, %xmm0
@@ -287,10 +286,9 @@ define <8 x i16> @phsubw1_reverse(<8 x i16> %x, <8 x i16> %y) {
;
; AVX-LABEL: phsubw1_reverse:
; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
-; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm3
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm2
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX-NEXT: vpsrld $16, %xmm1, %xmm2
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm3
+; AVX-NEXT: vpackusdw %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4],xmm3[5],xmm1[6],xmm3[7]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll
index fa1f1c9248afc..dada2dc83a708 100644
--- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll
+++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll
@@ -44,10 +44,9 @@ define void @shuffle_v16i16_to_v8i16_1(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
-; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: vpsrld $16, %xmm1, %xmm1
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rsi)
; AVX-NEXT: retq
;
@@ -55,40 +54,36 @@ define void @shuffle_v16i16_to_v8i16_1(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
-; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT: vpsrld $16, %xmm1, %xmm1
+; AVX512F-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX512F-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vpsrld $16, 16(%rdi), %xmm0
+; AVX512VL-NEXT: vpsrld $16, (%rdi), %xmm1
+; AVX512VL-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [1,3,5,7,33,35,37,39]
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
-; AVX512BW-NEXT: vmovdqa %xmm1, (%rsi)
-; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT: vpsrld $16, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX512BW-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,3,5,7,9,11,13,15]
-; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
-; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi)
+; AVX512BWVL-NEXT: vpsrld $16, 16(%rdi), %xmm0
+; AVX512BWVL-NEXT: vpsrld $16, (%rdi), %xmm1
+; AVX512BWVL-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
+; AVX512BWVL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %L
%strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
index 66491b864e9ad..3a371a4241046 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
@@ -101,15 +101,9 @@ define void @vf8(<16 x i16>* %in.vec, <8 x i16>* %out.vec0, <8 x i16>* %out.vec1
; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm3, (%rsi)
; SSE-NEXT: movdqa %xmm0, (%rdx)
; SSE-NEXT: retq
@@ -122,10 +116,9 @@ define void @vf8(<16 x i16>* %in.vec, <8 x i16>* %out.vec0, <8 x i16>* %out.vec1
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
; AVX-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
-; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX-NEXT: vpsrld $16, %xmm2, %xmm2
+; AVX-NEXT: vpsrld $16, %xmm1, %xmm1
+; AVX-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmovdqa %xmm0, (%rsi)
; AVX-NEXT: vmovdqa %xmm1, (%rdx)
; AVX-NEXT: retq
@@ -133,11 +126,11 @@ define void @vf8(<16 x i16>* %in.vec, <8 x i16>* %out.vec0, <8 x i16>* %out.vec1
; AVX512-LABEL: vf8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512-NEXT: vmovdqa (%rdi), %xmm1
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15]
-; AVX512-NEXT: vpermi2w 16(%rdi), %xmm1, %xmm2
+; AVX512-NEXT: vpsrld $16, 16(%rdi), %xmm1
+; AVX512-NEXT: vpsrld $16, (%rdi), %xmm2
+; AVX512-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
; AVX512-NEXT: vpmovdw %ymm0, (%rsi)
-; AVX512-NEXT: vmovdqa %xmm2, (%rdx)
+; AVX512-NEXT: vmovdqa %xmm1, (%rdx)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%wide.vec = load <16 x i16>, <16 x i16>* %in.vec, align 32
@@ -155,45 +148,33 @@ define void @vf16(<32 x i16>* %in.vec, <16 x i16>* %out.vec0, <16 x i16>* %out.v
; SSE-LABEL: vf16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
-; SSE-NEXT: movdqa 16(%rdi), %xmm1
-; SSE-NEXT: movdqa 32(%rdi), %xmm2
+; SSE-NEXT: movdqa 16(%rdi), %xmm2
+; SSE-NEXT: movdqa 32(%rdi), %xmm1
; SSE-NEXT: movdqa 48(%rdi), %xmm3
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm4[0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: psrad $16, %xmm3
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: packssdw %xmm3, %xmm1
+; SSE-NEXT: psrad $16, %xmm2
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm2, %xmm0
; SSE-NEXT: movdqa %xmm6, (%rsi)
; SSE-NEXT: movdqa %xmm5, 16(%rsi)
; SSE-NEXT: movdqa %xmm0, (%rdx)
-; SSE-NEXT: movdqa %xmm2, 16(%rdx)
+; SSE-NEXT: movdqa %xmm1, 16(%rdx)
; SSE-NEXT: retq
;
; AVX1-LABEL: vf16:
@@ -209,18 +190,16 @@ define void @vf16(<32 x i16>* %in.vec, <16 x i16>* %out.vec0, <16 x i16>* %out.v
; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
; AVX1-NEXT: vpackusdw %xmm6, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: vpsrld $16, %xmm4, %xmm4
+; AVX1-NEXT: vpsrld $16, %xmm3, %xmm3
+; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2
+; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa %xmm0, (%rsi)
; AVX1-NEXT: vmovdqa %xmm5, 16(%rsi)
-; AVX1-NEXT: vmovaps %ymm1, (%rdx)
-; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm3, 16(%rdx)
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: vf16:
@@ -284,133 +263,107 @@ define void @vf16(<32 x i16>* %in.vec, <16 x i16>* %out.vec0, <16 x i16>* %out.v
define void @vf32(<64 x i16>* %in.vec, <32 x i16>* %out.vec0, <32 x i16>* %out.vec1) nounwind {
; SSE-LABEL: vf32:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa 64(%rdi), %xmm8
-; SSE-NEXT: movdqa 80(%rdi), %xmm9
-; SSE-NEXT: movdqa 96(%rdi), %xmm12
-; SSE-NEXT: movdqa 112(%rdi), %xmm0
-; SSE-NEXT: movdqa (%rdi), %xmm10
+; SSE-NEXT: movdqa 64(%rdi), %xmm1
+; SSE-NEXT: movdqa 80(%rdi), %xmm10
+; SSE-NEXT: movdqa 96(%rdi), %xmm0
+; SSE-NEXT: movdqa 112(%rdi), %xmm7
+; SSE-NEXT: movdqa (%rdi), %xmm3
; SSE-NEXT: movdqa 16(%rdi), %xmm11
-; SSE-NEXT: movdqa 32(%rdi), %xmm4
+; SSE-NEXT: movdqa 32(%rdi), %xmm2
; SSE-NEXT: movdqa 48(%rdi), %xmm5
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,2,2,3]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm13 = xmm13[0],xmm2[0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm10[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm2[0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,0,3,2,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm12[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,0,3,2,4,5,6,7]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,2,2,3]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm4[0]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm7[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,2,2,3]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm4[0]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm11[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm4[0,2,2,3]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm12 = xmm12[0],xmm6[0]
+; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm0[0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: movdqa %xmm2, 32(%rsi)
-; SSE-NEXT: movdqa %xmm7, (%rsi)
-; SSE-NEXT: movdqa %xmm3, 48(%rsi)
-; SSE-NEXT: movdqa %xmm13, 16(%rsi)
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
+; SSE-NEXT: psrad $16, %xmm5
+; SSE-NEXT: psrad $16, %xmm2
+; SSE-NEXT: packssdw %xmm5, %xmm2
+; SSE-NEXT: psrad $16, %xmm7
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm7, %xmm0
+; SSE-NEXT: psrad $16, %xmm11
+; SSE-NEXT: psrad $16, %xmm3
+; SSE-NEXT: packssdw %xmm11, %xmm3
+; SSE-NEXT: psrad $16, %xmm10
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: packssdw %xmm10, %xmm1
+; SSE-NEXT: movdqa %xmm4, 32(%rsi)
+; SSE-NEXT: movdqa %xmm12, (%rsi)
+; SSE-NEXT: movdqa %xmm9, 48(%rsi)
+; SSE-NEXT: movdqa %xmm8, 16(%rsi)
; SSE-NEXT: movdqa %xmm1, 32(%rdx)
-; SSE-NEXT: movdqa %xmm6, (%rdx)
-; SSE-NEXT: movdqa %xmm5, 48(%rdx)
-; SSE-NEXT: movdqa %xmm4, 16(%rdx)
+; SSE-NEXT: movdqa %xmm3, (%rdx)
+; SSE-NEXT: movdqa %xmm0, 48(%rdx)
+; SSE-NEXT: movdqa %xmm2, 16(%rdx)
; SSE-NEXT: retq
;
; AVX1-LABEL: vf32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa 112(%rdi), %xmm10
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm1[1],xmm10[2],xmm1[3],xmm10[4],xmm1[5],xmm10[6],xmm1[7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0],xmm0[1],xmm10[2],xmm0[3],xmm10[4],xmm0[5],xmm10[6],xmm0[7]
; AVX1-NEXT: vmovdqa 96(%rdi), %xmm11
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0],xmm1[1],xmm11[2],xmm1[3],xmm11[4],xmm1[5],xmm11[6],xmm1[7]
-; AVX1-NEXT: vpackusdw %xmm0, %xmm4, %xmm8
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0],xmm0[1],xmm11[2],xmm0[3],xmm11[4],xmm0[5],xmm11[6],xmm0[7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm4, %xmm8
; AVX1-NEXT: vmovdqa 80(%rdi), %xmm12
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm1[1],xmm12[2],xmm1[3],xmm12[4],xmm1[5],xmm12[6],xmm1[7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm0[1],xmm12[2],xmm0[3],xmm12[4],xmm0[5],xmm12[6],xmm0[7]
; AVX1-NEXT: vmovdqa 64(%rdi), %xmm6
-; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm1[1],xmm6[2],xmm1[3],xmm6[4],xmm1[5],xmm6[6],xmm1[7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm0[1],xmm6[2],xmm0[3],xmm6[4],xmm0[5],xmm6[6],xmm0[7]
; AVX1-NEXT: vpackusdw %xmm5, %xmm7, %xmm9
; AVX1-NEXT: vmovdqa (%rdi), %xmm7
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm5
-; AVX1-NEXT: vmovdqa 48(%rdi), %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm1[1],xmm5[2],xmm1[3],xmm5[4],xmm1[5],xmm5[6],xmm1[7]
+; AVX1-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm0[1],xmm5[2],xmm0[3],xmm5[4],xmm0[5],xmm5[6],xmm0[7]
; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1],xmm7[2],xmm1[3],xmm7[4],xmm1[5],xmm7[6],xmm1[7]
-; AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0]
-; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm4, %xmm7, %xmm5
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm5[0],xmm0[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vpshufb %xmm4, %xmm10, %xmm2
-; AVX1-NEXT: vpshufb %xmm4, %xmm11, %xmm5
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0]
-; AVX1-NEXT: vpshufb %xmm4, %xmm12, %xmm5
-; AVX1-NEXT: vpshufb %xmm4, %xmm6, %xmm4
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
-; AVX1-NEXT: vmovdqa %xmm1, (%rsi)
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2],xmm0[3],xmm7[4],xmm0[5],xmm7[6],xmm0[7]
+; AVX1-NEXT: vpackusdw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1
+; AVX1-NEXT: vpsrld $16, %xmm5, %xmm4
+; AVX1-NEXT: vpackusdw %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2
+; AVX1-NEXT: vpsrld $16, %xmm7, %xmm4
+; AVX1-NEXT: vpackusdw %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpsrld $16, %xmm10, %xmm4
+; AVX1-NEXT: vpsrld $16, %xmm11, %xmm5
+; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpsrld $16, %xmm12, %xmm5
+; AVX1-NEXT: vpsrld $16, %xmm6, %xmm6
+; AVX1-NEXT: vpackusdw %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vmovdqa %xmm0, (%rsi)
; AVX1-NEXT: vmovdqa %xmm3, 16(%rsi)
; AVX1-NEXT: vmovdqa %xmm9, 32(%rsi)
; AVX1-NEXT: vmovdqa %xmm8, 48(%rsi)
-; AVX1-NEXT: vmovaps %ymm2, 32(%rdx)
-; AVX1-NEXT: vmovaps %ymm0, (%rdx)
-; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: vmovdqa %xmm5, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm4, 48(%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 16(%rdx)
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: vf32:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
index 8baf216591dd0..c510756848067 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -2958,54 +2958,38 @@ define <8 x i16> @shuffle_v8i16_02468ace(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_13579bdf(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_13579bdf:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_13579bdf:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
-; SSSE3-NEXT: pshufb %xmm2, %xmm1
-; SSSE3-NEXT: pshufb %xmm2, %xmm0
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: psrad $16, %xmm1
+; SSSE3-NEXT: psrad $16, %xmm0
+; SSSE3-NEXT: packssdw %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_13579bdf:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
-; SSE41-NEXT: pshufb %xmm2, %xmm1
-; SSE41-NEXT: pshufb %xmm2, %xmm0
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE41-NEXT: psrld $16, %xmm1
+; SSE41-NEXT: psrld $16, %xmm0
+; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuffle_v8i16_13579bdf:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v8i16_13579bdf:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i16_13579bdf:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15]
-; AVX512VL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v8i16_13579bdf:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpsrld $16, %xmm1, %xmm1
+; AVX2OR512VL-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX2OR512VL-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2OR512VL-NEXT: retq
;
; XOP-LABEL: shuffle_v8i16_13579bdf:
; XOP: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index 127972d9ce710..dce96d4941c0f 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -432,17 +432,14 @@ define <8 x i8> @interleaved_load_vf8_i8_stride4(<32 x i8>* %ptr) {
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,2,3,14,15,10,11,14,15,10,11,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[1,0,3,2,4,5,6,7]
-; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[1,0,3,2,4,5,6,7]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <3,u,7,u,11,u,15,u,7,u,15,u,6,u,7,u>
+; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm4
+; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <3,u,1,u,7,u,5,u,1,u,5,u,0,u,1,u>
-; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpmullw %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX1-NEXT: retq
@@ -460,17 +457,14 @@ define <8 x i8> @interleaved_load_vf8_i8_stride4(<32 x i8>* %ptr) {
; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
; AVX2-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
; AVX2-NEXT: vpaddb %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,2,3,14,15,10,11,14,15,10,11,12,13,14,15]
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[1,0,3,2,4,5,6,7]
-; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[1,0,3,2,4,5,6,7]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <3,u,7,u,11,u,15,u,7,u,15,u,6,u,7,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm4
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm3
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <3,u,1,u,7,u,5,u,1,u,5,u,0,u,1,u>
-; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpsrld $16, %xmm1, %xmm1
+; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpaddb %xmm0, %xmm3, %xmm0
; AVX2-NEXT: vpmullw %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX2-NEXT: retq
@@ -479,16 +473,14 @@ define <8 x i8> @interleaved_load_vf8_i8_stride4(<32 x i8>* %ptr) {
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,2,3,14,15,10,11,14,15,10,11,12,13,14,15]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <3,u,7,u,11,u,15,u,7,u,15,u,6,u,7,u>
; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm3
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[1,0,3,2,4,5,6,7]
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm2
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[1,0,3,2,4,5,6,7]
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm5[0],xmm4[0]
-; AVX512-NEXT: vprold $8, %zmm3, %zmm3
-; AVX512-NEXT: vprold $8, %zmm2, %zmm2
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512-NEXT: vpaddb %xmm4, %xmm2, %xmm2
+; AVX512-NEXT: vpsrld $16, %xmm1, %xmm3
+; AVX512-NEXT: vpsrld $16, %xmm0, %xmm4
+; AVX512-NEXT: vpackusdw %xmm3, %xmm4, %xmm3
+; AVX512-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <1,u,5,u,9,u,13,u,13,u,5,u,12,u,13,u>
; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0