[llvm-branch-commits] [llvm] Add DoNotPoisonEltMask to several SimplifyDemanded functions in TargetLowering (PR #145903)
Björn Pettersson via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Nov 10 02:37:34 PST 2025
https://github.com/bjope updated https://github.com/llvm/llvm-project/pull/145903
From b3279628d4311413c2a3186b64377ca64afa6b80 Mon Sep 17 00:00:00 2001
From: Bjorn Pettersson <bjorn.a.pettersson at ericsson.com>
Date: Thu, 6 Nov 2025 23:59:30 +0100
Subject: [PATCH 1/4] [SelectionDAG] Add DoNotPoisonEltMask to
SimplifyDemandedVectorElts
The fix for #138513 resulted in a number of regressions, because we
had to demand all source elements that a bitcast maps to a demanded
result element, even when the bits from those elements weren't
actually used. The problem was that if we did not demand those
elements, the calls to SimplifyDemandedVectorElts could end up
turning the unused elements into poison, making the bitcast result
poison.

This patch avoids such regressions by adding a new element mask
('DoNotPoisonEltMask') to SimplifyDemandedVectorElts that identifies
elements that aren't really demanded but must not be made more
poisonous during simplifications.
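
To make the intent of the two masks concrete, below is a small
standalone sketch (not part of the patch) that models how the
little-endian bitcast path in the TargetLowering.cpp hunk splits the
source elements into "demanded" and "do not poison" sets. The vector
types and mask values are made-up example inputs; only llvm::APInt
and llvm::APIntOps::ScaleBitMask are real LLVM APIs.

  // Sketch: v4i32 bitcast to v2i64, demanding only element 1 of the
  // v2i64 and only the low 32 bits of that element.
  #include "llvm/ADT/APInt.h"
  #include <cstdio>

  using llvm::APInt;

  static void printMask(const char *Name, const APInt &M) {
    std::printf("%s: ", Name);
    for (unsigned i = M.getBitWidth(); i-- > 0;)
      std::putchar(M[i] ? '1' : '0');
    std::putchar('\n');
  }

  int main() {
    unsigned NumElts = 2, NumSrcElts = 4, Scale = NumSrcElts / NumElts;
    APInt DemandedElts(NumElts, 0b10);        // demand v2i64 element 1
    APInt DemandedBits(64, 0xffffffffULL);    // only its low 32 bits

    // Source elements whose bits are really used: only the i32
    // sub-elements that overlap DemandedBits (little-endian layout),
    // computed the same way as in the patched SimplifyDemandedBits.
    APInt DemandedSrcElts = APInt::getZero(NumSrcElts);
    for (unsigned i = 0; i != Scale; ++i)
      if (!DemandedBits.extractBits(32, i * 32).isZero())
        for (unsigned j = 0; j != NumElts; ++j)
          if (DemandedElts[j])
            DemandedSrcElts.setBit(j * Scale + i);

    // Source elements that aren't used but feed a demanded destination
    // element, so they must not be turned into (more) poison.
    APInt DoNotPoisonSrcElts =
        llvm::APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);

    printMask("DemandedSrcElts   ", DemandedSrcElts);    // 0100
    printMask("DoNotPoisonSrcElts", DoNotPoisonSrcElts); // 1100
  }

With the old code the upper i32 half (source element 3) had to be
fully demanded; with this patch it only ends up in
DoNotPoisonSrcElts, so it can still be simplified as long as the
simplification doesn't make it more poisonous.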
---
llvm/include/llvm/CodeGen/TargetLowering.h | 9 +
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 6 +-
.../CodeGen/SelectionDAG/TargetLowering.cpp | 272 ++--
.../AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll | 36 +-
llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 957 ++++++-------
llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 500 ++++---
llvm/test/CodeGen/AMDGPU/load-constant-i8.ll | 1038 +++++++-------
llvm/test/CodeGen/AMDGPU/load-global-i8.ll | 1230 ++++++++---------
llvm/test/CodeGen/ARM/fpclamptosat_vec.ll | 444 +++---
.../CodeGen/Thumb2/mve-fpclamptosat_vec.ll | 80 +-
.../Thumb2/mve-gather-ind8-unscaled.ll | 5 +
.../CodeGen/Thumb2/mve-laneinterleaving.ll | 86 +-
llvm/test/CodeGen/Thumb2/mve-pred-ext.ll | 1 -
llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll | 213 ++-
.../Thumb2/mve-scatter-ind8-unscaled.ll | 9 +-
.../CodeGen/Thumb2/mve-vecreduce-addpred.ll | 8 +-
.../CodeGen/Thumb2/mve-vecreduce-mlapred.ll | 8 +-
.../CodeGen/X86/avx512-intrinsics-upgrade.ll | 46 +-
.../X86/avx512vl-intrinsics-upgrade.ll | 52 +-
.../CodeGen/X86/avx512vl-vec-masked-cmp.ll | 90 +-
.../CodeGen/X86/buildvec-widen-dotproduct.ll | 12 +-
llvm/test/CodeGen/X86/combine-pmuldq.ll | 67 +-
.../X86/fold-int-pow2-with-fmul-or-fdiv.ll | 24 +-
llvm/test/CodeGen/X86/gfni-funnel-shifts.ll | 300 ++--
...ist-and-by-const-from-shl-in-eqcmp-zero.ll | 48 +-
llvm/test/CodeGen/X86/known-never-zero.ll | 3 +-
llvm/test/CodeGen/X86/known-signbits-shl.ll | 3 +-
.../test/CodeGen/X86/known-signbits-vector.ll | 30 +-
...of-two-or-zero-when-comparing-with-zero.ll | 4 +-
llvm/test/CodeGen/X86/pmulh.ll | 24 +-
llvm/test/CodeGen/X86/pr42727.ll | 2 +-
.../test/CodeGen/X86/rotate-extract-vector.ll | 2 -
llvm/test/CodeGen/X86/shrink_vmul.ll | 32 +-
.../CodeGen/X86/srem-seteq-vec-nonsplat.ll | 58 +-
llvm/test/CodeGen/X86/sshl_sat_vec.ll | 76 +-
.../CodeGen/X86/urem-seteq-illegal-types.ll | 10 +-
.../CodeGen/X86/urem-seteq-vec-nonsplat.ll | 30 +-
llvm/test/CodeGen/X86/ushl_sat_vec.ll | 29 +-
llvm/test/CodeGen/X86/vec_smulo.ll | 70 +-
llvm/test/CodeGen/X86/vec_umulo.ll | 130 +-
llvm/test/CodeGen/X86/vector-fshl-128.ll | 256 ++--
llvm/test/CodeGen/X86/vector-fshl-256.ll | 95 +-
llvm/test/CodeGen/X86/vector-fshl-rot-128.ll | 52 +-
llvm/test/CodeGen/X86/vector-fshl-rot-256.ll | 29 +-
llvm/test/CodeGen/X86/vector-fshr-128.ll | 58 +-
llvm/test/CodeGen/X86/vector-fshr-256.ll | 43 +-
llvm/test/CodeGen/X86/vector-fshr-rot-128.ll | 55 +-
llvm/test/CodeGen/X86/vector-fshr-rot-256.ll | 4 +-
llvm/test/CodeGen/X86/vector-mul.ll | 72 +-
llvm/test/CodeGen/X86/vector-rotate-128.ll | 52 +-
llvm/test/CodeGen/X86/vector-rotate-256.ll | 29 +-
llvm/test/CodeGen/X86/vector-shift-shl-128.ll | 52 +-
llvm/test/CodeGen/X86/vector-shift-shl-256.ll | 42 +-
.../CodeGen/X86/vector-shift-shl-sub128.ll | 104 +-
...vector_splat-const-shift-of-constmasked.ll | 9 +-
55 files changed, 3316 insertions(+), 3680 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 98565f423df3e..c8ffdb3592e6b 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -4263,6 +4263,15 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase {
/// results of this function, because simply replacing TLO.Old
/// with TLO.New will be incorrect when this parameter is true and TLO.Old
/// has multiple uses.
+ /// Vector elements that aren't demanded can be turned into poison unless the
+ /// corresponding bit in \p DoNotPoisonEltMask is set.
+ bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask,
+ const APInt &DoNotPoisonEltMask,
+ APInt &KnownUndef, APInt &KnownZero,
+ TargetLoweringOpt &TLO, unsigned Depth = 0,
+ bool AssumeSingleUse = false) const;
+ /// Version of SimplifyDemandedVectorElts without the DoNotPoisonEltMask
+ /// argument. All undemanded elements can be turned into poison.
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask,
APInt &KnownUndef, APInt &KnownZero,
TargetLoweringOpt &TLO, unsigned Depth = 0,
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index f144f17d5a8f2..6167372127981 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -1466,8 +1466,10 @@ bool DAGCombiner::SimplifyDemandedVectorElts(SDValue Op,
bool AssumeSingleUse) {
TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
APInt KnownUndef, KnownZero;
- if (!TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero,
- TLO, 0, AssumeSingleUse))
+ APInt DoNotPoisonElts = APInt::getZero(DemandedElts.getBitWidth());
+ if (!TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DoNotPoisonElts,
+ KnownUndef, KnownZero, TLO, 0,
+ AssumeSingleUse))
return false;
// Revisit the node.
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 20ef11fc0475b..6369bf023e1c0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -2773,23 +2773,35 @@ bool TargetLowering::SimplifyDemandedBits(
unsigned Scale = BitWidth / NumSrcEltBits;
unsigned NumSrcElts = SrcVT.getVectorNumElements();
APInt DemandedSrcBits = APInt::getZero(NumSrcEltBits);
+ APInt DemandedSrcElts = APInt::getZero(NumSrcElts);
for (unsigned i = 0; i != Scale; ++i) {
unsigned EltOffset = IsLE ? i : (Scale - 1 - i);
unsigned BitOffset = EltOffset * NumSrcEltBits;
- DemandedSrcBits |= DemandedBits.extractBits(NumSrcEltBits, BitOffset);
+ APInt Sub = DemandedBits.extractBits(NumSrcEltBits, BitOffset);
+ if (!Sub.isZero()) {
+ DemandedSrcBits |= Sub;
+ for (unsigned j = 0; j != NumElts; ++j)
+ if (DemandedElts[j])
+ DemandedSrcElts.setBit((j * Scale) + i);
+ }
}
- // Recursive calls below may turn not demanded elements into poison, so we
- // need to demand all smaller source elements that maps to a demanded
- // destination element.
- APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
+ // Need to "semi demand" all smaller source elements that maps to a
+ // demanded destination element, since recursive calls below may turn not
+ // demanded elements into poison. Instead of demanding such elements we
+ // use a special bitmask to indicate that the recursive calls must not
+ // turn such elements into poison.
+ APInt DoNotPoisonSrcElts = APIntOps::ScaleBitMask(DemandedElts,
+ NumSrcElts);
APInt KnownSrcUndef, KnownSrcZero;
- if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, KnownSrcUndef,
- KnownSrcZero, TLO, Depth + 1))
+ if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, DoNotPoisonSrcElts,
+ KnownSrcUndef, KnownSrcZero, TLO,
+ Depth + 1))
return true;
KnownBits KnownSrcBits;
- if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedSrcElts,
+ if (SimplifyDemandedBits(Src, DemandedSrcBits,
+ DemandedSrcElts | DoNotPoisonSrcElts,
KnownSrcBits, TLO, Depth + 1))
return true;
} else if (IsLE && (NumSrcEltBits % BitWidth) == 0) {
@@ -2805,10 +2817,12 @@ bool TargetLowering::SimplifyDemandedBits(
DemandedSrcElts.setBit(i / Scale);
}
+ APInt DoNotPoisonEltMask = APInt::getZero(NumSrcElts);
+
if (SrcVT.isVector()) {
APInt KnownSrcUndef, KnownSrcZero;
- if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, KnownSrcUndef,
- KnownSrcZero, TLO, Depth + 1))
+ if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, DoNotPoisonEltMask,
+ KnownSrcUndef, KnownSrcZero, TLO, Depth + 1))
return true;
}
@@ -3103,8 +3117,9 @@ bool TargetLowering::SimplifyDemandedVectorElts(SDValue Op,
!DCI.isBeforeLegalizeOps());
APInt KnownUndef, KnownZero;
- bool Simplified =
- SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero, TLO);
+ APInt DoNotPoisonEltMask = APInt::getZero(DemandedElts.getBitWidth());
+ bool Simplified = SimplifyDemandedVectorElts(
+ Op, DemandedElts, DoNotPoisonEltMask, KnownUndef, KnownZero, TLO);
if (Simplified) {
DCI.AddToWorklist(Op.getNode());
DCI.CommitTargetLoweringOpt(TLO);
@@ -3165,6 +3180,16 @@ bool TargetLowering::SimplifyDemandedVectorElts(
SDValue Op, const APInt &OriginalDemandedElts, APInt &KnownUndef,
APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth,
bool AssumeSingleUse) const {
+ APInt DoNotPoisonEltMask = APInt::getZero(OriginalDemandedElts.getBitWidth());
+ return SimplifyDemandedVectorElts(Op, OriginalDemandedElts,
+ DoNotPoisonEltMask, KnownUndef, KnownZero,
+ TLO, Depth, AssumeSingleUse);
+}
+
+bool TargetLowering::SimplifyDemandedVectorElts(
+ SDValue Op, const APInt &OriginalDemandedElts,
+ const APInt &DoNotPoisonEltMask, APInt &KnownUndef, APInt &KnownZero,
+ TargetLoweringOpt &TLO, unsigned Depth, bool AssumeSingleUse) const {
EVT VT = Op.getValueType();
unsigned Opcode = Op.getOpcode();
APInt DemandedElts = OriginalDemandedElts;
@@ -3203,6 +3228,7 @@ bool TargetLowering::SimplifyDemandedVectorElts(
if (Depth >= SelectionDAG::MaxRecursionDepth)
return false;
+ APInt DemandedEltsInclDoNotPoison = DemandedElts | DoNotPoisonEltMask;
SDLoc DL(Op);
unsigned EltSizeInBits = VT.getScalarSizeInBits();
bool IsLE = TLO.DAG.getDataLayout().isLittleEndian();
@@ -3210,10 +3236,10 @@ bool TargetLowering::SimplifyDemandedVectorElts(
// Helper for demanding the specified elements and all the bits of both binary
// operands.
auto SimplifyDemandedVectorEltsBinOp = [&](SDValue Op0, SDValue Op1) {
- SDValue NewOp0 = SimplifyMultipleUseDemandedVectorElts(Op0, DemandedElts,
- TLO.DAG, Depth + 1);
- SDValue NewOp1 = SimplifyMultipleUseDemandedVectorElts(Op1, DemandedElts,
- TLO.DAG, Depth + 1);
+ SDValue NewOp0 = SimplifyMultipleUseDemandedVectorElts(
+ Op0, DemandedEltsInclDoNotPoison, TLO.DAG, Depth + 1);
+ SDValue NewOp1 = SimplifyMultipleUseDemandedVectorElts(
+ Op1, DemandedEltsInclDoNotPoison, TLO.DAG, Depth + 1);
if (NewOp0 || NewOp1) {
SDValue NewOp =
TLO.DAG.getNode(Opcode, SDLoc(Op), VT, NewOp0 ? NewOp0 : Op0,
@@ -3257,17 +3283,20 @@ bool TargetLowering::SimplifyDemandedVectorElts(
// Fast handling of 'identity' bitcasts.
unsigned NumSrcElts = SrcVT.getVectorNumElements();
if (NumSrcElts == NumElts)
- return SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef,
- KnownZero, TLO, Depth + 1);
+ return SimplifyDemandedVectorElts(Src, DemandedElts, DoNotPoisonEltMask,
+ KnownUndef, KnownZero, TLO, Depth + 1);
- APInt SrcDemandedElts, SrcZero, SrcUndef;
+ APInt SrcDemandedElts, SrcDoNotPoisonEltMask, SrcZero, SrcUndef;
// Bitcast from 'large element' src vector to 'small element' vector, we
// must demand a source element if any DemandedElt maps to it.
if ((NumElts % NumSrcElts) == 0) {
unsigned Scale = NumElts / NumSrcElts;
SrcDemandedElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
- if (SimplifyDemandedVectorElts(Src, SrcDemandedElts, SrcUndef, SrcZero,
+ SrcDoNotPoisonEltMask =
+ APIntOps::ScaleBitMask(DoNotPoisonEltMask, NumSrcElts);
+ if (SimplifyDemandedVectorElts(Src, SrcDemandedElts,
+ SrcDoNotPoisonEltMask, SrcUndef, SrcZero,
TLO, Depth + 1))
return true;
@@ -3284,7 +3313,8 @@ bool TargetLowering::SimplifyDemandedVectorElts(
}
KnownBits Known;
- if (SimplifyDemandedBits(Src, SrcDemandedBits, SrcDemandedElts, Known,
+ if (SimplifyDemandedBits(Src, SrcDemandedBits,
+ SrcDemandedElts | SrcDoNotPoisonEltMask, Known,
TLO, Depth + 1))
return true;
@@ -3322,7 +3352,10 @@ bool TargetLowering::SimplifyDemandedVectorElts(
if ((NumSrcElts % NumElts) == 0) {
unsigned Scale = NumSrcElts / NumElts;
SrcDemandedElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
- if (SimplifyDemandedVectorElts(Src, SrcDemandedElts, SrcUndef, SrcZero,
+ SrcDoNotPoisonEltMask =
+ APIntOps::ScaleBitMask(DoNotPoisonEltMask, NumSrcElts);
+ if (SimplifyDemandedVectorElts(Src, SrcDemandedElts,
+ SrcDoNotPoisonEltMask, SrcUndef, SrcZero,
TLO, Depth + 1))
return true;
@@ -3341,7 +3374,8 @@ bool TargetLowering::SimplifyDemandedVectorElts(
}
case ISD::FREEZE: {
SDValue N0 = Op.getOperand(0);
- if (TLO.DAG.isGuaranteedNotToBeUndefOrPoison(N0, DemandedElts,
+ if (TLO.DAG.isGuaranteedNotToBeUndefOrPoison(N0,
+ DemandedEltsInclDoNotPoison,
/*PoisonOnly=*/false,
Depth + 1))
return TLO.CombineTo(Op, N0);
@@ -3391,9 +3425,11 @@ bool TargetLowering::SimplifyDemandedVectorElts(
for (unsigned i = 0; i != NumSubVecs; ++i) {
SDValue SubOp = Op.getOperand(i);
APInt SubElts = DemandedElts.extractBits(NumSubElts, i * NumSubElts);
+ APInt DoNotPoisonSubElts =
+ DoNotPoisonEltMask.extractBits(NumSubElts, i * NumSubElts);
APInt SubUndef, SubZero;
- if (SimplifyDemandedVectorElts(SubOp, SubElts, SubUndef, SubZero, TLO,
- Depth + 1))
+ if (SimplifyDemandedVectorElts(SubOp, SubElts, DoNotPoisonSubElts,
+ SubUndef, SubZero, TLO, Depth + 1))
return true;
KnownUndef.insertBits(SubUndef, i * NumSubElts);
KnownZero.insertBits(SubZero, i * NumSubElts);
@@ -3406,10 +3442,19 @@ bool TargetLowering::SimplifyDemandedVectorElts(
for (unsigned i = 0; i != NumSubVecs; ++i) {
SDValue SubOp = Op.getOperand(i);
APInt SubElts = DemandedElts.extractBits(NumSubElts, i * NumSubElts);
- SDValue NewSubOp = SimplifyMultipleUseDemandedVectorElts(
- SubOp, SubElts, TLO.DAG, Depth + 1);
- DemandedSubOps.push_back(NewSubOp ? NewSubOp : SubOp);
- FoundNewSub = NewSubOp ? true : FoundNewSub;
+ if (SubElts != 0) {
+ APInt SubEltsInclDoNotPoison =
+ DemandedEltsInclDoNotPoison.extractBits(NumSubElts,
+ i * NumSubElts);
+ SDValue NewSubOp = SimplifyMultipleUseDemandedVectorElts(
+ SubOp, SubEltsInclDoNotPoison, TLO.DAG, Depth + 1);
+ DemandedSubOps.push_back(NewSubOp ? NewSubOp : SubOp);
+ FoundNewSub = NewSubOp ? true : FoundNewSub;
+ } else if (!SubOp.isUndef()) {
+ DemandedSubOps.push_back(TLO.DAG.getUNDEF(SubOp.getValueType()));
+ FoundNewSub = true;
+ } else
+ DemandedSubOps.push_back(SubOp);
}
if (FoundNewSub) {
SDValue NewOp =
@@ -3425,18 +3470,28 @@ bool TargetLowering::SimplifyDemandedVectorElts(
SDValue Src = Op.getOperand(0);
SDValue Sub = Op.getOperand(1);
uint64_t Idx = Op.getConstantOperandVal(2);
- unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
+ EVT SubVT = Sub.getValueType();
+ unsigned NumSubElts = SubVT.getVectorNumElements();
APInt DemandedSubElts = DemandedElts.extractBits(NumSubElts, Idx);
+ APInt DoNoPoisonSubElts = DoNotPoisonEltMask.extractBits(NumSubElts, Idx);
APInt DemandedSrcElts = DemandedElts;
DemandedSrcElts.clearBits(Idx, Idx + NumSubElts);
+ APInt DoNoPoisonSrcElts = DoNotPoisonEltMask;
+ DoNoPoisonSrcElts.clearBits(Idx, Idx + NumSubElts);
- // If none of the sub operand elements are demanded, bypass the insert.
- if (!DemandedSubElts)
+ // If none of the sub operand elements are demanded and may be poisoned,
+ // bypass the insert.
+ if (!DemandedSubElts && !DoNoPoisonSubElts)
return TLO.CombineTo(Op, Src);
+ // If none of the sub operand elements are demanded, replace it with undef.
+ if (!DemandedSubElts && !Sub.isUndef())
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
+ Src, TLO.DAG.getUNDEF(SubVT),
+ Op.getOperand(2)));
APInt SubUndef, SubZero;
- if (SimplifyDemandedVectorElts(Sub, DemandedSubElts, SubUndef, SubZero, TLO,
- Depth + 1))
+ if (SimplifyDemandedVectorElts(Sub, DemandedSubElts, DoNoPoisonSubElts,
+ SubUndef, SubZero, TLO, Depth + 1))
return true;
// If none of the src operand elements are demanded, replace it with undef.
@@ -3445,8 +3500,8 @@ bool TargetLowering::SimplifyDemandedVectorElts(
TLO.DAG.getUNDEF(VT), Sub,
Op.getOperand(2)));
- if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, KnownUndef, KnownZero,
- TLO, Depth + 1))
+ if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, DoNoPoisonSrcElts,
+ KnownUndef, KnownZero, TLO, Depth + 1))
return true;
KnownUndef.insertBits(SubUndef, Idx);
KnownZero.insertBits(SubZero, Idx);
@@ -3454,9 +3509,9 @@ bool TargetLowering::SimplifyDemandedVectorElts(
// Attempt to avoid multi-use ops if we don't need anything from them.
if (!DemandedSrcElts.isAllOnes() || !DemandedSubElts.isAllOnes()) {
SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
- Src, DemandedSrcElts, TLO.DAG, Depth + 1);
+ Src, DemandedSrcElts | DoNoPoisonSrcElts, TLO.DAG, Depth + 1);
SDValue NewSub = SimplifyMultipleUseDemandedVectorElts(
- Sub, DemandedSubElts, TLO.DAG, Depth + 1);
+ Sub, DemandedSubElts | DoNoPoisonSubElts, TLO.DAG, Depth + 1);
if (NewSrc || NewSub) {
NewSrc = NewSrc ? NewSrc : Src;
NewSub = NewSub ? NewSub : Sub;
@@ -3475,10 +3530,15 @@ bool TargetLowering::SimplifyDemandedVectorElts(
uint64_t Idx = Op.getConstantOperandVal(1);
unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
APInt DemandedSrcElts = DemandedElts.zext(NumSrcElts).shl(Idx);
+ APInt DoNotPoisonDemandedSrcElts =
+ DoNotPoisonEltMask.zext(NumSrcElts).shl(Idx);
+ APInt DemandedSrcEltsInclDoNotPoison =
+ DemandedEltsInclDoNotPoison.zext(NumSrcElts).shl(Idx);
APInt SrcUndef, SrcZero;
- if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, SrcUndef, SrcZero, TLO,
- Depth + 1))
+ if (SimplifyDemandedVectorElts(Src, DemandedSrcElts,
+ DoNotPoisonDemandedSrcElts, SrcUndef,
+ SrcZero, TLO, Depth + 1))
return true;
KnownUndef = SrcUndef.extractBits(NumElts, Idx);
KnownZero = SrcZero.extractBits(NumElts, Idx);
@@ -3486,7 +3546,7 @@ bool TargetLowering::SimplifyDemandedVectorElts(
// Attempt to avoid multi-use ops if we don't need anything from them.
if (!DemandedElts.isAllOnes()) {
SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
- Src, DemandedSrcElts, TLO.DAG, Depth + 1);
+ Src, DemandedSrcEltsInclDoNotPoison, TLO.DAG, Depth + 1);
if (NewSrc) {
SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, NewSrc,
Op.getOperand(1));
@@ -3498,18 +3558,26 @@ bool TargetLowering::SimplifyDemandedVectorElts(
case ISD::INSERT_VECTOR_ELT: {
SDValue Vec = Op.getOperand(0);
SDValue Scl = Op.getOperand(1);
+ EVT SclVT = Scl.getValueType();
auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
// For a legal, constant insertion index, if we don't need this insertion
// then strip it, else remove it from the demanded elts.
if (CIdx && CIdx->getAPIntValue().ult(NumElts)) {
unsigned Idx = CIdx->getZExtValue();
- if (!DemandedElts[Idx])
+ if (!DemandedEltsInclDoNotPoison[Idx])
return TLO.CombineTo(Op, Vec);
+ if (!DemandedElts[Idx] && !Scl.isUndef())
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT,
+ Vec, TLO.DAG.getUNDEF(SclVT),
+ Op.getOperand(2)));
APInt DemandedVecElts(DemandedElts);
DemandedVecElts.clearBit(Idx);
- if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
+ APInt SrcDoNotPoisonEltMask(DoNotPoisonEltMask);
+ SrcDoNotPoisonEltMask.clearBit(Idx);
+ if (SimplifyDemandedVectorElts(Vec, DemandedVecElts,
+ SrcDoNotPoisonEltMask, KnownUndef,
KnownZero, TLO, Depth + 1))
return true;
@@ -3520,8 +3588,8 @@ bool TargetLowering::SimplifyDemandedVectorElts(
}
APInt VecUndef, VecZero;
- if (SimplifyDemandedVectorElts(Vec, DemandedElts, VecUndef, VecZero, TLO,
- Depth + 1))
+ if (SimplifyDemandedVectorElts(Vec, DemandedElts, DoNotPoisonEltMask,
+ VecUndef, VecZero, TLO, Depth + 1))
return true;
// Without knowing the insertion index we can't set KnownUndef/KnownZero.
break;
@@ -3534,20 +3602,18 @@ bool TargetLowering::SimplifyDemandedVectorElts(
// Try to transform the select condition based on the current demanded
// elements.
APInt UndefSel, ZeroSel;
- if (SimplifyDemandedVectorElts(Sel, DemandedElts, UndefSel, ZeroSel, TLO,
- Depth + 1))
+ if (SimplifyDemandedVectorElts(Sel, DemandedElts, DoNotPoisonEltMask,
+ UndefSel, ZeroSel, TLO, Depth + 1))
return true;
// See if we can simplify either vselect operand.
- APInt DemandedLHS(DemandedElts);
- APInt DemandedRHS(DemandedElts);
APInt UndefLHS, ZeroLHS;
APInt UndefRHS, ZeroRHS;
- if (SimplifyDemandedVectorElts(LHS, DemandedLHS, UndefLHS, ZeroLHS, TLO,
- Depth + 1))
+ if (SimplifyDemandedVectorElts(LHS, DemandedElts, DoNotPoisonEltMask,
+ UndefLHS, ZeroLHS, TLO, Depth + 1))
return true;
- if (SimplifyDemandedVectorElts(RHS, DemandedRHS, UndefRHS, ZeroRHS, TLO,
- Depth + 1))
+ if (SimplifyDemandedVectorElts(RHS, DemandedElts, DoNotPoisonEltMask,
+ UndefRHS, ZeroRHS, TLO, Depth + 1))
return true;
KnownUndef = UndefLHS & UndefRHS;
@@ -3557,8 +3623,8 @@ bool TargetLowering::SimplifyDemandedVectorElts(
// select value element.
APInt DemandedSel = DemandedElts & ~KnownZero;
if (DemandedSel != DemandedElts)
- if (SimplifyDemandedVectorElts(Sel, DemandedSel, UndefSel, ZeroSel, TLO,
- Depth + 1))
+ if (SimplifyDemandedVectorElts(Sel, DemandedSel, DoNotPoisonEltMask,
+ UndefSel, ZeroSel, TLO, Depth + 1))
return true;
break;
@@ -3568,19 +3634,16 @@ bool TargetLowering::SimplifyDemandedVectorElts(
SDValue RHS = Op.getOperand(1);
ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(Op)->getMask();
- // Collect demanded elements from shuffle operands..
- APInt DemandedLHS(NumElts, 0);
- APInt DemandedRHS(NumElts, 0);
- for (unsigned i = 0; i != NumElts; ++i) {
- int M = ShuffleMask[i];
- if (M < 0 || !DemandedElts[i])
- continue;
- assert(0 <= M && M < (int)(2 * NumElts) && "Shuffle index out of range");
- if (M < (int)NumElts)
- DemandedLHS.setBit(M);
- else
- DemandedRHS.setBit(M - NumElts);
- }
+ // Collect demanded elements from shuffle operands.
+ APInt DemandedLHS, DemandedRHS;
+ APInt DoNotPoisonLHS, DoNotPoisonRHS;
+ if (!getShuffleDemandedElts(NumElts, ShuffleMask, DemandedElts, DemandedLHS,
+ DemandedRHS,
+ /*AllowUndefElts=*/true) ||
+ !getShuffleDemandedElts(NumElts, ShuffleMask, DoNotPoisonEltMask,
+ DoNotPoisonLHS, DoNotPoisonRHS,
+ /*AllowUndefElts=*/true))
+ break;
// If either side isn't demanded, replace it by UNDEF. We handle this
// explicitly here to also simplify in case of multiple uses (on the
@@ -3598,11 +3661,11 @@ bool TargetLowering::SimplifyDemandedVectorElts(
// See if we can simplify either shuffle operand.
APInt UndefLHS, ZeroLHS;
APInt UndefRHS, ZeroRHS;
- if (SimplifyDemandedVectorElts(LHS, DemandedLHS, UndefLHS, ZeroLHS, TLO,
- Depth + 1))
+ if (SimplifyDemandedVectorElts(LHS, DemandedLHS, DoNotPoisonLHS, UndefLHS,
+ ZeroLHS, TLO, Depth + 1))
return true;
- if (SimplifyDemandedVectorElts(RHS, DemandedRHS, UndefRHS, ZeroRHS, TLO,
- Depth + 1))
+ if (SimplifyDemandedVectorElts(RHS, DemandedRHS, DoNotPoisonRHS, UndefRHS,
+ ZeroRHS, TLO, Depth + 1))
return true;
// Simplify mask using undef elements from LHS/RHS.
@@ -3656,9 +3719,10 @@ bool TargetLowering::SimplifyDemandedVectorElts(
APInt SrcUndef, SrcZero;
SDValue Src = Op.getOperand(0);
unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
- APInt DemandedSrcElts = DemandedElts.zext(NumSrcElts);
- if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, SrcUndef, SrcZero, TLO,
- Depth + 1))
+ APInt DemandedSrcElts = DemandedEltsInclDoNotPoison.zext(NumSrcElts);
+ APInt SrcDoNotPoisonEltMask = APInt::getZero(NumSrcElts);
+ if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, SrcDoNotPoisonEltMask,
+ SrcUndef, SrcZero, TLO, Depth + 1))
return true;
KnownZero = SrcZero.zextOrTrunc(NumElts);
KnownUndef = SrcUndef.zextOrTrunc(NumElts);
@@ -3705,8 +3769,9 @@ bool TargetLowering::SimplifyDemandedVectorElts(
SDValue Op1 = Op.getOperand(1);
if (Op0 == Op1 && Op->isOnlyUserOf(Op0.getNode())) {
APInt UndefLHS, ZeroLHS;
- if (SimplifyDemandedVectorElts(Op0, DemandedElts, UndefLHS, ZeroLHS, TLO,
- Depth + 1, /*AssumeSingleUse*/ true))
+ if (SimplifyDemandedVectorElts(Op0, DemandedElts, DoNotPoisonEltMask,
+ UndefLHS, ZeroLHS, TLO, Depth + 1,
+ /*AssumeSingleUse*/ true))
return true;
}
[[fallthrough]];
@@ -3727,12 +3792,12 @@ bool TargetLowering::SimplifyDemandedVectorElts(
SDValue Op1 = Op.getOperand(1);
APInt UndefRHS, ZeroRHS;
- if (SimplifyDemandedVectorElts(Op1, DemandedElts, UndefRHS, ZeroRHS, TLO,
- Depth + 1))
+ if (SimplifyDemandedVectorElts(Op1, DemandedElts, DoNotPoisonEltMask,
+ UndefRHS, ZeroRHS, TLO, Depth + 1))
return true;
APInt UndefLHS, ZeroLHS;
- if (SimplifyDemandedVectorElts(Op0, DemandedElts, UndefLHS, ZeroLHS, TLO,
- Depth + 1))
+ if (SimplifyDemandedVectorElts(Op0, DemandedElts, DoNotPoisonEltMask,
+ UndefLHS, ZeroLHS, TLO, Depth + 1))
return true;
KnownZero = ZeroLHS & ZeroRHS;
@@ -3740,7 +3805,7 @@ bool TargetLowering::SimplifyDemandedVectorElts(
// Attempt to avoid multi-use ops if we don't need anything from them.
// TODO - use KnownUndef to relax the demandedelts?
- if (!DemandedElts.isAllOnes())
+ if (!DemandedEltsInclDoNotPoison.isAllOnes())
if (SimplifyDemandedVectorEltsBinOp(Op0, Op1))
return true;
break;
@@ -3754,12 +3819,12 @@ bool TargetLowering::SimplifyDemandedVectorElts(
SDValue Op1 = Op.getOperand(1);
APInt UndefRHS, ZeroRHS;
- if (SimplifyDemandedVectorElts(Op1, DemandedElts, UndefRHS, ZeroRHS, TLO,
- Depth + 1))
+ if (SimplifyDemandedVectorElts(Op1, DemandedElts, DoNotPoisonEltMask,
+ UndefRHS, ZeroRHS, TLO, Depth + 1))
return true;
APInt UndefLHS, ZeroLHS;
- if (SimplifyDemandedVectorElts(Op0, DemandedElts, UndefLHS, ZeroLHS, TLO,
- Depth + 1))
+ if (SimplifyDemandedVectorElts(Op0, DemandedElts, DoNotPoisonEltMask,
+ UndefLHS, ZeroLHS, TLO, Depth + 1))
return true;
KnownZero = ZeroLHS;
@@ -3767,7 +3832,7 @@ bool TargetLowering::SimplifyDemandedVectorElts(
// Attempt to avoid multi-use ops if we don't need anything from them.
// TODO - use KnownUndef to relax the demandedelts?
- if (!DemandedElts.isAllOnes())
+ if (!DemandedEltsInclDoNotPoison.isAllOnes())
if (SimplifyDemandedVectorEltsBinOp(Op0, Op1))
return true;
break;
@@ -3780,14 +3845,14 @@ bool TargetLowering::SimplifyDemandedVectorElts(
SDValue Op1 = Op.getOperand(1);
APInt SrcUndef, SrcZero;
- if (SimplifyDemandedVectorElts(Op1, DemandedElts, SrcUndef, SrcZero, TLO,
- Depth + 1))
+ if (SimplifyDemandedVectorElts(Op1, DemandedElts, DoNotPoisonEltMask,
+ SrcUndef, SrcZero, TLO, Depth + 1))
return true;
// If we know that a demanded element was zero in Op1 we don't need to
// demand it in Op0 - its guaranteed to be zero.
APInt DemandedElts0 = DemandedElts & ~SrcZero;
- if (SimplifyDemandedVectorElts(Op0, DemandedElts0, KnownUndef, KnownZero,
- TLO, Depth + 1))
+ if (SimplifyDemandedVectorElts(Op0, DemandedElts0, DoNotPoisonEltMask,
+ KnownUndef, KnownZero, TLO, Depth + 1))
return true;
KnownUndef &= DemandedElts0;
@@ -3808,7 +3873,7 @@ bool TargetLowering::SimplifyDemandedVectorElts(
KnownUndef &= ~KnownZero;
// Attempt to avoid multi-use ops if we don't need anything from them.
- if (!DemandedElts.isAllOnes())
+ if (!DemandedEltsInclDoNotPoison.isAllOnes())
if (SimplifyDemandedVectorEltsBinOp(Op0, Op1))
return true;
break;
@@ -3816,13 +3881,15 @@ bool TargetLowering::SimplifyDemandedVectorElts(
case ISD::TRUNCATE:
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
- if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, KnownUndef,
- KnownZero, TLO, Depth + 1))
+ if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts,
+ DoNotPoisonEltMask, KnownUndef, KnownZero,
+ TLO, Depth + 1))
return true;
- if (!DemandedElts.isAllOnes())
+ if (!DemandedEltsInclDoNotPoison.isAllOnes())
if (SDValue NewOp = SimplifyMultipleUseDemandedVectorElts(
- Op.getOperand(0), DemandedElts, TLO.DAG, Depth + 1))
+ Op.getOperand(0), DemandedEltsInclDoNotPoison, TLO.DAG,
+ Depth + 1))
return TLO.CombineTo(Op, TLO.DAG.getNode(Opcode, SDLoc(Op), VT, NewOp));
if (Op.getOpcode() == ISD::ZERO_EXTEND) {
@@ -3836,20 +3903,23 @@ bool TargetLowering::SimplifyDemandedVectorElts(
case ISD::UINT_TO_FP:
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
- if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, KnownUndef,
- KnownZero, TLO, Depth + 1))
+ if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts,
+ DoNotPoisonEltMask, KnownUndef, KnownZero,
+ TLO, Depth + 1))
return true;
// Don't fall through to generic undef -> undef handling.
return false;
default: {
if (Op.getOpcode() >= ISD::BUILTIN_OP_END) {
- if (SimplifyDemandedVectorEltsForTargetNode(Op, DemandedElts, KnownUndef,
- KnownZero, TLO, Depth))
+ if (SimplifyDemandedVectorEltsForTargetNode(
+ Op, DemandedElts | DoNotPoisonEltMask, KnownUndef, KnownZero, TLO,
+ Depth))
return true;
} else {
KnownBits Known;
APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
- if (SimplifyDemandedBits(Op, DemandedBits, OriginalDemandedElts, Known,
+ if (SimplifyDemandedBits(Op, DemandedBits,
+ OriginalDemandedElts | DoNotPoisonEltMask, Known,
TLO, Depth, AssumeSingleUse))
return true;
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll
index a74dc622f6a89..fa6d878ad7556 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll
@@ -188,29 +188,33 @@ define amdgpu_ps ptr addrspace(8) @variable_top_half(ptr inreg %p, i64 inreg %nu
; CHECK45-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 25
; CHECK45-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 killed [[COPY5]], killed [[S_MOV_B32_]], implicit-def dead $scc
; CHECK45-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; CHECK45-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_1]], %subreg.sub0, killed [[S_LSHL_B32_]], %subreg.sub1
+ ; CHECK45-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_LSHL_B32_]], %subreg.sub1
; CHECK45-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[REG_SEQUENCE]], killed [[REG_SEQUENCE2]], implicit-def dead $scc
; CHECK45-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub1
; CHECK45-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 28
; CHECK45-NEXT: [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY]], killed [[S_MOV_B32_2]], implicit-def dead $scc
- ; CHECK45-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_1]], %subreg.sub0, killed [[S_LSHL_B32_1]], %subreg.sub1
+ ; CHECK45-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK45-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK45-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[DEF]], %subreg.sub0, killed [[S_LSHL_B32_1]], %subreg.sub1
; CHECK45-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 7
; CHECK45-NEXT: [[S_LSHR_B64_:%[0-9]+]]:sreg_64 = S_LSHR_B64 [[REG_SEQUENCE1]], killed [[S_MOV_B32_3]], implicit-def dead $scc
; CHECK45-NEXT: [[S_OR_B64_1:%[0-9]+]]:sreg_64 = S_OR_B64 [[S_LSHR_B64_]], killed [[REG_SEQUENCE3]], implicit-def dead $scc
- ; CHECK45-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 70368744177664
- ; CHECK45-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY killed [[S_MOV_B]]
- ; CHECK45-NEXT: [[S_OR_B64_2:%[0-9]+]]:sreg_64 = S_OR_B64 killed [[S_OR_B64_1]], killed [[COPY7]], implicit-def dead $scc
- ; CHECK45-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_2]].sub1
- ; CHECK45-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
- ; CHECK45-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY9]], implicit $exec
- ; CHECK45-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
- ; CHECK45-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec
- ; CHECK45-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
- ; CHECK45-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
- ; CHECK45-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec
- ; CHECK45-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[S_LSHR_B64_]].sub0
- ; CHECK45-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY13]]
- ; CHECK45-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec
+ ; CHECK45-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 16384
+ ; CHECK45-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK45-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK45-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[DEF2]], %subreg.sub0, killed [[S_MOV_B32_4]], %subreg.sub1
+ ; CHECK45-NEXT: [[S_OR_B64_2:%[0-9]+]]:sreg_64 = S_OR_B64 killed [[S_OR_B64_1]], killed [[REG_SEQUENCE4]], implicit-def dead $scc
+ ; CHECK45-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_2]].sub1
+ ; CHECK45-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
+ ; CHECK45-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY8]], implicit $exec
+ ; CHECK45-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; CHECK45-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY9]], implicit $exec
+ ; CHECK45-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+ ; CHECK45-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
+ ; CHECK45-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY11]], implicit $exec
+ ; CHECK45-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[S_LSHR_B64_]].sub0
+ ; CHECK45-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY12]]
+ ; CHECK45-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
; CHECK45-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]]
; CHECK45-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; CHECK45-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_3]]
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index 235ad10b282bd..83c240c17ff1c 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -9668,15 +9668,13 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX1250-NEXT: s_lshr_b32 s4, s3, 31
; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, v1
; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x1001e
-; GFX1250-NEXT: s_bfe_u32 s6, s2, 0x10006
-; GFX1250-NEXT: v_mov_b32_e32 v7, v1
-; GFX1250-NEXT: s_bfe_u32 s7, s2, 0x10004
+; GFX1250-NEXT: s_bfe_u32 s6, s2, 0x10004
+; GFX1250-NEXT: s_and_b32 s7, s2, 1
; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:416
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x1001d
; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x1001c
-; GFX1250-NEXT: s_and_b32 s8, s2, 1
; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:496
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
@@ -9801,19 +9799,20 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10007
-; GFX1250-NEXT: s_mov_b32 s5, 0
+; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10006
+; GFX1250-NEXT: v_mov_b32_e32 v7, v1
; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:64
; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
; GFX1250-NEXT: s_mov_b32 s4, s3
-; GFX1250-NEXT: s_bfe_u32 s6, s3, 0x10001
+; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10001
; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10005
-; GFX1250-NEXT: v_mov_b32_e32 v6, s6
+; GFX1250-NEXT: v_mov_b32_e32 v6, s5
; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48
; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v2, s3
; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10003
-; GFX1250-NEXT: s_bfe_u32 s7, s2, 0x10001
+; GFX1250-NEXT: s_bfe_u32 s6, s2, 0x10001
; GFX1250-NEXT: s_bfe_u32 s2, s2, 0x10002
; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32
; GFX1250-NEXT: s_wait_xcnt 0x0
@@ -9823,7 +9822,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX1250-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v2, s7
+; GFX1250-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v2, s6
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:256
; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1]
@@ -9841,363 +9840,329 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
-; GFX6-NEXT: s_mov_b32 s7, 0
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_ashr_i32 s6, s5, 31
-; GFX6-NEXT: s_bfe_i64 s[66:67], s[4:5], 0x10000
; GFX6-NEXT: s_mov_b32 s2, -1
-; GFX6-NEXT: s_mov_b32 s69, s7
-; GFX6-NEXT: s_mov_b32 s27, s7
-; GFX6-NEXT: s_mov_b32 s37, s7
-; GFX6-NEXT: s_mov_b32 s31, s7
-; GFX6-NEXT: s_mov_b32 s29, s7
-; GFX6-NEXT: s_mov_b32 s23, s7
-; GFX6-NEXT: s_mov_b32 s35, s7
-; GFX6-NEXT: s_mov_b32 s25, s7
-; GFX6-NEXT: s_mov_b32 s19, s7
-; GFX6-NEXT: s_mov_b32 s21, s7
-; GFX6-NEXT: s_mov_b32 s9, s7
-; GFX6-NEXT: s_mov_b32 s11, s7
-; GFX6-NEXT: s_mov_b32 s13, s7
-; GFX6-NEXT: s_mov_b32 s15, s7
-; GFX6-NEXT: s_mov_b32 s17, s7
-; GFX6-NEXT: s_mov_b32 s39, s7
-; GFX6-NEXT: s_mov_b32 s41, s7
-; GFX6-NEXT: s_mov_b32 s43, s7
-; GFX6-NEXT: s_mov_b32 s45, s7
-; GFX6-NEXT: s_mov_b32 s47, s7
-; GFX6-NEXT: s_mov_b32 s49, s7
-; GFX6-NEXT: s_mov_b32 s51, s7
-; GFX6-NEXT: s_mov_b32 s53, s7
-; GFX6-NEXT: s_mov_b32 s55, s7
-; GFX6-NEXT: s_mov_b32 s57, s7
-; GFX6-NEXT: s_mov_b32 s59, s7
-; GFX6-NEXT: s_mov_b32 s61, s7
-; GFX6-NEXT: s_mov_b32 s63, s7
-; GFX6-NEXT: v_mov_b32_e32 v6, s6
-; GFX6-NEXT: s_mov_b32 s65, s7
-; GFX6-NEXT: v_mov_b32_e32 v2, s66
-; GFX6-NEXT: v_mov_b32_e32 v3, s67
-; GFX6-NEXT: s_mov_b32 s67, s7
-; GFX6-NEXT: s_lshr_b32 s6, s5, 30
-; GFX6-NEXT: s_mov_b32 s68, s5
-; GFX6-NEXT: s_bfe_i64 s[68:69], s[68:69], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
-; GFX6-NEXT: v_mov_b32_e32 v0, s68
-; GFX6-NEXT: v_mov_b32_e32 v1, s69
-; GFX6-NEXT: s_lshr_b32 s8, s4, 30
-; GFX6-NEXT: v_mov_b32_e32 v4, s6
-; GFX6-NEXT: v_mov_b32_e32 v5, s7
-; GFX6-NEXT: s_lshr_b32 s6, s4, 31
-; GFX6-NEXT: s_bfe_i64 s[68:69], s[8:9], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
-; GFX6-NEXT: v_mov_b32_e32 v8, s68
-; GFX6-NEXT: v_mov_b32_e32 v9, s69
-; GFX6-NEXT: s_lshr_b32 s8, s4, 28
-; GFX6-NEXT: v_mov_b32_e32 v10, s6
-; GFX6-NEXT: v_mov_b32_e32 v11, s7
-; GFX6-NEXT: s_lshr_b32 s6, s4, 29
-; GFX6-NEXT: s_bfe_i64 s[68:69], s[8:9], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
-; GFX6-NEXT: v_mov_b32_e32 v12, s68
-; GFX6-NEXT: v_mov_b32_e32 v13, s69
-; GFX6-NEXT: s_lshr_b32 s8, s4, 26
-; GFX6-NEXT: v_mov_b32_e32 v14, s6
-; GFX6-NEXT: v_mov_b32_e32 v15, s7
-; GFX6-NEXT: s_lshr_b32 s6, s4, 27
-; GFX6-NEXT: s_bfe_i64 s[68:69], s[8:9], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
-; GFX6-NEXT: v_mov_b32_e32 v16, s68
-; GFX6-NEXT: v_mov_b32_e32 v17, s69
-; GFX6-NEXT: s_lshr_b32 s8, s4, 24
-; GFX6-NEXT: v_mov_b32_e32 v18, s6
-; GFX6-NEXT: v_mov_b32_e32 v19, s7
-; GFX6-NEXT: s_lshr_b32 s6, s4, 25
-; GFX6-NEXT: s_bfe_i64 s[68:69], s[8:9], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
-; GFX6-NEXT: v_mov_b32_e32 v20, s68
-; GFX6-NEXT: v_mov_b32_e32 v21, s69
-; GFX6-NEXT: s_lshr_b32 s8, s4, 22
-; GFX6-NEXT: v_mov_b32_e32 v22, s6
-; GFX6-NEXT: v_mov_b32_e32 v23, s7
-; GFX6-NEXT: s_lshr_b32 s6, s4, 23
-; GFX6-NEXT: s_bfe_i64 s[68:69], s[8:9], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
-; GFX6-NEXT: v_mov_b32_e32 v24, s68
-; GFX6-NEXT: v_mov_b32_e32 v25, s69
-; GFX6-NEXT: s_lshr_b32 s8, s4, 20
-; GFX6-NEXT: v_mov_b32_e32 v26, s6
-; GFX6-NEXT: v_mov_b32_e32 v27, s7
-; GFX6-NEXT: s_lshr_b32 s6, s4, 21
-; GFX6-NEXT: v_mov_b32_e32 v7, v6
-; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[68:69], s[8:9], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:496
-; GFX6-NEXT: v_mov_b32_e32 v28, s68
-; GFX6-NEXT: v_mov_b32_e32 v29, s69
-; GFX6-NEXT: s_lshr_b32 s8, s4, 18
-; GFX6-NEXT: v_mov_b32_e32 v30, s6
-; GFX6-NEXT: v_mov_b32_e32 v31, s7
-; GFX6-NEXT: s_lshr_b32 s6, s4, 19
-; GFX6-NEXT: s_bfe_i64 s[68:69], s[8:9], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:240
-; GFX6-NEXT: v_mov_b32_e32 v32, s68
-; GFX6-NEXT: v_mov_b32_e32 v33, s69
-; GFX6-NEXT: s_lshr_b32 s8, s4, 16
-; GFX6-NEXT: v_mov_b32_e32 v34, s6
-; GFX6-NEXT: v_mov_b32_e32 v35, s7
-; GFX6-NEXT: s_lshr_b32 s6, s4, 17
-; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[68:69], s[8:9], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:224
-; GFX6-NEXT: v_mov_b32_e32 v36, s68
-; GFX6-NEXT: v_mov_b32_e32 v37, s69
-; GFX6-NEXT: s_lshr_b32 s8, s4, 14
-; GFX6-NEXT: v_mov_b32_e32 v38, s6
-; GFX6-NEXT: v_mov_b32_e32 v39, s7
-; GFX6-NEXT: s_lshr_b32 s6, s4, 15
-; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[68:69], s[8:9], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:208
-; GFX6-NEXT: s_waitcnt expcnt(3)
-; GFX6-NEXT: v_mov_b32_e32 v4, s68
-; GFX6-NEXT: v_mov_b32_e32 v5, s69
-; GFX6-NEXT: s_lshr_b32 s8, s4, 12
-; GFX6-NEXT: v_mov_b32_e32 v6, s6
-; GFX6-NEXT: v_mov_b32_e32 v7, s7
-; GFX6-NEXT: s_lshr_b32 s6, s4, 13
-; GFX6-NEXT: s_bfe_i64 s[68:69], s[8:9], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:192
-; GFX6-NEXT: s_waitcnt expcnt(3)
-; GFX6-NEXT: v_mov_b32_e32 v8, s68
-; GFX6-NEXT: v_mov_b32_e32 v9, s69
-; GFX6-NEXT: s_lshr_b32 s8, s4, 10
-; GFX6-NEXT: v_mov_b32_e32 v10, s6
-; GFX6-NEXT: v_mov_b32_e32 v11, s7
-; GFX6-NEXT: s_lshr_b32 s6, s4, 11
-; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[68:69], s[8:9], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:176
-; GFX6-NEXT: s_waitcnt expcnt(3)
-; GFX6-NEXT: v_mov_b32_e32 v12, s68
-; GFX6-NEXT: v_mov_b32_e32 v13, s69
-; GFX6-NEXT: s_lshr_b32 s8, s4, 8
-; GFX6-NEXT: v_mov_b32_e32 v14, s6
-; GFX6-NEXT: v_mov_b32_e32 v15, s7
-; GFX6-NEXT: s_lshr_b32 s6, s4, 9
-; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[68:69], s[8:9], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:160
-; GFX6-NEXT: s_waitcnt expcnt(3)
-; GFX6-NEXT: v_mov_b32_e32 v16, s68
-; GFX6-NEXT: v_mov_b32_e32 v17, s69
-; GFX6-NEXT: s_lshr_b32 s8, s4, 6
-; GFX6-NEXT: v_mov_b32_e32 v18, s6
-; GFX6-NEXT: v_mov_b32_e32 v19, s7
-; GFX6-NEXT: s_lshr_b32 s6, s4, 7
-; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[68:69], s[8:9], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:144
-; GFX6-NEXT: s_waitcnt expcnt(3)
-; GFX6-NEXT: v_mov_b32_e32 v20, s68
-; GFX6-NEXT: v_mov_b32_e32 v21, s69
-; GFX6-NEXT: s_lshr_b32 s8, s4, 4
-; GFX6-NEXT: v_mov_b32_e32 v22, s6
-; GFX6-NEXT: v_mov_b32_e32 v23, s7
-; GFX6-NEXT: s_lshr_b32 s6, s4, 5
-; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[68:69], s[8:9], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:128
-; GFX6-NEXT: s_waitcnt expcnt(3)
-; GFX6-NEXT: v_mov_b32_e32 v24, s68
-; GFX6-NEXT: v_mov_b32_e32 v25, s69
-; GFX6-NEXT: s_lshr_b32 s68, s4, 2
-; GFX6-NEXT: v_mov_b32_e32 v26, s6
-; GFX6-NEXT: v_mov_b32_e32 v27, s7
-; GFX6-NEXT: s_lshr_b32 s6, s4, 3
-; GFX6-NEXT: s_lshr_b32 s26, s5, 29
-; GFX6-NEXT: s_lshr_b32 s36, s5, 28
-; GFX6-NEXT: s_lshr_b32 s30, s5, 26
-; GFX6-NEXT: s_lshr_b32 s28, s5, 27
-; GFX6-NEXT: s_lshr_b32 s22, s5, 25
-; GFX6-NEXT: s_lshr_b32 s34, s5, 24
-; GFX6-NEXT: s_lshr_b32 s24, s5, 22
-; GFX6-NEXT: s_lshr_b32 s18, s5, 23
-; GFX6-NEXT: s_lshr_b32 s20, s5, 20
-; GFX6-NEXT: s_lshr_b32 s8, s5, 21
-; GFX6-NEXT: s_lshr_b32 s10, s5, 18
-; GFX6-NEXT: s_lshr_b32 s12, s5, 19
-; GFX6-NEXT: s_lshr_b32 s14, s5, 17
-; GFX6-NEXT: s_lshr_b32 s16, s5, 16
-; GFX6-NEXT: s_lshr_b32 s38, s5, 14
-; GFX6-NEXT: s_lshr_b32 s40, s5, 15
-; GFX6-NEXT: s_lshr_b32 s42, s5, 12
-; GFX6-NEXT: s_lshr_b32 s44, s5, 13
-; GFX6-NEXT: s_lshr_b32 s46, s5, 10
-; GFX6-NEXT: s_lshr_b32 s48, s5, 11
-; GFX6-NEXT: s_lshr_b32 s50, s5, 8
-; GFX6-NEXT: s_lshr_b32 s52, s5, 9
-; GFX6-NEXT: s_lshr_b32 s54, s5, 6
-; GFX6-NEXT: s_lshr_b32 s56, s5, 7
-; GFX6-NEXT: s_lshr_b32 s58, s5, 4
-; GFX6-NEXT: s_lshr_b32 s60, s5, 5
-; GFX6-NEXT: s_lshr_b32 s62, s5, 2
-; GFX6-NEXT: s_lshr_b32 s64, s5, 3
-; GFX6-NEXT: s_lshr_b32 s66, s5, 1
-; GFX6-NEXT: s_lshr_b32 s4, s4, 1
-; GFX6-NEXT: s_bfe_i64 s[70:71], s[6:7], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[68:69], s[68:69], 0x10000
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_lshr_b32 s42, s5, 30
+; GFX6-NEXT: s_lshr_b32 s36, s4, 30
+; GFX6-NEXT: s_lshr_b32 s38, s4, 31
+; GFX6-NEXT: s_lshr_b32 s30, s4, 28
+; GFX6-NEXT: s_lshr_b32 s34, s4, 29
+; GFX6-NEXT: s_lshr_b32 s26, s4, 26
+; GFX6-NEXT: s_lshr_b32 s28, s4, 27
+; GFX6-NEXT: s_lshr_b32 s22, s4, 24
+; GFX6-NEXT: s_lshr_b32 s24, s4, 25
+; GFX6-NEXT: s_lshr_b32 s18, s4, 22
+; GFX6-NEXT: s_lshr_b32 s20, s4, 23
+; GFX6-NEXT: s_lshr_b32 s14, s4, 20
+; GFX6-NEXT: s_lshr_b32 s16, s4, 21
+; GFX6-NEXT: s_lshr_b32 s10, s4, 18
+; GFX6-NEXT: s_lshr_b32 s12, s4, 19
+; GFX6-NEXT: s_lshr_b32 s6, s4, 16
+; GFX6-NEXT: s_lshr_b32 s8, s4, 17
+; GFX6-NEXT: s_ashr_i32 s7, s5, 31
+; GFX6-NEXT: s_bfe_i64 s[44:45], s[4:5], 0x10000
+; GFX6-NEXT: v_mov_b32_e32 v4, s7
+; GFX6-NEXT: s_lshr_b32 s40, s4, 14
+; GFX6-NEXT: v_mov_b32_e32 v0, s44
+; GFX6-NEXT: v_mov_b32_e32 v1, s45
+; GFX6-NEXT: s_mov_b32 s44, s5
+; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000
+; GFX6-NEXT: v_mov_b32_e32 v6, s44
+; GFX6-NEXT: v_mov_b32_e32 v7, s45
+; GFX6-NEXT: s_lshr_b32 s44, s4, 15
+; GFX6-NEXT: v_mov_b32_e32 v2, s42
+; GFX6-NEXT: v_mov_b32_e32 v3, s43
+; GFX6-NEXT: s_lshr_b32 s42, s4, 12
; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[72:73], s[4:5], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000
+; GFX6-NEXT: v_mov_b32_e32 v8, s36
+; GFX6-NEXT: v_mov_b32_e32 v9, s37
+; GFX6-NEXT: s_lshr_b32 s36, s4, 13
+; GFX6-NEXT: v_mov_b32_e32 v10, s38
+; GFX6-NEXT: v_mov_b32_e32 v11, s39
+; GFX6-NEXT: s_lshr_b32 s38, s4, 10
; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000
; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000
+; GFX6-NEXT: v_mov_b32_e32 v12, s30
+; GFX6-NEXT: v_mov_b32_e32 v13, s31
+; GFX6-NEXT: s_lshr_b32 s30, s4, 11
+; GFX6-NEXT: v_mov_b32_e32 v14, s34
+; GFX6-NEXT: v_mov_b32_e32 v15, s35
+; GFX6-NEXT: s_lshr_b32 s34, s4, 8
; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000
+; GFX6-NEXT: v_mov_b32_e32 v5, s7
+; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:496
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v2, s26
+; GFX6-NEXT: v_mov_b32_e32 v3, s27
+; GFX6-NEXT: s_lshr_b32 s26, s4, 9
+; GFX6-NEXT: v_mov_b32_e32 v4, s28
+; GFX6-NEXT: v_mov_b32_e32 v5, s29
+; GFX6-NEXT: s_lshr_b32 s28, s4, 6
; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000
; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:240
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v8, s22
+; GFX6-NEXT: v_mov_b32_e32 v9, s23
+; GFX6-NEXT: s_lshr_b32 s22, s4, 7
+; GFX6-NEXT: v_mov_b32_e32 v10, s24
+; GFX6-NEXT: v_mov_b32_e32 v11, s25
+; GFX6-NEXT: s_lshr_b32 s24, s4, 4
; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[74:75], s[18:19], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[4:5], s[66:67], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[6:7], s[64:65], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[18:19], s[62:63], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:224
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v12, s18
+; GFX6-NEXT: v_mov_b32_e32 v13, s19
+; GFX6-NEXT: s_lshr_b32 s18, s4, 5
+; GFX6-NEXT: v_mov_b32_e32 v14, s20
+; GFX6-NEXT: v_mov_b32_e32 v15, s21
+; GFX6-NEXT: s_lshr_b32 s20, s4, 2
; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000
; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:208
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v2, s14
+; GFX6-NEXT: v_mov_b32_e32 v3, s15
+; GFX6-NEXT: s_lshr_b32 s14, s4, 3
+; GFX6-NEXT: v_mov_b32_e32 v4, s16
+; GFX6-NEXT: v_mov_b32_e32 v5, s17
+; GFX6-NEXT: s_lshr_b32 s16, s4, 1
; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000
; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:192
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v8, s10
+; GFX6-NEXT: v_mov_b32_e32 v9, s11
+; GFX6-NEXT: s_lshr_b32 s10, s5, 29
+; GFX6-NEXT: v_mov_b32_e32 v10, s12
+; GFX6-NEXT: v_mov_b32_e32 v11, s13
+; GFX6-NEXT: s_lshr_b32 s12, s5, 28
; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:112
-; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:96
-; GFX6-NEXT: s_waitcnt expcnt(1)
-; GFX6-NEXT: v_mov_b32_e32 v6, s68
-; GFX6-NEXT: v_mov_b32_e32 v7, s69
+; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:176
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v8, s70
-; GFX6-NEXT: v_mov_b32_e32 v9, s71
+; GFX6-NEXT: v_mov_b32_e32 v12, s6
+; GFX6-NEXT: v_mov_b32_e32 v13, s7
+; GFX6-NEXT: s_lshr_b32 s6, s5, 26
+; GFX6-NEXT: v_mov_b32_e32 v14, s8
+; GFX6-NEXT: v_mov_b32_e32 v15, s9
+; GFX6-NEXT: s_lshr_b32 s8, s5, 27
+; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:160
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v2, s40
+; GFX6-NEXT: v_mov_b32_e32 v3, s41
+; GFX6-NEXT: s_lshr_b32 s40, s5, 25
+; GFX6-NEXT: v_mov_b32_e32 v4, s44
+; GFX6-NEXT: v_mov_b32_e32 v5, s45
+; GFX6-NEXT: s_lshr_b32 s44, s5, 24
+; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:144
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v8, s42
+; GFX6-NEXT: v_mov_b32_e32 v9, s43
+; GFX6-NEXT: s_lshr_b32 s42, s5, 22
; GFX6-NEXT: v_mov_b32_e32 v10, s36
-; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80
-; GFX6-NEXT: v_mov_b32_e32 v4, s72
-; GFX6-NEXT: v_mov_b32_e32 v5, s73
; GFX6-NEXT: v_mov_b32_e32 v11, s37
-; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:64
-; GFX6-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48
-; GFX6-NEXT: s_waitcnt expcnt(2)
+; GFX6-NEXT: s_lshr_b32 s36, s5, 23
+; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v12, s38
+; GFX6-NEXT: v_mov_b32_e32 v13, s39
+; GFX6-NEXT: s_lshr_b32 s38, s5, 20
; GFX6-NEXT: v_mov_b32_e32 v14, s30
-; GFX6-NEXT: v_mov_b32_e32 v12, s26
-; GFX6-NEXT: v_mov_b32_e32 v13, s27
; GFX6-NEXT: v_mov_b32_e32 v15, s31
-; GFX6-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:32
-; GFX6-NEXT: s_waitcnt expcnt(2)
-; GFX6-NEXT: v_mov_b32_e32 v18, s34
-; GFX6-NEXT: v_mov_b32_e32 v16, s28
-; GFX6-NEXT: v_mov_b32_e32 v17, s29
-; GFX6-NEXT: v_mov_b32_e32 v19, s35
-; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16
+; GFX6-NEXT: s_lshr_b32 s4, s5, 21
+; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[30:31], s[34:35], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:112
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, s24
-; GFX6-NEXT: v_mov_b32_e32 v20, s22
-; GFX6-NEXT: v_mov_b32_e32 v21, s23
-; GFX6-NEXT: v_mov_b32_e32 v7, s25
-; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0
+; GFX6-NEXT: v_mov_b32_e32 v2, s30
+; GFX6-NEXT: v_mov_b32_e32 v3, s31
+; GFX6-NEXT: s_lshr_b32 s30, s5, 18
+; GFX6-NEXT: v_mov_b32_e32 v4, s26
+; GFX6-NEXT: v_mov_b32_e32 v5, s27
+; GFX6-NEXT: s_lshr_b32 s26, s5, 19
+; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:96
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: v_mov_b32_e32 v8, s74
-; GFX6-NEXT: v_mov_b32_e32 v9, s75
-; GFX6-NEXT: v_mov_b32_e32 v3, s21
-; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:480
-; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:464
-; GFX6-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:448
-; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:432
-; GFX6-NEXT: v_mov_b32_e32 v4, s8
-; GFX6-NEXT: v_mov_b32_e32 v5, s9
-; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:416
+; GFX6-NEXT: v_mov_b32_e32 v8, s28
+; GFX6-NEXT: v_mov_b32_e32 v9, s29
+; GFX6-NEXT: s_lshr_b32 s28, s5, 17
+; GFX6-NEXT: v_mov_b32_e32 v10, s22
+; GFX6-NEXT: v_mov_b32_e32 v11, s23
+; GFX6-NEXT: s_lshr_b32 s22, s5, 16
+; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, s10
-; GFX6-NEXT: v_mov_b32_e32 v3, s11
-; GFX6-NEXT: v_mov_b32_e32 v4, s12
-; GFX6-NEXT: v_mov_b32_e32 v5, s13
-; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:400
+; GFX6-NEXT: v_mov_b32_e32 v12, s24
+; GFX6-NEXT: v_mov_b32_e32 v13, s25
+; GFX6-NEXT: s_lshr_b32 s24, s5, 14
+; GFX6-NEXT: v_mov_b32_e32 v14, s18
+; GFX6-NEXT: v_mov_b32_e32 v15, s19
+; GFX6-NEXT: s_lshr_b32 s18, s5, 15
+; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:64
+; GFX6-NEXT: v_mov_b32_e32 v16, s20
+; GFX6-NEXT: v_mov_b32_e32 v17, s21
+; GFX6-NEXT: s_lshr_b32 s20, s5, 12
+; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000
+; GFX6-NEXT: v_mov_b32_e32 v18, s14
+; GFX6-NEXT: v_mov_b32_e32 v19, s15
+; GFX6-NEXT: s_lshr_b32 s14, s5, 13
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s16
; GFX6-NEXT: v_mov_b32_e32 v3, s17
-; GFX6-NEXT: v_mov_b32_e32 v4, s14
-; GFX6-NEXT: v_mov_b32_e32 v5, s15
-; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:384
+; GFX6-NEXT: s_lshr_b32 s16, s5, 10
+; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, s38
-; GFX6-NEXT: v_mov_b32_e32 v3, s39
-; GFX6-NEXT: v_mov_b32_e32 v4, s40
-; GFX6-NEXT: v_mov_b32_e32 v5, s41
-; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:368
+; GFX6-NEXT: v_mov_b32_e32 v8, s12
+; GFX6-NEXT: v_mov_b32_e32 v9, s13
+; GFX6-NEXT: s_lshr_b32 s12, s5, 11
+; GFX6-NEXT: v_mov_b32_e32 v10, s10
+; GFX6-NEXT: v_mov_b32_e32 v11, s11
+; GFX6-NEXT: s_lshr_b32 s10, s5, 8
+; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, s42
-; GFX6-NEXT: v_mov_b32_e32 v3, s43
-; GFX6-NEXT: v_mov_b32_e32 v4, s44
-; GFX6-NEXT: v_mov_b32_e32 v5, s45
-; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:352
+; GFX6-NEXT: v_mov_b32_e32 v12, s6
+; GFX6-NEXT: v_mov_b32_e32 v13, s7
+; GFX6-NEXT: s_lshr_b32 s6, s5, 9
+; GFX6-NEXT: v_mov_b32_e32 v14, s8
+; GFX6-NEXT: v_mov_b32_e32 v15, s9
+; GFX6-NEXT: s_lshr_b32 s8, s5, 6
+; GFX6-NEXT: s_bfe_i64 s[34:35], s[44:45], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, s46
-; GFX6-NEXT: v_mov_b32_e32 v3, s47
-; GFX6-NEXT: v_mov_b32_e32 v4, s48
-; GFX6-NEXT: v_mov_b32_e32 v5, s49
-; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:336
+; GFX6-NEXT: v_mov_b32_e32 v16, s34
+; GFX6-NEXT: v_mov_b32_e32 v17, s35
+; GFX6-NEXT: s_lshr_b32 s34, s5, 7
+; GFX6-NEXT: v_mov_b32_e32 v18, s40
+; GFX6-NEXT: v_mov_b32_e32 v19, s41
+; GFX6-NEXT: s_lshr_b32 s40, s5, 4
+; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, s50
-; GFX6-NEXT: v_mov_b32_e32 v3, s51
-; GFX6-NEXT: v_mov_b32_e32 v4, s52
-; GFX6-NEXT: v_mov_b32_e32 v5, s53
-; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:320
+; GFX6-NEXT: v_mov_b32_e32 v0, s42
+; GFX6-NEXT: v_mov_b32_e32 v1, s43
+; GFX6-NEXT: s_lshr_b32 s42, s5, 5
+; GFX6-NEXT: v_mov_b32_e32 v2, s36
+; GFX6-NEXT: v_mov_b32_e32 v3, s37
+; GFX6-NEXT: s_lshr_b32 s36, s5, 2
+; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:480
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, s54
-; GFX6-NEXT: v_mov_b32_e32 v3, s55
-; GFX6-NEXT: v_mov_b32_e32 v4, s56
-; GFX6-NEXT: v_mov_b32_e32 v5, s57
-; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:304
+; GFX6-NEXT: v_mov_b32_e32 v8, s38
+; GFX6-NEXT: v_mov_b32_e32 v9, s39
+; GFX6-NEXT: s_lshr_b32 s38, s5, 3
+; GFX6-NEXT: s_lshr_b32 s44, s5, 1
+; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:464
+; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:448
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:432
+; GFX6-NEXT: v_mov_b32_e32 v10, s4
+; GFX6-NEXT: v_mov_b32_e32 v11, s5
+; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:416
+; GFX6-NEXT: s_waitcnt expcnt(1)
+; GFX6-NEXT: v_mov_b32_e32 v0, s30
+; GFX6-NEXT: v_mov_b32_e32 v1, s31
+; GFX6-NEXT: v_mov_b32_e32 v2, s26
+; GFX6-NEXT: v_mov_b32_e32 v3, s27
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:400
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, s58
-; GFX6-NEXT: v_mov_b32_e32 v3, s59
-; GFX6-NEXT: v_mov_b32_e32 v4, s60
-; GFX6-NEXT: v_mov_b32_e32 v5, s61
-; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:288
+; GFX6-NEXT: v_mov_b32_e32 v0, s22
+; GFX6-NEXT: v_mov_b32_e32 v1, s23
+; GFX6-NEXT: v_mov_b32_e32 v2, s28
+; GFX6-NEXT: v_mov_b32_e32 v3, s29
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:384
; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s24
+; GFX6-NEXT: v_mov_b32_e32 v1, s25
; GFX6-NEXT: v_mov_b32_e32 v2, s18
; GFX6-NEXT: v_mov_b32_e32 v3, s19
-; GFX6-NEXT: v_mov_b32_e32 v4, s6
-; GFX6-NEXT: v_mov_b32_e32 v5, s7
-; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:272
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:368
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, s4
-; GFX6-NEXT: v_mov_b32_e32 v3, s5
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:256
+; GFX6-NEXT: v_mov_b32_e32 v0, s20
+; GFX6-NEXT: v_mov_b32_e32 v1, s21
+; GFX6-NEXT: v_mov_b32_e32 v2, s14
+; GFX6-NEXT: v_mov_b32_e32 v3, s15
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:352
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s16
+; GFX6-NEXT: v_mov_b32_e32 v1, s17
+; GFX6-NEXT: v_mov_b32_e32 v2, s12
+; GFX6-NEXT: v_mov_b32_e32 v3, s13
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:336
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s10
+; GFX6-NEXT: v_mov_b32_e32 v1, s11
+; GFX6-NEXT: v_mov_b32_e32 v2, s6
+; GFX6-NEXT: v_mov_b32_e32 v3, s7
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:320
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s8
+; GFX6-NEXT: v_mov_b32_e32 v1, s9
+; GFX6-NEXT: v_mov_b32_e32 v2, s34
+; GFX6-NEXT: v_mov_b32_e32 v3, s35
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:304
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s40
+; GFX6-NEXT: v_mov_b32_e32 v1, s41
+; GFX6-NEXT: v_mov_b32_e32 v2, s42
+; GFX6-NEXT: v_mov_b32_e32 v3, s43
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:288
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s36
+; GFX6-NEXT: v_mov_b32_e32 v1, s37
+; GFX6-NEXT: v_mov_b32_e32 v2, s38
+; GFX6-NEXT: v_mov_b32_e32 v3, s39
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:272
+; GFX6-NEXT: v_mov_b32_e32 v8, s44
+; GFX6-NEXT: v_mov_b32_e32 v9, s45
+; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:256
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: constant_sextload_v64i1_to_v64i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane
-; GFX8-NEXT: s_mov_b32 s69, 0
-; GFX8-NEXT: s_mov_b32 s67, s69
-; GFX8-NEXT: s_mov_b32 s41, s69
-; GFX8-NEXT: s_mov_b32 s61, s69
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NEXT: s_mov_b32 s43, s69
-; GFX8-NEXT: s_mov_b32 s65, s69
-; GFX8-NEXT: s_mov_b32 s45, s69
-; GFX8-NEXT: s_mov_b32 s57, s69
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshr_b32 s4, s3, 8
; GFX8-NEXT: s_lshr_b32 s48, s3, 15
@@ -10210,15 +10175,14 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: s_lshr_b32 s36, s3, 27
; GFX8-NEXT: s_lshr_b32 s68, s3, 24
; GFX8-NEXT: s_lshr_b32 s38, s3, 25
-; GFX8-NEXT: s_lshr_b32 s66, s3, 22
+; GFX8-NEXT: s_lshr_b32 s64, s3, 22
; GFX8-NEXT: s_lshr_b32 s40, s3, 23
; GFX8-NEXT: s_lshr_b32 s60, s3, 20
; GFX8-NEXT: s_lshr_b32 s42, s3, 21
-; GFX8-NEXT: s_lshr_b32 s64, s3, 18
+; GFX8-NEXT: s_lshr_b32 s66, s3, 18
; GFX8-NEXT: s_lshr_b32 s44, s3, 19
; GFX8-NEXT: s_lshr_b32 s56, s3, 16
; GFX8-NEXT: s_lshr_b32 s46, s3, 17
-; GFX8-NEXT: s_mov_b32 s47, s69
; GFX8-NEXT: s_lshr_b32 s58, s3, 14
; GFX8-NEXT: s_lshr_b32 s62, s3, 12
; GFX8-NEXT: s_lshr_b32 s54, s3, 10
@@ -10227,14 +10191,13 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000
; GFX8-NEXT: s_lshr_b32 s52, s3, 11
; GFX8-NEXT: v_writelane_b32 v62, s4, 2
-; GFX8-NEXT: s_mov_b32 s23, s69
; GFX8-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000
-; GFX8-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000
+; GFX8-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000
; GFX8-NEXT: s_bfe_i64 s[68:69], s[68:69], 0x10000
; GFX8-NEXT: s_bfe_i64 s[70:71], s[70:71], 0x10000
; GFX8-NEXT: s_bfe_i64 s[72:73], s[72:73], 0x10000
; GFX8-NEXT: s_bfe_i64 s[74:75], s[74:75], 0x10000
-; GFX8-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000
+; GFX8-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000
; GFX8-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x10000
; GFX8-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x10000
; GFX8-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000
@@ -10266,9 +10229,9 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v12, s72
; GFX8-NEXT: v_mov_b32_e32 v0, s70
; GFX8-NEXT: v_mov_b32_e32 v8, s68
-; GFX8-NEXT: v_mov_b32_e32 v16, s66
+; GFX8-NEXT: v_mov_b32_e32 v16, s64
; GFX8-NEXT: v_mov_b32_e32 v20, s60
-; GFX8-NEXT: v_mov_b32_e32 v24, s64
+; GFX8-NEXT: v_mov_b32_e32 v24, s66
; GFX8-NEXT: v_mov_b32_e32 v28, s56
; GFX8-NEXT: v_mov_b32_e32 v32, s58
; GFX8-NEXT: v_mov_b32_e32 v36, s62
@@ -10329,11 +10292,11 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v3, s37
; GFX8-NEXT: v_mov_b32_e32 v9, s69
; GFX8-NEXT: v_mov_b32_e32 v11, s39
-; GFX8-NEXT: v_mov_b32_e32 v17, s67
+; GFX8-NEXT: v_mov_b32_e32 v17, s65
; GFX8-NEXT: v_mov_b32_e32 v19, s41
; GFX8-NEXT: v_mov_b32_e32 v21, s61
; GFX8-NEXT: v_mov_b32_e32 v23, s43
-; GFX8-NEXT: v_mov_b32_e32 v25, s65
+; GFX8-NEXT: v_mov_b32_e32 v25, s67
; GFX8-NEXT: v_mov_b32_e32 v27, s45
; GFX8-NEXT: v_mov_b32_e32 v29, s57
; GFX8-NEXT: v_mov_b32_e32 v31, s47
@@ -10999,59 +10962,48 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-LABEL: constant_sextload_v64i1_to_v64i64:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT: s_mov_b32 s67, 0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s57, s67
-; GFX12-NEXT: s_mov_b32 s63, s67
-; GFX12-NEXT: s_mov_b32 s45, s67
-; GFX12-NEXT: s_mov_b32 s53, s67
-; GFX12-NEXT: s_mov_b32 s31, s67
-; GFX12-NEXT: s_mov_b32 s41, s67
-; GFX12-NEXT: s_mov_b32 s19, s67
-; GFX12-NEXT: s_mov_b32 s27, s67
-; GFX12-NEXT: s_mov_b32 s47, s67
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[10:11], s[2:3], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s96, s3, 30
-; GFX12-NEXT: s_lshr_b32 s98, s3, 31
-; GFX12-NEXT: s_lshr_b32 s92, s3, 28
-; GFX12-NEXT: s_lshr_b32 s94, s3, 29
-; GFX12-NEXT: s_lshr_b32 s78, s3, 26
-; GFX12-NEXT: s_lshr_b32 s88, s3, 27
+; GFX12-NEXT: s_lshr_b32 s96, s11, 30
+; GFX12-NEXT: s_lshr_b32 s98, s11, 31
+; GFX12-NEXT: s_lshr_b32 s92, s11, 28
+; GFX12-NEXT: s_lshr_b32 s94, s11, 29
+; GFX12-NEXT: s_lshr_b32 s78, s11, 26
+; GFX12-NEXT: s_lshr_b32 s88, s11, 27
; GFX12-NEXT: s_bfe_i64 s[96:97], s[96:97], 0x10000
; GFX12-NEXT: s_bfe_i64 s[100:101], s[98:99], 0x10000
-; GFX12-NEXT: s_lshr_b32 s66, s3, 24
-; GFX12-NEXT: s_lshr_b32 s74, s3, 25
+; GFX12-NEXT: s_lshr_b32 s66, s11, 24
+; GFX12-NEXT: s_lshr_b32 s74, s11, 25
; GFX12-NEXT: s_bfe_i64 s[92:93], s[92:93], 0x10000
; GFX12-NEXT: s_bfe_i64 s[94:95], s[94:95], 0x10000
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s96
-; GFX12-NEXT: s_lshr_b32 s56, s3, 22
-; GFX12-NEXT: s_lshr_b32 s62, s3, 23
+; GFX12-NEXT: s_lshr_b32 s56, s11, 22
+; GFX12-NEXT: s_lshr_b32 s62, s11, 23
; GFX12-NEXT: v_dual_mov_b32 v2, s97 :: v_dual_mov_b32 v3, s100
; GFX12-NEXT: v_dual_mov_b32 v4, s101 :: v_dual_mov_b32 v5, s92
; GFX12-NEXT: s_bfe_i64 s[78:79], s[78:79], 0x10000
; GFX12-NEXT: s_bfe_i64 s[88:89], s[88:89], 0x10000
-; GFX12-NEXT: s_lshr_b32 s44, s3, 20
-; GFX12-NEXT: s_lshr_b32 s52, s3, 21
-; GFX12-NEXT: s_lshr_b32 s30, s3, 18
-; GFX12-NEXT: s_lshr_b32 s40, s3, 19
-; GFX12-NEXT: s_lshr_b32 s18, s3, 16
-; GFX12-NEXT: s_lshr_b32 s26, s3, 17
-; GFX12-NEXT: s_lshr_b32 s4, s3, 14
-; GFX12-NEXT: s_lshr_b32 s6, s3, 15
+; GFX12-NEXT: s_lshr_b32 s44, s11, 20
+; GFX12-NEXT: s_lshr_b32 s52, s11, 21
+; GFX12-NEXT: s_lshr_b32 s30, s11, 18
+; GFX12-NEXT: s_lshr_b32 s40, s11, 19
+; GFX12-NEXT: s_lshr_b32 s18, s11, 16
+; GFX12-NEXT: s_lshr_b32 s26, s11, 17
+; GFX12-NEXT: s_lshr_b32 s2, s11, 14
+; GFX12-NEXT: s_lshr_b32 s4, s11, 15
; GFX12-NEXT: v_dual_mov_b32 v6, s93 :: v_dual_mov_b32 v7, s94
; GFX12-NEXT: v_dual_mov_b32 v8, s95 :: v_dual_mov_b32 v9, s78
; GFX12-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000
; GFX12-NEXT: s_bfe_i64 s[74:75], s[74:75], 0x10000
-; GFX12-NEXT: s_lshr_b32 s8, s3, 12
-; GFX12-NEXT: s_lshr_b32 s10, s3, 13
+; GFX12-NEXT: s_lshr_b32 s6, s11, 12
+; GFX12-NEXT: s_lshr_b32 s8, s11, 13
; GFX12-NEXT: v_dual_mov_b32 v10, s79 :: v_dual_mov_b32 v11, s88
; GFX12-NEXT: v_dual_mov_b32 v12, s89 :: v_dual_mov_b32 v13, s66
; GFX12-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000
; GFX12-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x10000
-; GFX12-NEXT: s_lshr_b32 s12, s3, 10
-; GFX12-NEXT: s_lshr_b32 s14, s3, 11
+; GFX12-NEXT: s_lshr_b32 s12, s11, 10
+; GFX12-NEXT: s_lshr_b32 s14, s11, 11
; GFX12-NEXT: v_dual_mov_b32 v14, s67 :: v_dual_mov_b32 v15, s74
; GFX12-NEXT: v_dual_mov_b32 v16, s75 :: v_dual_mov_b32 v17, s56
; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000
@@ -11060,16 +11012,16 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000
; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000
; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000
-; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
-; GFX12-NEXT: s_lshr_b32 s16, s3, 8
-; GFX12-NEXT: s_lshr_b32 s20, s3, 9
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX12-NEXT: s_lshr_b32 s16, s11, 8
+; GFX12-NEXT: s_lshr_b32 s20, s11, 9
; GFX12-NEXT: v_dual_mov_b32 v18, s57 :: v_dual_mov_b32 v19, s62
; GFX12-NEXT: v_dual_mov_b32 v20, s63 :: v_dual_mov_b32 v21, s44
-; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000
; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000
-; GFX12-NEXT: s_lshr_b32 s22, s3, 6
-; GFX12-NEXT: s_lshr_b32 s24, s3, 7
+; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
+; GFX12-NEXT: s_lshr_b32 s22, s11, 6
+; GFX12-NEXT: s_lshr_b32 s24, s11, 7
; GFX12-NEXT: v_dual_mov_b32 v22, s45 :: v_dual_mov_b32 v23, s52
; GFX12-NEXT: v_dual_mov_b32 v24, s53 :: v_dual_mov_b32 v25, s30
; GFX12-NEXT: v_dual_mov_b32 v26, s31 :: v_dual_mov_b32 v27, s40
@@ -11087,39 +11039,39 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: global_store_b128 v0, v[21:24], s[0:1] offset:416
; GFX12-NEXT: global_store_b128 v0, v[25:28], s[0:1] offset:400
; GFX12-NEXT: global_store_b128 v0, v[29:32], s[0:1] offset:384
-; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s5
-; GFX12-NEXT: v_dual_mov_b32 v3, s6 :: v_dual_mov_b32 v4, s7
-; GFX12-NEXT: v_mov_b32_e32 v5, s8
-; GFX12-NEXT: s_lshr_b32 s28, s3, 4
-; GFX12-NEXT: s_lshr_b32 s34, s3, 5
-; GFX12-NEXT: s_lshr_b32 s36, s3, 2
-; GFX12-NEXT: s_lshr_b32 s38, s3, 3
+; GFX12-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3
+; GFX12-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5
+; GFX12-NEXT: v_mov_b32_e32 v5, s6
+; GFX12-NEXT: s_lshr_b32 s28, s11, 4
+; GFX12-NEXT: s_lshr_b32 s34, s11, 5
+; GFX12-NEXT: s_lshr_b32 s36, s11, 2
+; GFX12-NEXT: s_lshr_b32 s38, s11, 3
; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000
; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000
-; GFX12-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v7, s10
-; GFX12-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_mov_b32 v9, s12
-; GFX12-NEXT: s_lshr_b32 s42, s3, 1
-; GFX12-NEXT: s_mov_b32 s46, s3
+; GFX12-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
+; GFX12-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s12
+; GFX12-NEXT: s_lshr_b32 s42, s11, 1
+; GFX12-NEXT: s_mov_b32 s46, s11
; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000
; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000
; GFX12-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s14
; GFX12-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16
-; GFX12-NEXT: s_lshr_b32 s48, s2, 30
-; GFX12-NEXT: s_lshr_b32 s50, s2, 31
+; GFX12-NEXT: s_lshr_b32 s48, s10, 30
+; GFX12-NEXT: s_lshr_b32 s50, s10, 31
; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000
; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000
; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000
; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000
; GFX12-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s20
; GFX12-NEXT: v_dual_mov_b32 v16, s21 :: v_dual_mov_b32 v17, s22
-; GFX12-NEXT: s_lshr_b32 s54, s2, 28
-; GFX12-NEXT: s_lshr_b32 s58, s2, 29
+; GFX12-NEXT: s_lshr_b32 s54, s10, 28
+; GFX12-NEXT: s_lshr_b32 s58, s10, 29
; GFX12-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000
; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000
; GFX12-NEXT: v_dual_mov_b32 v18, s23 :: v_dual_mov_b32 v19, s24
; GFX12-NEXT: v_dual_mov_b32 v20, s25 :: v_dual_mov_b32 v21, s28
-; GFX12-NEXT: s_lshr_b32 s60, s2, 26
-; GFX12-NEXT: s_lshr_b32 s64, s2, 27
+; GFX12-NEXT: s_lshr_b32 s60, s10, 26
+; GFX12-NEXT: s_lshr_b32 s64, s10, 27
; GFX12-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000
; GFX12-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000
; GFX12-NEXT: v_dual_mov_b32 v22, s29 :: v_dual_mov_b32 v23, s34
@@ -11134,43 +11086,43 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_dual_mov_b32 v1, s36 :: v_dual_mov_b32 v2, s37
; GFX12-NEXT: v_dual_mov_b32 v3, s38 :: v_dual_mov_b32 v4, s39
; GFX12-NEXT: v_mov_b32_e32 v5, s46
-; GFX12-NEXT: s_lshr_b32 s68, s2, 24
-; GFX12-NEXT: s_lshr_b32 s70, s2, 25
-; GFX12-NEXT: s_lshr_b32 s72, s2, 22
-; GFX12-NEXT: s_lshr_b32 s76, s2, 23
+; GFX12-NEXT: s_lshr_b32 s68, s10, 24
+; GFX12-NEXT: s_lshr_b32 s70, s10, 25
+; GFX12-NEXT: s_lshr_b32 s72, s10, 22
+; GFX12-NEXT: s_lshr_b32 s76, s10, 23
; GFX12-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x10000
; GFX12-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000
; GFX12-NEXT: v_dual_mov_b32 v6, s47 :: v_dual_mov_b32 v7, s42
; GFX12-NEXT: v_dual_mov_b32 v8, s43 :: v_dual_mov_b32 v9, s48
-; GFX12-NEXT: s_lshr_b32 s80, s2, 20
-; GFX12-NEXT: s_lshr_b32 s82, s2, 21
+; GFX12-NEXT: s_lshr_b32 s80, s10, 20
+; GFX12-NEXT: s_lshr_b32 s82, s10, 21
; GFX12-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000
; GFX12-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000
; GFX12-NEXT: v_dual_mov_b32 v10, s49 :: v_dual_mov_b32 v11, s50
; GFX12-NEXT: v_dual_mov_b32 v12, s51 :: v_dual_mov_b32 v13, s54
-; GFX12-NEXT: s_lshr_b32 s84, s2, 18
-; GFX12-NEXT: s_lshr_b32 s86, s2, 19
+; GFX12-NEXT: s_lshr_b32 s84, s10, 18
+; GFX12-NEXT: s_lshr_b32 s86, s10, 19
; GFX12-NEXT: s_bfe_i64 s[76:77], s[76:77], 0x10000
; GFX12-NEXT: s_bfe_i64 s[72:73], s[72:73], 0x10000
; GFX12-NEXT: s_bfe_i64 s[70:71], s[70:71], 0x10000
; GFX12-NEXT: s_bfe_i64 s[68:69], s[68:69], 0x10000
; GFX12-NEXT: v_dual_mov_b32 v14, s55 :: v_dual_mov_b32 v15, s58
; GFX12-NEXT: v_dual_mov_b32 v16, s59 :: v_dual_mov_b32 v17, s60
-; GFX12-NEXT: s_lshr_b32 s90, s2, 16
-; GFX12-NEXT: s_lshr_b32 s98, s2, 17
+; GFX12-NEXT: s_lshr_b32 s90, s10, 16
+; GFX12-NEXT: s_lshr_b32 s98, s10, 17
; GFX12-NEXT: s_bfe_i64 s[82:83], s[82:83], 0x10000
; GFX12-NEXT: s_bfe_i64 s[80:81], s[80:81], 0x10000
; GFX12-NEXT: v_dual_mov_b32 v18, s61 :: v_dual_mov_b32 v19, s64
; GFX12-NEXT: v_dual_mov_b32 v20, s65 :: v_dual_mov_b32 v21, s68
-; GFX12-NEXT: s_lshr_b32 s96, s2, 14
-; GFX12-NEXT: s_lshr_b32 s100, s2, 15
-; GFX12-NEXT: s_lshr_b32 s94, s2, 13
-; GFX12-NEXT: s_lshr_b32 s88, s2, 11
-; GFX12-NEXT: s_lshr_b32 s74, s2, 9
-; GFX12-NEXT: s_lshr_b32 s62, s2, 7
-; GFX12-NEXT: s_lshr_b32 s52, s2, 5
-; GFX12-NEXT: s_lshr_b32 s40, s2, 3
-; GFX12-NEXT: s_lshr_b32 s26, s2, 1
+; GFX12-NEXT: s_lshr_b32 s96, s10, 14
+; GFX12-NEXT: s_lshr_b32 s100, s10, 15
+; GFX12-NEXT: s_lshr_b32 s94, s10, 13
+; GFX12-NEXT: s_lshr_b32 s88, s10, 11
+; GFX12-NEXT: s_lshr_b32 s74, s10, 9
+; GFX12-NEXT: s_lshr_b32 s62, s10, 7
+; GFX12-NEXT: s_lshr_b32 s52, s10, 5
+; GFX12-NEXT: s_lshr_b32 s40, s10, 3
+; GFX12-NEXT: s_lshr_b32 s26, s10, 1
; GFX12-NEXT: s_bfe_i64 s[86:87], s[86:87], 0x10000
; GFX12-NEXT: s_bfe_i64 s[84:85], s[84:85], 0x10000
; GFX12-NEXT: v_dual_mov_b32 v22, s69 :: v_dual_mov_b32 v23, s70
@@ -11185,19 +11137,19 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_dual_mov_b32 v1, s72 :: v_dual_mov_b32 v2, s73
; GFX12-NEXT: v_dual_mov_b32 v3, s76 :: v_dual_mov_b32 v4, s77
; GFX12-NEXT: v_mov_b32_e32 v5, s80
-; GFX12-NEXT: s_lshr_b32 s92, s2, 12
-; GFX12-NEXT: s_lshr_b32 s78, s2, 10
+; GFX12-NEXT: s_lshr_b32 s92, s10, 12
+; GFX12-NEXT: s_lshr_b32 s78, s10, 10
; GFX12-NEXT: s_bfe_i64 s[98:99], s[98:99], 0x10000
; GFX12-NEXT: s_bfe_i64 s[90:91], s[90:91], 0x10000
; GFX12-NEXT: v_dual_mov_b32 v6, s81 :: v_dual_mov_b32 v7, s82
; GFX12-NEXT: v_dual_mov_b32 v8, s83 :: v_dual_mov_b32 v9, s84
-; GFX12-NEXT: s_lshr_b32 s66, s2, 8
-; GFX12-NEXT: s_lshr_b32 s56, s2, 6
-; GFX12-NEXT: s_lshr_b32 s44, s2, 4
-; GFX12-NEXT: s_lshr_b32 s30, s2, 2
-; GFX12-NEXT: s_bfe_i64 s[18:19], s[2:3], 0x10000
+; GFX12-NEXT: s_lshr_b32 s66, s10, 8
+; GFX12-NEXT: s_lshr_b32 s56, s10, 6
+; GFX12-NEXT: s_lshr_b32 s44, s10, 4
+; GFX12-NEXT: s_lshr_b32 s30, s10, 2
+; GFX12-NEXT: s_bfe_i64 s[18:19], s[10:11], 0x10000
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_bfe_i64 s[2:3], s[26:27], 0x10000
+; GFX12-NEXT: s_bfe_i64 s[10:11], s[26:27], 0x10000
; GFX12-NEXT: s_bfe_i64 s[26:27], s[40:41], 0x10000
; GFX12-NEXT: s_bfe_i64 s[40:41], s[52:53], 0x10000
; GFX12-NEXT: s_bfe_i64 s[52:53], s[62:63], 0x10000
@@ -11241,8 +11193,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_dual_mov_b32 v16, s41 :: v_dual_mov_b32 v17, s30
; GFX12-NEXT: v_dual_mov_b32 v18, s31 :: v_dual_mov_b32 v19, s26
; GFX12-NEXT: v_dual_mov_b32 v20, s27 :: v_dual_mov_b32 v21, s18
-; GFX12-NEXT: v_dual_mov_b32 v22, s19 :: v_dual_mov_b32 v23, s2
-; GFX12-NEXT: v_mov_b32_e32 v24, s3
+; GFX12-NEXT: v_dual_mov_b32 v22, s19 :: v_dual_mov_b32 v23, s10
+; GFX12-NEXT: v_mov_b32_e32 v24, s11
; GFX12-NEXT: s_clause 0x5
; GFX12-NEXT: global_store_b128 v0, v[1:4], s[0:1] offset:80
; GFX12-NEXT: global_store_b128 v0, v[5:8], s[0:1] offset:64
@@ -11255,59 +11207,48 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX1250-LABEL: constant_sextload_v64i1_to_v64i64:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT: s_mov_b32 s67, 0
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-NEXT: s_mov_b32 s57, s67
-; GFX1250-NEXT: s_mov_b32 s63, s67
-; GFX1250-NEXT: s_mov_b32 s45, s67
-; GFX1250-NEXT: s_mov_b32 s53, s67
-; GFX1250-NEXT: s_mov_b32 s31, s67
-; GFX1250-NEXT: s_mov_b32 s41, s67
-; GFX1250-NEXT: s_mov_b32 s19, s67
-; GFX1250-NEXT: s_mov_b32 s27, s67
-; GFX1250-NEXT: s_mov_b32 s47, s67
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1250-NEXT: s_load_b64 s[10:11], s[2:3], 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_lshr_b32 s96, s3, 30
-; GFX1250-NEXT: s_lshr_b32 s98, s3, 31
-; GFX1250-NEXT: s_lshr_b32 s92, s3, 28
-; GFX1250-NEXT: s_lshr_b32 s94, s3, 29
-; GFX1250-NEXT: s_lshr_b32 s78, s3, 26
-; GFX1250-NEXT: s_lshr_b32 s88, s3, 27
+; GFX1250-NEXT: s_lshr_b32 s96, s11, 30
+; GFX1250-NEXT: s_lshr_b32 s98, s11, 31
+; GFX1250-NEXT: s_lshr_b32 s92, s11, 28
+; GFX1250-NEXT: s_lshr_b32 s94, s11, 29
+; GFX1250-NEXT: s_lshr_b32 s78, s11, 26
+; GFX1250-NEXT: s_lshr_b32 s88, s11, 27
; GFX1250-NEXT: s_bfe_i64 s[96:97], s[96:97], 0x10000
; GFX1250-NEXT: s_bfe_i64 s[100:101], s[98:99], 0x10000
-; GFX1250-NEXT: s_lshr_b32 s66, s3, 24
-; GFX1250-NEXT: s_lshr_b32 s74, s3, 25
+; GFX1250-NEXT: s_lshr_b32 s66, s11, 24
+; GFX1250-NEXT: s_lshr_b32 s74, s11, 25
; GFX1250-NEXT: s_bfe_i64 s[92:93], s[92:93], 0x10000
; GFX1250-NEXT: s_bfe_i64 s[94:95], s[94:95], 0x10000
; GFX1250-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v0, s96
-; GFX1250-NEXT: s_lshr_b32 s56, s3, 22
-; GFX1250-NEXT: s_lshr_b32 s62, s3, 23
+; GFX1250-NEXT: s_lshr_b32 s56, s11, 22
+; GFX1250-NEXT: s_lshr_b32 s62, s11, 23
; GFX1250-NEXT: v_dual_mov_b32 v1, s97 :: v_dual_mov_b32 v2, s100
; GFX1250-NEXT: v_dual_mov_b32 v3, s101 :: v_dual_mov_b32 v4, s92
; GFX1250-NEXT: s_bfe_i64 s[78:79], s[78:79], 0x10000
; GFX1250-NEXT: s_bfe_i64 s[88:89], s[88:89], 0x10000
-; GFX1250-NEXT: s_lshr_b32 s44, s3, 20
-; GFX1250-NEXT: s_lshr_b32 s52, s3, 21
-; GFX1250-NEXT: s_lshr_b32 s30, s3, 18
-; GFX1250-NEXT: s_lshr_b32 s40, s3, 19
-; GFX1250-NEXT: s_lshr_b32 s18, s3, 16
-; GFX1250-NEXT: s_lshr_b32 s26, s3, 17
-; GFX1250-NEXT: s_lshr_b32 s4, s3, 14
-; GFX1250-NEXT: s_lshr_b32 s6, s3, 15
+; GFX1250-NEXT: s_lshr_b32 s44, s11, 20
+; GFX1250-NEXT: s_lshr_b32 s52, s11, 21
+; GFX1250-NEXT: s_lshr_b32 s30, s11, 18
+; GFX1250-NEXT: s_lshr_b32 s40, s11, 19
+; GFX1250-NEXT: s_lshr_b32 s18, s11, 16
+; GFX1250-NEXT: s_lshr_b32 s26, s11, 17
+; GFX1250-NEXT: s_lshr_b32 s2, s11, 14
+; GFX1250-NEXT: s_lshr_b32 s4, s11, 15
; GFX1250-NEXT: v_dual_mov_b32 v5, s93 :: v_dual_mov_b32 v6, s94
; GFX1250-NEXT: v_dual_mov_b32 v7, s95 :: v_dual_mov_b32 v10, s78
; GFX1250-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000
; GFX1250-NEXT: s_bfe_i64 s[74:75], s[74:75], 0x10000
-; GFX1250-NEXT: s_lshr_b32 s8, s3, 12
-; GFX1250-NEXT: s_lshr_b32 s10, s3, 13
+; GFX1250-NEXT: s_lshr_b32 s6, s11, 12
+; GFX1250-NEXT: s_lshr_b32 s8, s11, 13
; GFX1250-NEXT: v_dual_mov_b32 v11, s79 :: v_dual_mov_b32 v12, s88
; GFX1250-NEXT: v_dual_mov_b32 v13, s89 :: v_dual_mov_b32 v14, s66
; GFX1250-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000
; GFX1250-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x10000
-; GFX1250-NEXT: s_lshr_b32 s12, s3, 10
-; GFX1250-NEXT: s_lshr_b32 s14, s3, 11
+; GFX1250-NEXT: s_lshr_b32 s12, s11, 10
+; GFX1250-NEXT: s_lshr_b32 s14, s11, 11
; GFX1250-NEXT: v_dual_mov_b32 v15, s67 :: v_dual_mov_b32 v16, s74
; GFX1250-NEXT: v_dual_mov_b32 v17, s75 :: v_dual_mov_b32 v18, s56
; GFX1250-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000
@@ -11316,16 +11257,16 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX1250-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000
; GFX1250-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000
; GFX1250-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000
-; GFX1250-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
; GFX1250-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
-; GFX1250-NEXT: s_lshr_b32 s16, s3, 8
-; GFX1250-NEXT: s_lshr_b32 s20, s3, 9
+; GFX1250-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX1250-NEXT: s_lshr_b32 s16, s11, 8
+; GFX1250-NEXT: s_lshr_b32 s20, s11, 9
; GFX1250-NEXT: v_dual_mov_b32 v19, s57 :: v_dual_mov_b32 v20, s62
; GFX1250-NEXT: v_dual_mov_b32 v21, s63 :: v_dual_mov_b32 v22, s44
-; GFX1250-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000
; GFX1250-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000
-; GFX1250-NEXT: s_lshr_b32 s22, s3, 6
-; GFX1250-NEXT: s_lshr_b32 s24, s3, 7
+; GFX1250-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
+; GFX1250-NEXT: s_lshr_b32 s22, s11, 6
+; GFX1250-NEXT: s_lshr_b32 s24, s11, 7
; GFX1250-NEXT: v_dual_mov_b32 v23, s45 :: v_dual_mov_b32 v24, s52
; GFX1250-NEXT: v_dual_mov_b32 v25, s53 :: v_dual_mov_b32 v26, s30
; GFX1250-NEXT: v_dual_mov_b32 v27, s31 :: v_dual_mov_b32 v28, s40
@@ -11344,28 +11285,28 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX1250-NEXT: global_store_b128 v8, v[26:29], s[0:1] offset:400
; GFX1250-NEXT: global_store_b128 v8, v[30:33], s[0:1] offset:384
; GFX1250-NEXT: s_wait_xcnt 0x7
-; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX1250-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX1250-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1250-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX1250-NEXT: s_wait_xcnt 0x6
-; GFX1250-NEXT: v_mov_b32_e32 v4, s8
-; GFX1250-NEXT: s_lshr_b32 s28, s3, 4
-; GFX1250-NEXT: s_lshr_b32 s34, s3, 5
-; GFX1250-NEXT: s_lshr_b32 s36, s3, 2
-; GFX1250-NEXT: s_lshr_b32 s38, s3, 3
+; GFX1250-NEXT: v_mov_b32_e32 v4, s6
+; GFX1250-NEXT: s_lshr_b32 s28, s11, 4
+; GFX1250-NEXT: s_lshr_b32 s34, s11, 5
+; GFX1250-NEXT: s_lshr_b32 s36, s11, 2
+; GFX1250-NEXT: s_lshr_b32 s38, s11, 3
; GFX1250-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000
; GFX1250-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000
-; GFX1250-NEXT: v_dual_mov_b32 v5, s9 :: v_dual_mov_b32 v6, s10
+; GFX1250-NEXT: v_dual_mov_b32 v5, s7 :: v_dual_mov_b32 v6, s8
; GFX1250-NEXT: s_wait_xcnt 0x5
-; GFX1250-NEXT: v_dual_mov_b32 v7, s11 :: v_dual_mov_b32 v10, s12
-; GFX1250-NEXT: s_lshr_b32 s42, s3, 1
-; GFX1250-NEXT: s_mov_b32 s46, s3
+; GFX1250-NEXT: v_dual_mov_b32 v7, s9 :: v_dual_mov_b32 v10, s12
+; GFX1250-NEXT: s_lshr_b32 s42, s11, 1
+; GFX1250-NEXT: s_mov_b32 s46, s11
; GFX1250-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000
; GFX1250-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000
; GFX1250-NEXT: v_dual_mov_b32 v11, s13 :: v_dual_mov_b32 v12, s14
; GFX1250-NEXT: s_wait_xcnt 0x4
; GFX1250-NEXT: v_dual_mov_b32 v13, s15 :: v_dual_mov_b32 v14, s16
-; GFX1250-NEXT: s_lshr_b32 s48, s2, 30
-; GFX1250-NEXT: s_lshr_b32 s50, s2, 31
+; GFX1250-NEXT: s_lshr_b32 s48, s10, 30
+; GFX1250-NEXT: s_lshr_b32 s50, s10, 31
; GFX1250-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000
; GFX1250-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000
; GFX1250-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000
@@ -11373,15 +11314,15 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX1250-NEXT: v_dual_mov_b32 v15, s17 :: v_dual_mov_b32 v16, s20
; GFX1250-NEXT: s_wait_xcnt 0x3
; GFX1250-NEXT: v_dual_mov_b32 v17, s21 :: v_dual_mov_b32 v18, s22
-; GFX1250-NEXT: s_lshr_b32 s54, s2, 28
-; GFX1250-NEXT: s_lshr_b32 s58, s2, 29
+; GFX1250-NEXT: s_lshr_b32 s54, s10, 28
+; GFX1250-NEXT: s_lshr_b32 s58, s10, 29
; GFX1250-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000
; GFX1250-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000
; GFX1250-NEXT: v_dual_mov_b32 v19, s23 :: v_dual_mov_b32 v20, s24
; GFX1250-NEXT: s_wait_xcnt 0x2
; GFX1250-NEXT: v_dual_mov_b32 v21, s25 :: v_dual_mov_b32 v22, s28
-; GFX1250-NEXT: s_lshr_b32 s60, s2, 26
-; GFX1250-NEXT: s_lshr_b32 s64, s2, 27
+; GFX1250-NEXT: s_lshr_b32 s60, s10, 26
+; GFX1250-NEXT: s_lshr_b32 s64, s10, 27
; GFX1250-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000
; GFX1250-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000
; GFX1250-NEXT: v_dual_mov_b32 v23, s29 :: v_dual_mov_b32 v24, s34
@@ -11398,24 +11339,24 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX1250-NEXT: v_dual_mov_b32 v2, s38 :: v_dual_mov_b32 v3, s39
; GFX1250-NEXT: s_wait_xcnt 0x4
; GFX1250-NEXT: v_mov_b32_e32 v4, s46
-; GFX1250-NEXT: s_lshr_b32 s68, s2, 24
-; GFX1250-NEXT: s_lshr_b32 s70, s2, 25
-; GFX1250-NEXT: s_lshr_b32 s72, s2, 22
-; GFX1250-NEXT: s_lshr_b32 s76, s2, 23
+; GFX1250-NEXT: s_lshr_b32 s68, s10, 24
+; GFX1250-NEXT: s_lshr_b32 s70, s10, 25
+; GFX1250-NEXT: s_lshr_b32 s72, s10, 22
+; GFX1250-NEXT: s_lshr_b32 s76, s10, 23
; GFX1250-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x10000
; GFX1250-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000
; GFX1250-NEXT: v_dual_mov_b32 v5, s47 :: v_dual_mov_b32 v6, s42
; GFX1250-NEXT: s_wait_xcnt 0x3
; GFX1250-NEXT: v_dual_mov_b32 v7, s43 :: v_dual_mov_b32 v10, s48
-; GFX1250-NEXT: s_lshr_b32 s80, s2, 20
-; GFX1250-NEXT: s_lshr_b32 s82, s2, 21
+; GFX1250-NEXT: s_lshr_b32 s80, s10, 20
+; GFX1250-NEXT: s_lshr_b32 s82, s10, 21
; GFX1250-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000
; GFX1250-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000
; GFX1250-NEXT: v_dual_mov_b32 v11, s49 :: v_dual_mov_b32 v12, s50
; GFX1250-NEXT: s_wait_xcnt 0x2
; GFX1250-NEXT: v_dual_mov_b32 v13, s51 :: v_dual_mov_b32 v14, s54
-; GFX1250-NEXT: s_lshr_b32 s84, s2, 18
-; GFX1250-NEXT: s_lshr_b32 s86, s2, 19
+; GFX1250-NEXT: s_lshr_b32 s84, s10, 18
+; GFX1250-NEXT: s_lshr_b32 s86, s10, 19
; GFX1250-NEXT: s_bfe_i64 s[76:77], s[76:77], 0x10000
; GFX1250-NEXT: s_bfe_i64 s[72:73], s[72:73], 0x10000
; GFX1250-NEXT: s_bfe_i64 s[70:71], s[70:71], 0x10000
@@ -11423,22 +11364,22 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX1250-NEXT: v_dual_mov_b32 v15, s55 :: v_dual_mov_b32 v16, s58
; GFX1250-NEXT: s_wait_xcnt 0x1
; GFX1250-NEXT: v_dual_mov_b32 v17, s59 :: v_dual_mov_b32 v18, s60
-; GFX1250-NEXT: s_lshr_b32 s90, s2, 16
-; GFX1250-NEXT: s_lshr_b32 s98, s2, 17
+; GFX1250-NEXT: s_lshr_b32 s90, s10, 16
+; GFX1250-NEXT: s_lshr_b32 s98, s10, 17
; GFX1250-NEXT: s_bfe_i64 s[82:83], s[82:83], 0x10000
; GFX1250-NEXT: s_bfe_i64 s[80:81], s[80:81], 0x10000
; GFX1250-NEXT: v_dual_mov_b32 v19, s61 :: v_dual_mov_b32 v20, s64
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v21, s65 :: v_dual_mov_b32 v22, s68
-; GFX1250-NEXT: s_lshr_b32 s96, s2, 14
-; GFX1250-NEXT: s_lshr_b32 s100, s2, 15
-; GFX1250-NEXT: s_lshr_b32 s94, s2, 13
-; GFX1250-NEXT: s_lshr_b32 s88, s2, 11
-; GFX1250-NEXT: s_lshr_b32 s74, s2, 9
-; GFX1250-NEXT: s_lshr_b32 s62, s2, 7
-; GFX1250-NEXT: s_lshr_b32 s52, s2, 5
-; GFX1250-NEXT: s_lshr_b32 s40, s2, 3
-; GFX1250-NEXT: s_lshr_b32 s26, s2, 1
+; GFX1250-NEXT: s_lshr_b32 s96, s10, 14
+; GFX1250-NEXT: s_lshr_b32 s100, s10, 15
+; GFX1250-NEXT: s_lshr_b32 s94, s10, 13
+; GFX1250-NEXT: s_lshr_b32 s88, s10, 11
+; GFX1250-NEXT: s_lshr_b32 s74, s10, 9
+; GFX1250-NEXT: s_lshr_b32 s62, s10, 7
+; GFX1250-NEXT: s_lshr_b32 s52, s10, 5
+; GFX1250-NEXT: s_lshr_b32 s40, s10, 3
+; GFX1250-NEXT: s_lshr_b32 s26, s10, 1
; GFX1250-NEXT: s_bfe_i64 s[86:87], s[86:87], 0x10000
; GFX1250-NEXT: s_bfe_i64 s[84:85], s[84:85], 0x10000
; GFX1250-NEXT: v_dual_mov_b32 v23, s69 :: v_dual_mov_b32 v24, s70
@@ -11455,19 +11396,19 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX1250-NEXT: v_dual_mov_b32 v2, s76 :: v_dual_mov_b32 v3, s77
; GFX1250-NEXT: s_wait_xcnt 0x4
; GFX1250-NEXT: v_mov_b32_e32 v4, s80
-; GFX1250-NEXT: s_lshr_b32 s92, s2, 12
-; GFX1250-NEXT: s_lshr_b32 s78, s2, 10
+; GFX1250-NEXT: s_lshr_b32 s92, s10, 12
+; GFX1250-NEXT: s_lshr_b32 s78, s10, 10
; GFX1250-NEXT: s_bfe_i64 s[98:99], s[98:99], 0x10000
; GFX1250-NEXT: s_bfe_i64 s[90:91], s[90:91], 0x10000
; GFX1250-NEXT: v_dual_mov_b32 v5, s81 :: v_dual_mov_b32 v6, s82
; GFX1250-NEXT: s_wait_xcnt 0x3
; GFX1250-NEXT: v_dual_mov_b32 v7, s83 :: v_dual_mov_b32 v10, s84
-; GFX1250-NEXT: s_lshr_b32 s66, s2, 8
-; GFX1250-NEXT: s_lshr_b32 s56, s2, 6
-; GFX1250-NEXT: s_lshr_b32 s44, s2, 4
-; GFX1250-NEXT: s_lshr_b32 s30, s2, 2
-; GFX1250-NEXT: s_bfe_i64 s[18:19], s[2:3], 0x10000
-; GFX1250-NEXT: s_bfe_i64 s[2:3], s[26:27], 0x10000
+; GFX1250-NEXT: s_lshr_b32 s66, s10, 8
+; GFX1250-NEXT: s_lshr_b32 s56, s10, 6
+; GFX1250-NEXT: s_lshr_b32 s44, s10, 4
+; GFX1250-NEXT: s_lshr_b32 s30, s10, 2
+; GFX1250-NEXT: s_bfe_i64 s[18:19], s[10:11], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[10:11], s[26:27], 0x10000
; GFX1250-NEXT: s_bfe_i64 s[26:27], s[40:41], 0x10000
; GFX1250-NEXT: s_bfe_i64 s[40:41], s[52:53], 0x10000
; GFX1250-NEXT: s_bfe_i64 s[52:53], s[62:63], 0x10000
@@ -11517,8 +11458,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX1250-NEXT: v_dual_mov_b32 v19, s31 :: v_dual_mov_b32 v20, s26
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v21, s27 :: v_dual_mov_b32 v22, s18
-; GFX1250-NEXT: v_dual_mov_b32 v23, s19 :: v_dual_mov_b32 v24, s2
-; GFX1250-NEXT: v_mov_b32_e32 v25, s3
+; GFX1250-NEXT: v_dual_mov_b32 v23, s19 :: v_dual_mov_b32 v24, s10
+; GFX1250-NEXT: v_mov_b32_e32 v25, s11
; GFX1250-NEXT: s_clause 0x5
; GFX1250-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:80
; GFX1250-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:64
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index 51a42183f408d..a135b43bad0fe 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -6178,7 +6178,6 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, s5
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s4, 16
@@ -6204,7 +6203,6 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 s5, 0
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -6544,28 +6542,26 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s9
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s7
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s6, 16
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s5
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s6, 16
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s5
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s4, 16
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[4:5], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[6:7], 0x100000
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s13, s5, 31
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s15, s5, 16
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[10:11], 0x100000
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s20, s7, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s21, s7, 16
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[12:13], 0x100000
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s12, s7, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s20, s7, 16
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[8:9], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[14:15], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[12:13], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s7
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s20
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s12
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18
@@ -6590,25 +6586,23 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GCN-HSA-NEXT: s_mov_b32 s3, 0
-; GCN-HSA-NEXT: s_mov_b32 s9, s3
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_mov_b32 s2, s7
-; GCN-HSA-NEXT: s_lshr_b32 s10, s6, 16
-; GCN-HSA-NEXT: s_mov_b32 s8, s5
+; GCN-HSA-NEXT: s_lshr_b32 s8, s6, 16
+; GCN-HSA-NEXT: s_mov_b32 s10, s5
; GCN-HSA-NEXT: s_lshr_b32 s12, s4, 16
-; GCN-HSA-NEXT: s_ashr_i32 s11, s5, 16
+; GCN-HSA-NEXT: s_ashr_i32 s13, s5, 16
; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[4:5], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[6:7], 0x100000
; GCN-HSA-NEXT: s_ashr_i32 s6, s5, 31
-; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[8:9], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[10:11], 0x100000
-; GCN-HSA-NEXT: s_ashr_i32 s10, s7, 31
+; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[12:13], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000
+; GCN-HSA-NEXT: s_ashr_i32 s12, s7, 31
; GCN-HSA-NEXT: s_ashr_i32 s7, s7, 16
; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
@@ -6619,7 +6613,7 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s10
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s12
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
@@ -6632,17 +6626,17 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s6
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_endpgm
@@ -7170,19 +7164,15 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, 0xf000
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s13, 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, -1
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s15, s13
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s17, s13
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s19, s13
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s7
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s6, 16
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, s5
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s4, 16
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s16, s3
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s2, 16
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, s1
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s6, 16
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s16, s5
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s4, 16
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s20, s3
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s2, 16
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s24, s1
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s0, 16
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[0:1], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[2:3], 0x100000
@@ -7190,40 +7180,40 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[6:7], 0x100000
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s21, s1, 31
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s23, s1, 16
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[0:1], s[18:19], 0x100000
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s25, s3, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s3, 16
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[2:3], s[16:17], 0x100000
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s5, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s38, s5, 16
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x100000
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s7, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s40, s7, 16
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[0:1], s[24:25], 0x100000
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s24, s3, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s25, s3, 16
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[2:3], s[20:21], 0x100000
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s20, s5, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s5, 16
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[16:17], 0x100000
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s7, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s38, s7, 16
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[12:13], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[26:27], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[24:25], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[22:23], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[20:21], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s7
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s40
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s39
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s38
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:112
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s37
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s38
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s33
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s27
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s20
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[8:11], 0 offset:80
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s34
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s35
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s2
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s3
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s27
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s25
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s25
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s24
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[8:11], 0 offset:48
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s30
@@ -7236,14 +7226,14 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s28
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s29
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s18
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s19
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s14
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s15
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:96
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s16
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s17
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s18
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s19
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:64
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s14
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s15
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s16
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s17
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:32
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s12
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s13
@@ -7258,10 +7248,6 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
-; GCN-HSA-NEXT: s_mov_b32 s11, 0
-; GCN-HSA-NEXT: s_mov_b32 s15, s11
-; GCN-HSA-NEXT: s_mov_b32 s19, s11
-; GCN-HSA-NEXT: s_mov_b32 s23, s11
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_mov_b32 s10, s7
; GCN-HSA-NEXT: s_lshr_b32 s12, s6, 16
@@ -8323,13 +8309,11 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s53, 0
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s19, s53
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s52, s15
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, s13
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s40, s11
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s42, s9
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, s15
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s20, s13
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s22, s11
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s24, s9
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s23, s1, 31
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s25, s1, 16
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s3, 31
@@ -8339,26 +8323,29 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s35, s7, 31
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s37, s7, 16
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s9, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s56, s9, 16
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s57, s11, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s58, s11, 16
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s59, s13, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s60, s13, 16
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[48:49], s[24:25], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[50:51], s[22:23], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[52:53], s[20:21], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[54:55], s[18:19], 0x100000
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s61, s15, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s62, s15, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s14, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s12, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s10, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s8, 16
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s50, s7
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s6, 16
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s43, s9, 16
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s45, s11, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s47, s11, 16
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s41, s13, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s56, s13, 16
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s57, s15, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s58, s15, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s14, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s12, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s10, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s8, 16
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s46, s7
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s6, 16
; GCN-NOHSA-SI-NEXT: s_mov_b32 s44, s5
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s4, 16
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s46, s3
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s2, 16
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s48, s1
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s0, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s4, 16
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s42, s3
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s2, 16
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s40, s1
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s0, 16
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000
@@ -8366,119 +8353,110 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[2:3], s[12:13], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[14:15], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[52:53], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s41, s53
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s43, s53
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s51, s53
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s45, s53
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s47, s53
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s49, s53
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s14
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s15
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s54
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s55
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s13
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s54
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s55
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s52
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s53
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s2
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s3
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s50
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s51
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s48
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s49
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s58
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s57
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s56
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s41
; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[40:41], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[42:43], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[50:51], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[48:49], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[46:47], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s12
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s13
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s62
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s61
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s60
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s59
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s58
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s57
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s10
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s14
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[46:47], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[44:45], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[42:43], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x100000
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s47
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s45
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s43
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s39
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:240
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s11
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s15
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:208
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s16
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:176
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s56
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s39
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s17
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:144
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s12
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s13
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s37
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s35
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s44
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s45
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s33
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s31
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:80
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s42
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s43
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s29
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s27
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:208
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s14
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:176
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s15
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s33
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s31
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:144
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s16
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s17
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s29
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[38:39], 0x100000
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s27
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(2)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s40
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s41
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s25
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s23
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s8
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s9
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[38:39], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[36:37], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[34:35], 0x100000
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s10
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s11
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s9
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[36:37], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[34:35], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[30:31], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[28:29], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x100000
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s6
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s7
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s4
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s5
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s20
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s21
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s25
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s6
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s7
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s5
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s22
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s23
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s24
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s25
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s26
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s27
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:160
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s16
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s17
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:128
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s14
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s15
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:96
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(2)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s12
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s13
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:64
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, s10
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, s11
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s18
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s19
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s22
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s23
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s26
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s27
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s16
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s17
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:96
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s14
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s15
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s10
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s11
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s9
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i64:
@@ -8487,97 +8465,89 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-HSA-NEXT: s_mov_b32 s25, 0
-; GCN-HSA-NEXT: s_mov_b32 s51, s25
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
-; GCN-HSA-NEXT: s_mov_b32 s55, s25
-; GCN-HSA-NEXT: s_mov_b32 s31, s25
-; GCN-HSA-NEXT: s_mov_b32 s57, s25
-; GCN-HSA-NEXT: s_mov_b32 s61, s25
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_mov_b32 s24, s15
-; GCN-HSA-NEXT: s_ashr_i32 s39, s3, 31
-; GCN-HSA-NEXT: s_ashr_i32 s40, s3, 16
-; GCN-HSA-NEXT: s_ashr_i32 s59, s11, 16
-; GCN-HSA-NEXT: s_ashr_i32 s63, s13, 31
-; GCN-HSA-NEXT: s_ashr_i32 s67, s13, 16
-; GCN-HSA-NEXT: s_ashr_i32 s71, s15, 31
-; GCN-HSA-NEXT: s_lshr_b32 s48, s14, 16
-; GCN-HSA-NEXT: s_mov_b32 s50, s13
-; GCN-HSA-NEXT: s_lshr_b32 s52, s12, 16
-; GCN-HSA-NEXT: s_mov_b32 s54, s11
+; GCN-HSA-NEXT: s_ashr_i32 s37, s3, 31
+; GCN-HSA-NEXT: s_ashr_i32 s38, s3, 16
+; GCN-HSA-NEXT: s_ashr_i32 s57, s11, 16
+; GCN-HSA-NEXT: s_ashr_i32 s59, s13, 31
+; GCN-HSA-NEXT: s_ashr_i32 s61, s13, 16
+; GCN-HSA-NEXT: s_ashr_i32 s63, s15, 31
+; GCN-HSA-NEXT: s_ashr_i32 s65, s15, 16
+; GCN-HSA-NEXT: s_lshr_b32 s46, s14, 16
+; GCN-HSA-NEXT: s_mov_b32 s48, s13
+; GCN-HSA-NEXT: s_lshr_b32 s50, s12, 16
+; GCN-HSA-NEXT: s_mov_b32 s52, s11
; GCN-HSA-NEXT: s_lshr_b32 s34, s10, 16
; GCN-HSA-NEXT: s_mov_b32 s30, s9
; GCN-HSA-NEXT: s_lshr_b32 s28, s8, 16
-; GCN-HSA-NEXT: s_mov_b32 s56, s7
-; GCN-HSA-NEXT: s_lshr_b32 s58, s6, 16
-; GCN-HSA-NEXT: s_mov_b32 s60, s5
-; GCN-HSA-NEXT: s_lshr_b32 s62, s4, 16
-; GCN-HSA-NEXT: s_mov_b32 s64, s3
-; GCN-HSA-NEXT: s_lshr_b32 s66, s2, 16
-; GCN-HSA-NEXT: s_mov_b32 s68, s1
-; GCN-HSA-NEXT: s_lshr_b32 s70, s0, 16
+; GCN-HSA-NEXT: s_mov_b32 s54, s7
+; GCN-HSA-NEXT: s_lshr_b32 s56, s6, 16
+; GCN-HSA-NEXT: s_mov_b32 s58, s5
+; GCN-HSA-NEXT: s_lshr_b32 s60, s4, 16
+; GCN-HSA-NEXT: s_mov_b32 s62, s3
+; GCN-HSA-NEXT: s_lshr_b32 s64, s2, 16
+; GCN-HSA-NEXT: s_mov_b32 s66, s1
+; GCN-HSA-NEXT: s_lshr_b32 s68, s0, 16
; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[2:3], 0x100000
-; GCN-HSA-NEXT: s_mov_b32 s65, s25
-; GCN-HSA-NEXT: s_mov_b32 s69, s25
; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[24:25], 0x100000
; GCN-HSA-NEXT: s_ashr_i32 s33, s1, 31
-; GCN-HSA-NEXT: s_ashr_i32 s38, s1, 16
-; GCN-HSA-NEXT: s_ashr_i32 s41, s5, 31
-; GCN-HSA-NEXT: s_ashr_i32 s42, s5, 16
-; GCN-HSA-NEXT: s_ashr_i32 s43, s7, 31
-; GCN-HSA-NEXT: s_ashr_i32 s44, s7, 16
-; GCN-HSA-NEXT: s_ashr_i32 s45, s9, 31
-; GCN-HSA-NEXT: s_ashr_i32 s46, s9, 16
-; GCN-HSA-NEXT: s_ashr_i32 s47, s11, 31
-; GCN-HSA-NEXT: s_ashr_i32 s76, s15, 16
+; GCN-HSA-NEXT: s_ashr_i32 s36, s1, 16
+; GCN-HSA-NEXT: s_ashr_i32 s39, s5, 31
+; GCN-HSA-NEXT: s_ashr_i32 s40, s5, 16
+; GCN-HSA-NEXT: s_ashr_i32 s41, s7, 31
+; GCN-HSA-NEXT: s_ashr_i32 s42, s7, 16
+; GCN-HSA-NEXT: s_ashr_i32 s43, s9, 31
+; GCN-HSA-NEXT: s_ashr_i32 s44, s9, 16
+; GCN-HSA-NEXT: s_ashr_i32 s45, s11, 31
; GCN-HSA-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[4:5], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[6:7], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[8:9], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[36:37], s[10:11], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[70:71], s[10:11], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[72:73], s[12:13], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[74:75], s[14:15], 0x100000
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[70:71], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[68:69], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[66:67], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[64:65], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[62:63], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[60:61], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[58:59], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[56:57], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[68:69], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[66:67], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[64:65], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[62:63], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[60:61], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[58:59], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[56:57], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[54:55], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x100000
-; GCN-HSA-NEXT: s_add_u32 s56, s16, 0xf0
-; GCN-HSA-NEXT: s_addc_u32 s57, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v6, s48
-; GCN-HSA-NEXT: s_add_u32 s48, s16, 0xe0
-; GCN-HSA-NEXT: v_mov_b32_e32 v7, s49
-; GCN-HSA-NEXT: s_addc_u32 s49, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v24, s48
-; GCN-HSA-NEXT: v_mov_b32_e32 v25, s49
-; GCN-HSA-NEXT: s_add_u32 s48, s16, 0xd0
-; GCN-HSA-NEXT: s_addc_u32 s49, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v26, s48
-; GCN-HSA-NEXT: v_mov_b32_e32 v27, s49
-; GCN-HSA-NEXT: s_add_u32 s48, s16, 0xc0
-; GCN-HSA-NEXT: s_addc_u32 s49, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v18, s56
-; GCN-HSA-NEXT: v_mov_b32_e32 v20, s36
-; GCN-HSA-NEXT: s_add_u32 s36, s16, 0xb0
-; GCN-HSA-NEXT: v_mov_b32_e32 v19, s57
-; GCN-HSA-NEXT: v_mov_b32_e32 v21, s37
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s76
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s71
-; GCN-HSA-NEXT: s_addc_u32 s37, s17, 0
+; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x100000
+; GCN-HSA-NEXT: s_add_u32 s54, s16, 0xf0
+; GCN-HSA-NEXT: s_addc_u32 s55, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, s46
+; GCN-HSA-NEXT: s_add_u32 s46, s16, 0xe0
+; GCN-HSA-NEXT: v_mov_b32_e32 v7, s47
+; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v24, s46
+; GCN-HSA-NEXT: v_mov_b32_e32 v25, s47
+; GCN-HSA-NEXT: s_add_u32 s46, s16, 0xd0
+; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v26, s46
+; GCN-HSA-NEXT: v_mov_b32_e32 v27, s47
+; GCN-HSA-NEXT: s_add_u32 s46, s16, 0xc0
+; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v28, s46
+; GCN-HSA-NEXT: v_mov_b32_e32 v18, s54
+; GCN-HSA-NEXT: v_mov_b32_e32 v29, s47
+; GCN-HSA-NEXT: s_add_u32 s46, s16, 0xb0
+; GCN-HSA-NEXT: v_mov_b32_e32 v19, s55
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s65
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s63
+; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s74
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s30
@@ -8586,7 +8556,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31
; GCN-HSA-NEXT: s_addc_u32 s31, s17, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7]
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, s50
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s48
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s26
; GCN-HSA-NEXT: s_add_u32 s26, s16, 0x90
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s27
@@ -8594,38 +8564,38 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-HSA-NEXT: v_mov_b32_e32 v24, s26
; GCN-HSA-NEXT: v_mov_b32_e32 v25, s27
; GCN-HSA-NEXT: s_add_u32 s26, s16, 0x80
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s51
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s67
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s63
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s49
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s61
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s59
; GCN-HSA-NEXT: s_addc_u32 s27, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v28, s48
; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11]
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s72
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s24
; GCN-HSA-NEXT: s_add_u32 s24, s16, 0x70
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s73
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, s52
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, s53
-; GCN-HSA-NEXT: v_mov_b32_e32 v29, s49
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, s50
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, s51
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s25
; GCN-HSA-NEXT: s_addc_u32 s25, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v30, s36
+; GCN-HSA-NEXT: v_mov_b32_e32 v30, s46
; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15]
-; GCN-HSA-NEXT: v_mov_b32_e32 v16, s54
+; GCN-HSA-NEXT: v_mov_b32_e32 v16, s52
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s14
; GCN-HSA-NEXT: s_add_u32 s14, s16, 0x60
-; GCN-HSA-NEXT: v_mov_b32_e32 v17, s55
-; GCN-HSA-NEXT: v_mov_b32_e32 v31, s37
-; GCN-HSA-NEXT: v_mov_b32_e32 v18, s59
-; GCN-HSA-NEXT: v_mov_b32_e32 v19, s47
+; GCN-HSA-NEXT: v_mov_b32_e32 v17, s53
+; GCN-HSA-NEXT: v_mov_b32_e32 v31, s47
+; GCN-HSA-NEXT: v_mov_b32_e32 v18, s57
+; GCN-HSA-NEXT: v_mov_b32_e32 v19, s45
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s30
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s15
; GCN-HSA-NEXT: s_addc_u32 s15, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v20, s70
+; GCN-HSA-NEXT: v_mov_b32_e32 v21, s71
; GCN-HSA-NEXT: v_mov_b32_e32 v22, s34
; GCN-HSA-NEXT: v_mov_b32_e32 v23, s35
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s31
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s46
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s45
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s44
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s43
; GCN-HSA-NEXT: v_mov_b32_e32 v26, s26
; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19]
; GCN-HSA-NEXT: v_mov_b32_e32 v27, s27
@@ -8635,8 +8605,8 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-HSA-NEXT: v_mov_b32_e32 v7, s29
; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23]
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s22
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s44
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s43
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s42
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s41
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s23
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s25
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s14
@@ -8649,8 +8619,8 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13
; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s42
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s41
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s40
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s13
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_nop 0
@@ -8669,8 +8639,8 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9
; GCN-HSA-NEXT: s_addc_u32 s9, s17, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s40
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s38
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s37
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_nop 0
@@ -8689,7 +8659,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s38
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s36
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index 125105bb92e7b..6f63384be90fd 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -6395,43 +6395,40 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out
; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000
-; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0
; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1
-; GFX6-NOHSA-NEXT: s_mov_b32 s9, s7
-; GFX6-NOHSA-NEXT: s_mov_b32 s11, s7
; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NOHSA-NEXT: s_lshr_b32 s6, s5, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s4, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s4, 24
-; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s4, 8
-; GFX6-NOHSA-NEXT: s_lshr_b32 s8, s5, 8
-; GFX6-NOHSA-NEXT: s_mov_b32 s10, s5
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
+; GFX6-NOHSA-NEXT: s_lshr_b32 s8, s4, 16
+; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s4, 24
+; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s4, 8
+; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s5, 8
+; GFX6-NOHSA-NEXT: s_mov_b32 s16, s5
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[4:5], 0x80000
-; GFX6-NOHSA-NEXT: s_ashr_i32 s17, s5, 31
+; GFX6-NOHSA-NEXT: s_ashr_i32 s15, s5, 31
; GFX6-NOHSA-NEXT: s_ashr_i32 s20, s5, 24
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[8:9], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[16:17], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s17
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s15
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s18
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s19
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s10
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s11
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s16
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s17
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s6
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s12
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s13
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s14
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s15
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s8
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s9
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s10
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s11
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s8
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s9
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s12
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s13
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s4
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s5
@@ -6442,28 +6439,25 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 s5, 0
; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX7-HSA-NEXT: s_mov_b32 s7, s5
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX7-HSA-NEXT: s_mov_b32 s9, s5
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_lshr_b32 s4, s3, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s10, s2, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s12, s2, 24
-; GFX7-HSA-NEXT: s_lshr_b32 s14, s2, 8
-; GFX7-HSA-NEXT: s_lshr_b32 s6, s3, 8
-; GFX7-HSA-NEXT: s_mov_b32 s8, s3
-; GFX7-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
+; GFX7-HSA-NEXT: s_lshr_b32 s6, s2, 16
+; GFX7-HSA-NEXT: s_lshr_b32 s8, s2, 24
+; GFX7-HSA-NEXT: s_lshr_b32 s10, s2, 8
+; GFX7-HSA-NEXT: s_lshr_b32 s12, s3, 8
+; GFX7-HSA-NEXT: s_mov_b32 s14, s3
+; GFX7-HSA-NEXT: s_ashr_i32 s5, s3, 31
; GFX7-HSA-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
-; GFX7-HSA-NEXT: s_ashr_i32 s18, s3, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s19, s3, 24
+; GFX7-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
+; GFX7-HSA-NEXT: s_ashr_i32 s18, s3, 24
; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48
@@ -6472,15 +6466,15 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s19
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s18
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s5
; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s11
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s12
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s13
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s8
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s9
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
@@ -6488,15 +6482,15 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out
; GFX7-HSA-NEXT: s_add_u32 s0, s0, 32
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11
; GFX7-HSA-NEXT: s_addc_u32 s1, s1, 0
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s12
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s13
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_endpgm
@@ -6504,25 +6498,23 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out
; GFX8-NOHSA-LABEL: constant_sextload_v8i8_to_v8i64:
; GFX8-NOHSA: ; %bb.0:
; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8-NOHSA-NEXT: s_mov_b32 s5, 0
-; GFX8-NOHSA-NEXT: s_mov_b32 s7, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NOHSA-NEXT: s_lshr_b32 s4, s3, 16
-; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s2, 16
-; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s2, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s2, 8
-; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s3, 8
-; GFX8-NOHSA-NEXT: s_mov_b32 s6, s3
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
+; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s2, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s2, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s2, 8
+; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s3, 8
+; GFX8-NOHSA-NEXT: s_mov_b32 s14, s3
+; GFX8-NOHSA-NEXT: s_ashr_i32 s5, s3, 31
; GFX8-NOHSA-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x80000
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
-; GFX8-NOHSA-NEXT: s_ashr_i32 s18, s3, 31
-; GFX8-NOHSA-NEXT: s_ashr_i32 s19, s3, 24
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
+; GFX8-NOHSA-NEXT: s_ashr_i32 s18, s3, 24
; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48
@@ -6531,15 +6523,15 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s19
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s18
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s18
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s11
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s8
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s9
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
@@ -6547,15 +6539,15 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out
; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 32
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s16
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s17
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s13
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s11
; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s15
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s14
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s15
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s13
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
@@ -6619,36 +6611,33 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out
; GFX12-LABEL: constant_sextload_v8i8_to_v8i64:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT: s_mov_b32 s5, 0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s7, s5
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshr_b32 s4, s3, 16
-; GFX12-NEXT: s_lshr_b32 s8, s2, 16
-; GFX12-NEXT: s_lshr_b32 s10, s2, 24
-; GFX12-NEXT: s_lshr_b32 s12, s2, 8
-; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
+; GFX12-NEXT: s_lshr_b32 s6, s2, 16
+; GFX12-NEXT: s_lshr_b32 s8, s2, 24
+; GFX12-NEXT: s_lshr_b32 s10, s2, 8
; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
-; GFX12-NEXT: s_lshr_b32 s14, s3, 8
-; GFX12-NEXT: s_mov_b32 s6, s3
+; GFX12-NEXT: s_lshr_b32 s12, s3, 8
+; GFX12-NEXT: s_mov_b32 s14, s3
; GFX12-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x80000
; GFX12-NEXT: s_ashr_i32 s15, s3, 31
; GFX12-NEXT: s_ashr_i32 s18, s3, 24
-; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v3, s15
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v9, s9
-; GFX12-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v11, s11
-; GFX12-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v7, s13
-; GFX12-NEXT: s_bfe_i64 s[2:3], s[6:7], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[6:7], s[14:15], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v9, s7
+; GFX12-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v11, s9
+; GFX12-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v7, s11
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[14:15], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
; GFX12-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v5, s17
; GFX12-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v6, s12 :: v_dual_mov_b32 v13, s3
-; GFX12-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v15, s7
-; GFX12-NEXT: v_mov_b32_e32 v14, s6
+; GFX12-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v13, s3
+; GFX12-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v15, s13
+; GFX12-NEXT: v_mov_b32_e32 v14, s12
; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1]
@@ -7042,78 +7031,72 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000
-; GFX6-NOHSA-NEXT: s_mov_b32 s13, 0
; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1
-; GFX6-NOHSA-NEXT: s_mov_b32 s11, s13
-; GFX6-NOHSA-NEXT: s_mov_b32 s15, s13
-; GFX6-NOHSA-NEXT: s_mov_b32 s17, s13
-; GFX6-NOHSA-NEXT: s_mov_b32 s19, s13
-; GFX6-NOHSA-NEXT: s_mov_b32 s9, s13
; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s6, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s6, 24
-; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s6, 8
-; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s4, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s4, 24
-; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s4, 8
-; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s7, 16
+; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s6, 16
+; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s6, 24
+; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s6, 8
+; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s4, 16
+; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s4, 24
+; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s4, 8
+; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s7, 16
; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s7, 8
-; GFX6-NOHSA-NEXT: s_mov_b32 s14, s7
-; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s5, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s5, 8
+; GFX6-NOHSA-NEXT: s_mov_b32 s26, s7
+; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s5, 16
+; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s5, 8
; GFX6-NOHSA-NEXT: s_mov_b32 s8, s5
; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[4:5], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[36:37], s[6:7], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
; GFX6-NOHSA-NEXT: s_ashr_i32 s29, s5, 31
; GFX6-NOHSA-NEXT: s_ashr_i32 s31, s5, 24
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
; GFX6-NOHSA-NEXT: s_ashr_i32 s33, s7, 31
; GFX6-NOHSA-NEXT: s_ashr_i32 s38, s7, 24
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[18:19], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[16:17], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[30:31], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[28:29], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s36
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s37
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s34
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s35
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s38
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s33
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s14
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s15
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s20
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s21
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s22
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s23
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s26
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s27
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s12
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s13
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s14
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s15
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s31
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s29
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s24
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s25
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s16
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s17
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s26
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s27
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s18
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s19
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s18
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s19
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s20
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s21
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s8
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s9
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s16
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s17
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s22
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s23
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s12
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s13
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s24
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s25
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s10
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s11
@@ -7131,15 +7114,10 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 s19, 0
; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX7-HSA-NEXT: s_mov_b32 s21, s19
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX7-HSA-NEXT: s_mov_b32 s23, s19
-; GFX7-HSA-NEXT: s_mov_b32 s25, s19
-; GFX7-HSA-NEXT: s_mov_b32 s27, s19
-; GFX7-HSA-NEXT: s_mov_b32 s29, s19
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_lshr_b32 s2, s6, 16
; GFX7-HSA-NEXT: s_lshr_b32 s8, s6, 24
@@ -7150,6 +7128,8 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX7-HSA-NEXT: s_lshr_b32 s16, s4, 8
; GFX7-HSA-NEXT: s_lshr_b32 s18, s7, 16
; GFX7-HSA-NEXT: s_lshr_b32 s20, s7, 8
+; GFX7-HSA-NEXT: s_ashr_i32 s27, s5, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s29, s5, 24
; GFX7-HSA-NEXT: s_mov_b32 s22, s7
; GFX7-HSA-NEXT: s_lshr_b32 s24, s5, 16
; GFX7-HSA-NEXT: s_lshr_b32 s26, s5, 8
@@ -7157,11 +7137,8 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[8:9], 0x80000
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_ashr_i32 s33, s5, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s34, s5, 24
-; GFX7-HSA-NEXT: s_ashr_i32 s35, s7, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s36, s7, 24
+; GFX7-HSA-NEXT: s_ashr_i32 s33, s7, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s34, s7, 24
; GFX7-HSA-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2
@@ -7211,8 +7188,8 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s36
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s34
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7
; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x60
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
@@ -7230,8 +7207,8 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX7-HSA-NEXT: s_add_u32 s0, s0, 32
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s34
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s29
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s27
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7
; GFX7-HSA-NEXT: s_addc_u32 s1, s1, 0
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
@@ -7247,38 +7224,34 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX8-NOHSA-LABEL: constant_sextload_v16i8_to_v16i64:
; GFX8-NOHSA: ; %bb.0:
; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8-NOHSA-NEXT: s_mov_b32 s11, 0
-; GFX8-NOHSA-NEXT: s_mov_b32 s27, s11
-; GFX8-NOHSA-NEXT: s_mov_b32 s29, s11
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: s_mov_b32 s3, s11
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s6, 16
-; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s6, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s6, 8
-; GFX8-NOHSA-NEXT: s_lshr_b32 s18, s4, 16
-; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s4, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s22, s4, 8
-; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s7, 16
-; GFX8-NOHSA-NEXT: s_lshr_b32 s24, s7, 8
-; GFX8-NOHSA-NEXT: s_mov_b32 s26, s7
-; GFX8-NOHSA-NEXT: s_lshr_b32 s28, s5, 16
-; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s5, 8
-; GFX8-NOHSA-NEXT: s_mov_b32 s2, s5
-; GFX8-NOHSA-NEXT: s_ashr_i32 s23, s5, 31
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000
+; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s10, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s10, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s10, 8
+; GFX8-NOHSA-NEXT: s_lshr_b32 s18, s8, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s8, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s22, s8, 8
+; GFX8-NOHSA-NEXT: s_lshr_b32 s24, s11, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s26, s11, 8
+; GFX8-NOHSA-NEXT: s_mov_b32 s28, s11
+; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s9, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s4, s9, 8
+; GFX8-NOHSA-NEXT: s_mov_b32 s2, s9
+; GFX8-NOHSA-NEXT: s_ashr_i32 s25, s9, 31
+; GFX8-NOHSA-NEXT: s_ashr_i32 s29, s9, 24
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[30:31], s[8:9], 0x80000
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[34:35], s[10:11], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
-; GFX8-NOHSA-NEXT: s_ashr_i32 s33, s5, 24
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[28:29], 0x80000
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[8:9], s[28:29], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
-; GFX8-NOHSA-NEXT: s_ashr_i32 s28, s7, 31
-; GFX8-NOHSA-NEXT: s_ashr_i32 s29, s7, 24
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[10:11], 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[10:11], s[22:23], 0x80000
+; GFX8-NOHSA-NEXT: s_ashr_i32 s28, s11, 31
+; GFX8-NOHSA-NEXT: s_ashr_i32 s33, s11, 24
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[10:11], s[24:25], 0x80000
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
@@ -7314,46 +7287,47 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s30
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s31
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s11
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s22
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s23
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 0x70
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s11
+; GFX8-NOHSA-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s33
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s28
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: s_nop 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
+; GFX8-NOHSA-NEXT: s_add_u32 s8, s0, 0x60
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9
+; GFX8-NOHSA-NEXT: s_addc_u32 s9, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s26
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s27
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: s_nop 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x70
+; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 48
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 32
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s29
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s28
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
-; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x60
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s26
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s27
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s25
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: s_nop 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 48
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 32
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s33
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s23
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s9
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
@@ -7462,69 +7436,63 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
-; GFX12-NEXT: s_mov_b32 s3, 0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s9, s3
-; GFX12-NEXT: s_mov_b32 s13, s3
-; GFX12-NEXT: s_mov_b32 s11, s3
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s14, s6, 16
-; GFX12-NEXT: s_lshr_b32 s16, s6, 24
-; GFX12-NEXT: s_lshr_b32 s18, s6, 8
+; GFX12-NEXT: s_lshr_b32 s2, s6, 16
+; GFX12-NEXT: s_lshr_b32 s8, s6, 24
+; GFX12-NEXT: s_lshr_b32 s10, s6, 8
+; GFX12-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
+; GFX12-NEXT: s_lshr_b32 s12, s4, 16
+; GFX12-NEXT: s_lshr_b32 s14, s4, 24
; GFX12-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
-; GFX12-NEXT: s_lshr_b32 s20, s4, 16
-; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s35
-; GFX12-NEXT: s_lshr_b32 s22, s4, 24
-; GFX12-NEXT: v_dual_mov_b32 v0, s34 :: v_dual_mov_b32 v5, s15
-; GFX12-NEXT: v_dual_mov_b32 v4, s14 :: v_dual_mov_b32 v7, s17
-; GFX12-NEXT: v_dual_mov_b32 v6, s16 :: v_dual_mov_b32 v3, s19
-; GFX12-NEXT: s_lshr_b32 s24, s4, 8
-; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v9, s21
-; GFX12-NEXT: s_lshr_b32 s2, s7, 16
-; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
-; GFX12-NEXT: s_lshr_b32 s26, s7, 8
-; GFX12-NEXT: s_mov_b32 s8, s7
-; GFX12-NEXT: s_mov_b32 s12, s5
-; GFX12-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x80000
-; GFX12-NEXT: s_ashr_i32 s27, s5, 31
-; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
-; GFX12-NEXT: s_lshr_b32 s10, s5, 16
+; GFX12-NEXT: s_lshr_b32 s16, s4, 8
+; GFX12-NEXT: v_dual_mov_b32 v4, s30 :: v_dual_mov_b32 v9, s3
+; GFX12-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v11, s9
+; GFX12-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v3, s11
+; GFX12-NEXT: s_lshr_b32 s18, s7, 16
+; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v0, s34 :: v_dual_mov_b32 v5, s31
+; GFX12-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v13, s13
+; GFX12-NEXT: s_lshr_b32 s20, s7, 8
+; GFX12-NEXT: s_mov_b32 s22, s7
+; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
+; GFX12-NEXT: s_lshr_b32 s24, s5, 16
; GFX12-NEXT: s_ashr_i32 s33, s7, 31
; GFX12-NEXT: s_ashr_i32 s36, s7, 24
-; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
-; GFX12-NEXT: v_mov_b32_e32 v10, s22
-; GFX12-NEXT: s_lshr_b32 s28, s5, 8
+; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v15, s15
+; GFX12-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v7, s17
+; GFX12-NEXT: s_lshr_b32 s26, s5, 8
+; GFX12-NEXT: s_mov_b32 s28, s5
+; GFX12-NEXT: s_ashr_i32 s27, s5, 31
; GFX12-NEXT: s_ashr_i32 s29, s5, 24
-; GFX12-NEXT: s_bfe_i64 s[4:5], s[12:13], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[12:13], s[26:27], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
+; GFX12-NEXT: v_mov_b32_e32 v6, s16
+; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:80
+; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:80
; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:64
-; GFX12-NEXT: v_dual_mov_b32 v0, s30 :: v_dual_mov_b32 v3, s25
-; GFX12-NEXT: v_dual_mov_b32 v1, s31 :: v_dual_mov_b32 v2, s24
-; GFX12-NEXT: v_mov_b32_e32 v5, s3
-; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v7, s33
-; GFX12-NEXT: v_dual_mov_b32 v6, s36 :: v_dual_mov_b32 v13, s9
-; GFX12-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v12, s8 :: v_dual_mov_b32 v15, s13
-; GFX12-NEXT: v_dual_mov_b32 v14, s12 :: v_dual_mov_b32 v17, s11
-; GFX12-NEXT: v_dual_mov_b32 v16, s10 :: v_dual_mov_b32 v19, s27
+; GFX12-NEXT: v_dual_mov_b32 v0, s18 :: v_dual_mov_b32 v3, s33
+; GFX12-NEXT: v_dual_mov_b32 v1, s19 :: v_dual_mov_b32 v2, s36
+; GFX12-NEXT: v_mov_b32_e32 v9, s23
+; GFX12-NEXT: s_bfe_i64 s[4:5], s[28:29], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[6:7], s[26:27], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v8, s22 :: v_dual_mov_b32 v11, s21
+; GFX12-NEXT: v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v17, s25
+; GFX12-NEXT: v_dual_mov_b32 v16, s24 :: v_dual_mov_b32 v19, s27
; GFX12-NEXT: v_dual_mov_b32 v18, s29 :: v_dual_mov_b32 v21, s5
; GFX12-NEXT: v_dual_mov_b32 v20, s4 :: v_dual_mov_b32 v23, s7
; GFX12-NEXT: v_mov_b32_e32 v22, s6
; GFX12-NEXT: s_clause 0x5
-; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1]
-; GFX12-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:96
+; GFX12-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v24, v[4:7], s[0:1]
+; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:112
+; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:96
; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:48
; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1] offset:32
; GFX12-NEXT: s_endpgm
@@ -8237,42 +8205,40 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
-; GFX6-NOHSA-NEXT: s_mov_b32 s11, 0
-; GFX6-NOHSA-NEXT: s_mov_b32 s29, s11
; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s6, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s6, 24
+; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s6, 16
+; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s6, 24
; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s6, 8
; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s4, 16
; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s4, 24
; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s4, 8
-; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s2, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s2, 24
-; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s2, 8
-; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s0, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s7, 16
-; GFX6-NOHSA-NEXT: s_mov_b32 s28, s7
-; GFX6-NOHSA-NEXT: s_ashr_i32 s13, s1, 31
-; GFX6-NOHSA-NEXT: s_ashr_i32 s15, s1, 24
-; GFX6-NOHSA-NEXT: s_ashr_i32 s17, s3, 31
-; GFX6-NOHSA-NEXT: s_ashr_i32 s19, s3, 24
-; GFX6-NOHSA-NEXT: s_ashr_i32 s21, s5, 31
-; GFX6-NOHSA-NEXT: s_ashr_i32 s23, s5, 24
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[48:49], s[28:29], 0x80000
-; GFX6-NOHSA-NEXT: s_ashr_i32 s25, s7, 31
-; GFX6-NOHSA-NEXT: s_ashr_i32 s27, s7, 24
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[52:53], s[34:35], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[54:55], s[30:31], 0x80000
-; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s0, 24
-; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s0, 8
+; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s2, 16
+; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s2, 24
+; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s2, 8
+; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s0, 16
+; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s0, 24
+; GFX6-NOHSA-NEXT: s_mov_b32 s34, s7
+; GFX6-NOHSA-NEXT: s_ashr_i32 s11, s1, 31
+; GFX6-NOHSA-NEXT: s_ashr_i32 s13, s1, 24
+; GFX6-NOHSA-NEXT: s_ashr_i32 s15, s3, 31
+; GFX6-NOHSA-NEXT: s_ashr_i32 s17, s3, 24
+; GFX6-NOHSA-NEXT: s_ashr_i32 s33, s5, 31
+; GFX6-NOHSA-NEXT: s_ashr_i32 s49, s5, 24
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[34:35], 0x80000
+; GFX6-NOHSA-NEXT: s_ashr_i32 s19, s7, 31
+; GFX6-NOHSA-NEXT: s_ashr_i32 s21, s7, 24
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[52:53], s[30:31], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[54:55], s[28:29], 0x80000
+; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s0, 8
+; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s7, 16
; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s7, 8
; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s5, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s38, s5, 8
-; GFX6-NOHSA-NEXT: s_mov_b32 s44, s5
-; GFX6-NOHSA-NEXT: s_lshr_b32 s40, s3, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s42, s3, 8
+; GFX6-NOHSA-NEXT: s_lshr_b32 s40, s5, 8
+; GFX6-NOHSA-NEXT: s_mov_b32 s46, s5
+; GFX6-NOHSA-NEXT: s_lshr_b32 s42, s3, 16
+; GFX6-NOHSA-NEXT: s_lshr_b32 s44, s3, 8
; GFX6-NOHSA-NEXT: s_mov_b32 s50, s3
-; GFX6-NOHSA-NEXT: s_lshr_b32 s46, s1, 16
+; GFX6-NOHSA-NEXT: s_lshr_b32 s48, s1, 16
; GFX6-NOHSA-NEXT: s_bfe_i64 s[56:57], s[0:1], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[58:59], s[4:5], 0x80000
@@ -8289,107 +8255,97 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s3
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s56
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s57
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s48
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s49
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s38
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s39
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s54
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s55
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s52
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s53
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s27
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s25
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s21
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s19
; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[26:27], 0x80000
; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v24, s8
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:208
; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[20:21], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[48:49], s[22:23], 0x80000
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s26
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s27
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s20
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s21
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v25, s9
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v26, s48
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v27, s49
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v26, s22
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v27, s23
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s24
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s25
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:144
-; GFX6-NOHSA-NEXT: s_mov_b32 s35, s11
-; GFX6-NOHSA-NEXT: s_mov_b32 s37, s11
-; GFX6-NOHSA-NEXT: s_mov_b32 s39, s11
-; GFX6-NOHSA-NEXT: s_mov_b32 s45, s11
-; GFX6-NOHSA-NEXT: s_mov_b32 s41, s11
-; GFX6-NOHSA-NEXT: s_mov_b32 s43, s11
-; GFX6-NOHSA-NEXT: s_mov_b32 s51, s11
-; GFX6-NOHSA-NEXT: s_mov_b32 s47, s11
-; GFX6-NOHSA-NEXT: s_mov_b32 s5, s11
-; GFX6-NOHSA-NEXT: s_mov_b32 s7, s11
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[50:51], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[6:7], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[50:51], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[46:47], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[46:47], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[42:43], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[48:49], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[44:45], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[42:43], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[40:41], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[42:43], s[18:19], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[46:47], s[16:17], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[48:49], s[14:15], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[50:51], s[12:13], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[40:41], s[16:17], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[42:43], s[14:15], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[44:45], s[12:13], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[46:47], s[10:11], 0x80000
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:128
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(2)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s50
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s51
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s48
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s49
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s46
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s47
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s44
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s45
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s23
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s21
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s46
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s47
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s49
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s33
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s42
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s43
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s42
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s43
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s28
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s29
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s40
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s41
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s18
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s19
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s44
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s45
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s30
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s31
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s24
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s25
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s28
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s29
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s19
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s17
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s10
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s11
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s17
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s15
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s30
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s31
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:240
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s26
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s27
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s22
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s23
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s34
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s35
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:224
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(2)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s15
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s13
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s13
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s11
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s36
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s37
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s24
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s25
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s20
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s21
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s38
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s39
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:160
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s40
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s41
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s26
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s27
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s8
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s9
@@ -8408,177 +8364,165 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX7-HSA-NEXT: s_mov_b32 s27, 0
-; GFX7-HSA-NEXT: s_mov_b32 s23, s27
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
-; GFX7-HSA-NEXT: s_mov_b32 s25, s27
-; GFX7-HSA-NEXT: s_mov_b32 s21, s27
-; GFX7-HSA-NEXT: s_mov_b32 s15, s27
-; GFX7-HSA-NEXT: s_mov_b32 s17, s27
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_lshr_b32 s12, s6, 16
; GFX7-HSA-NEXT: s_lshr_b32 s10, s6, 24
-; GFX7-HSA-NEXT: s_ashr_i32 s29, s1, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s33, s1, 24
-; GFX7-HSA-NEXT: s_lshr_b32 s40, s0, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s38, s0, 24
-; GFX7-HSA-NEXT: s_lshr_b32 s30, s0, 8
-; GFX7-HSA-NEXT: s_lshr_b32 s66, s1, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s68, s1, 8
-; GFX7-HSA-NEXT: s_mov_b32 s70, s1
-; GFX7-HSA-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x80000
+; GFX7-HSA-NEXT: s_ashr_i32 s33, s1, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s37, s1, 24
+; GFX7-HSA-NEXT: s_lshr_b32 s34, s0, 16
+; GFX7-HSA-NEXT: s_lshr_b32 s30, s0, 24
+; GFX7-HSA-NEXT: s_lshr_b32 s28, s0, 8
+; GFX7-HSA-NEXT: s_lshr_b32 s64, s1, 16
+; GFX7-HSA-NEXT: s_lshr_b32 s66, s1, 8
+; GFX7-HSA-NEXT: s_mov_b32 s68, s1
+; GFX7-HSA-NEXT: s_bfe_i64 s[22:23], s[0:1], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[0:1], s[12:13], 0x80000
-; GFX7-HSA-NEXT: s_lshr_b32 s28, s6, 8
-; GFX7-HSA-NEXT: s_lshr_b32 s34, s4, 16
-; GFX7-HSA-NEXT: s_ashr_i32 s35, s3, 31
-; GFX7-HSA-NEXT: s_lshr_b32 s52, s4, 24
-; GFX7-HSA-NEXT: s_lshr_b32 s54, s4, 8
-; GFX7-HSA-NEXT: s_lshr_b32 s56, s2, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s58, s2, 24
-; GFX7-HSA-NEXT: s_lshr_b32 s44, s2, 8
+; GFX7-HSA-NEXT: s_lshr_b32 s36, s6, 8
+; GFX7-HSA-NEXT: s_lshr_b32 s40, s4, 16
+; GFX7-HSA-NEXT: s_ashr_i32 s41, s3, 31
+; GFX7-HSA-NEXT: s_lshr_b32 s50, s4, 24
+; GFX7-HSA-NEXT: s_lshr_b32 s52, s4, 8
+; GFX7-HSA-NEXT: s_lshr_b32 s54, s2, 16
+; GFX7-HSA-NEXT: s_lshr_b32 s56, s2, 24
+; GFX7-HSA-NEXT: s_lshr_b32 s42, s2, 8
; GFX7-HSA-NEXT: s_lshr_b32 s26, s7, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s22, s7, 8
+; GFX7-HSA-NEXT: s_lshr_b32 s20, s7, 8
; GFX7-HSA-NEXT: s_mov_b32 s24, s7
-; GFX7-HSA-NEXT: s_lshr_b32 s20, s5, 16
+; GFX7-HSA-NEXT: s_lshr_b32 s18, s5, 16
; GFX7-HSA-NEXT: s_lshr_b32 s14, s5, 8
; GFX7-HSA-NEXT: s_mov_b32 s16, s5
-; GFX7-HSA-NEXT: s_lshr_b32 s60, s3, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s62, s3, 8
-; GFX7-HSA-NEXT: s_mov_b32 s64, s3
+; GFX7-HSA-NEXT: s_lshr_b32 s58, s3, 16
+; GFX7-HSA-NEXT: s_lshr_b32 s60, s3, 8
+; GFX7-HSA-NEXT: s_mov_b32 s62, s3
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1
; GFX7-HSA-NEXT: s_bfe_i64 s[0:1], s[10:11], 0x80000
-; GFX7-HSA-NEXT: s_mov_b32 s61, s27
-; GFX7-HSA-NEXT: s_mov_b32 s63, s27
-; GFX7-HSA-NEXT: s_mov_b32 s65, s27
-; GFX7-HSA-NEXT: s_mov_b32 s67, s27
-; GFX7-HSA-NEXT: s_mov_b32 s69, s27
-; GFX7-HSA-NEXT: s_mov_b32 s71, s27
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_ashr_i32 s46, s3, 24
-; GFX7-HSA-NEXT: s_ashr_i32 s47, s5, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s48, s5, 24
-; GFX7-HSA-NEXT: s_ashr_i32 s49, s7, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s50, s7, 24
-; GFX7-HSA-NEXT: s_bfe_i64 s[36:37], s[2:3], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[42:43], s[4:5], 0x80000
+; GFX7-HSA-NEXT: s_ashr_i32 s44, s3, 24
+; GFX7-HSA-NEXT: s_ashr_i32 s45, s5, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s46, s5, 24
+; GFX7-HSA-NEXT: s_ashr_i32 s47, s7, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s48, s7, 24
+; GFX7-HSA-NEXT: s_bfe_i64 s[38:39], s[2:3], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[70:71], s[4:5], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[72:73], s[6:7], 0x80000
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s0
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[70:71], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[0:1], s[68:69], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[66:67], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[64:65], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[62:63], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[60:61], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[68:69], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[0:1], s[66:67], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[64:65], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[62:63], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[60:61], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[58:59], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[60:61], s[34:35], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[62:63], s[28:29], 0x80000
-; GFX7-HSA-NEXT: s_add_u32 s64, s8, 0xd0
-; GFX7-HSA-NEXT: s_addc_u32 s65, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s62
-; GFX7-HSA-NEXT: s_add_u32 s62, s8, 0xc0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s63
+; GFX7-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[58:59], s[40:41], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[60:61], s[36:37], 0x80000
+; GFX7-HSA-NEXT: s_add_u32 s62, s8, 0xd0
; GFX7-HSA-NEXT: s_addc_u32 s63, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s52
-; GFX7-HSA-NEXT: s_add_u32 s52, s8, 0x90
-; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s53
-; GFX7-HSA-NEXT: s_addc_u32 s53, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s42
-; GFX7-HSA-NEXT: s_add_u32 s42, s8, 0x80
-; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s43
-; GFX7-HSA-NEXT: s_addc_u32 s43, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s36
-; GFX7-HSA-NEXT: s_add_u32 s36, s8, 0x50
-; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s37
-; GFX7-HSA-NEXT: s_addc_u32 s37, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s36
-; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s62
-; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s37
-; GFX7-HSA-NEXT: s_add_u32 s36, s8, 64
+; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s60
+; GFX7-HSA-NEXT: s_add_u32 s60, s8, 0xc0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s61
+; GFX7-HSA-NEXT: s_addc_u32 s61, s9, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s50
+; GFX7-HSA-NEXT: s_add_u32 s50, s8, 0x90
+; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s51
+; GFX7-HSA-NEXT: s_addc_u32 s51, s9, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s50
+; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s51
+; GFX7-HSA-NEXT: s_add_u32 s50, s8, 0x80
+; GFX7-HSA-NEXT: s_addc_u32 s51, s9, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s62
+; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s38
+; GFX7-HSA-NEXT: s_add_u32 s38, s8, 0x50
+; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s63
+; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s39
+; GFX7-HSA-NEXT: s_addc_u32 s39, s9, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s60
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s72
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34
+; GFX7-HSA-NEXT: s_add_u32 s34, s8, 64
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s73
-; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s63
-; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s52
-; GFX7-HSA-NEXT: s_addc_u32 s37, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s60
-; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s61
-; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s53
-; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s56
-; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s57
-; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s58
-; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s59
+; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s61
+; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s38
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s35
+; GFX7-HSA-NEXT: s_addc_u32 s35, s9, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s58
+; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s59
+; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s54
+; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s55
+; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s56
+; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s57
+; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s39
; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7]
; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s18
-; GFX7-HSA-NEXT: s_add_u32 s18, s8, 16
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s19
-; GFX7-HSA-NEXT: s_addc_u32 s19, s9, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s22
+; GFX7-HSA-NEXT: s_add_u32 s22, s8, 16
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s23
+; GFX7-HSA-NEXT: s_addc_u32 s23, s9, 0
; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s64
-; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s18
-; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s19
-; GFX7-HSA-NEXT: s_add_u32 s18, s8, 0xf0
-; GFX7-HSA-NEXT: s_addc_u32 s19, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s18
-; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s65
-; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s36
-; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s19
-; GFX7-HSA-NEXT: s_add_u32 s18, s8, 0xe0
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s44
-; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s45
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s40
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s41
-; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s37
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s38
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s39
-; GFX7-HSA-NEXT: s_addc_u32 s19, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s42
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[20:23]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s34
+; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s22
+; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s23
+; GFX7-HSA-NEXT: s_add_u32 s22, s8, 0xf0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s30
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s31
+; GFX7-HSA-NEXT: s_addc_u32 s23, s9, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s22
+; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s50
+; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s42
+; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s43
+; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s35
+; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s23
+; GFX7-HSA-NEXT: s_add_u32 s22, s8, 0xe0
; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s19
+; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s70
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8
-; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s54
-; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s55
-; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s43
-; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s30
-; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s31
-; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s18
+; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s71
+; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s52
+; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s53
+; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s51
+; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s28
+; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s29
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[20:23]
+; GFX7-HSA-NEXT: s_addc_u32 s23, s9, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s22
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9
-; GFX7-HSA-NEXT: s_add_u32 s18, s8, 0xb0
; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15]
; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s26
; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s27
-; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s50
-; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s49
+; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s48
+; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s47
; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s24
; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s25
-; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s22
-; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s23
+; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s23
+; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s20
+; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s21
; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11]
; GFX7-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18
+; GFX7-HSA-NEXT: s_add_u32 s18, s8, 0xb0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19
; GFX7-HSA-NEXT: s_addc_u32 s19, s9, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s18
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s20
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s48
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s47
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s46
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s45
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s19
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_nop 0
@@ -8597,8 +8541,8 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13
; GFX7-HSA-NEXT: s_addc_u32 s13, s9, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s46
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s44
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s41
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_nop 0
@@ -8617,8 +8561,8 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5
; GFX7-HSA-NEXT: s_addc_u32 s5, s9, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s33
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s29
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s37
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_nop 0
@@ -8636,16 +8580,8 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX8-NOHSA-LABEL: constant_sextload_v32i8_to_v32i64:
; GFX8-NOHSA: ; %bb.0:
; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
-; GFX8-NOHSA-NEXT: s_mov_b32 s63, 0
-; GFX8-NOHSA-NEXT: s_mov_b32 s23, s63
-; GFX8-NOHSA-NEXT: s_mov_b32 s65, s63
-; GFX8-NOHSA-NEXT: s_mov_b32 s19, s63
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
-; GFX8-NOHSA-NEXT: s_mov_b32 s67, s63
-; GFX8-NOHSA-NEXT: s_mov_b32 s15, s63
-; GFX8-NOHSA-NEXT: s_mov_b32 s45, s63
-; GFX8-NOHSA-NEXT: s_mov_b32 s11, s63
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NOHSA-NEXT: s_lshr_b32 s46, s6, 16
; GFX8-NOHSA-NEXT: s_lshr_b32 s48, s6, 24
@@ -8671,6 +8607,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: s_lshr_b32 s44, s1, 16
; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s1, 8
; GFX8-NOHSA-NEXT: s_mov_b32 s10, s1
+; GFX8-NOHSA-NEXT: s_ashr_i32 s63, s5, 24
; GFX8-NOHSA-NEXT: s_bfe_i64 s[26:27], s[0:1], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[30:31], s[2:3], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[38:39], s[4:5], 0x80000
@@ -8688,7 +8625,6 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
; GFX8-NOHSA-NEXT: s_ashr_i32 s45, s5, 31
-; GFX8-NOHSA-NEXT: s_ashr_i32 s66, s5, 24
; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[64:65], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
@@ -8804,7 +8740,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_addc_u32 s5, s9, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s66
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s63
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s45
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: s_add_u32 s4, s8, 0xa0
@@ -9049,60 +8985,50 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX12-LABEL: constant_sextload_v32i8_to_v32i64:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
-; GFX12-NEXT: s_mov_b32 s29, 0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s27, s29
-; GFX12-NEXT: s_mov_b32 s21, s29
-; GFX12-NEXT: s_mov_b32 s25, s29
-; GFX12-NEXT: s_mov_b32 s19, s29
-; GFX12-NEXT: s_mov_b32 s15, s29
-; GFX12-NEXT: s_mov_b32 s13, s29
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
-; GFX12-NEXT: s_mov_b32 s11, s29
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s30, s6, 16
-; GFX12-NEXT: s_lshr_b32 s34, s6, 24
-; GFX12-NEXT: s_lshr_b32 s36, s6, 8
-; GFX12-NEXT: s_lshr_b32 s38, s4, 16
-; GFX12-NEXT: s_lshr_b32 s40, s4, 24
-; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000
+; GFX12-NEXT: s_lshr_b32 s34, s6, 16
+; GFX12-NEXT: s_lshr_b32 s36, s6, 24
+; GFX12-NEXT: s_lshr_b32 s38, s6, 8
+; GFX12-NEXT: s_lshr_b32 s40, s4, 16
+; GFX12-NEXT: s_lshr_b32 s42, s4, 24
; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000
-; GFX12-NEXT: s_lshr_b32 s42, s4, 8
-; GFX12-NEXT: s_bfe_i64 s[66:67], s[6:7], 0x80000
; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s31
-; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000
+; GFX12-NEXT: s_lshr_b32 s44, s4, 8
+; GFX12-NEXT: s_bfe_i64 s[66:67], s[6:7], 0x80000
; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v0, s30 :: v_dual_mov_b32 v3, s35
-; GFX12-NEXT: v_dual_mov_b32 v2, s34 :: v_dual_mov_b32 v5, s67
-; GFX12-NEXT: s_lshr_b32 s44, s2, 16
+; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s35
+; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v0, s34 :: v_dual_mov_b32 v3, s37
+; GFX12-NEXT: v_dual_mov_b32 v2, s36 :: v_dual_mov_b32 v5, s67
+; GFX12-NEXT: s_lshr_b32 s28, s2, 16
; GFX12-NEXT: s_lshr_b32 s46, s2, 24
; GFX12-NEXT: s_bfe_i64 s[64:65], s[4:5], 0x80000
-; GFX12-NEXT: s_ashr_i32 s45, s1, 24
-; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v4, s66 :: v_dual_mov_b32 v7, s37
-; GFX12-NEXT: v_dual_mov_b32 v6, s36 :: v_dual_mov_b32 v9, s39
+; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v4, s66 :: v_dual_mov_b32 v7, s39
+; GFX12-NEXT: v_dual_mov_b32 v6, s38 :: v_dual_mov_b32 v9, s41
; GFX12-NEXT: s_lshr_b32 s48, s2, 8
-; GFX12-NEXT: v_dual_mov_b32 v8, s38 :: v_dual_mov_b32 v11, s41
-; GFX12-NEXT: v_dual_mov_b32 v10, s40 :: v_dual_mov_b32 v13, s65
+; GFX12-NEXT: v_dual_mov_b32 v8, s40 :: v_dual_mov_b32 v11, s43
+; GFX12-NEXT: v_dual_mov_b32 v10, s42 :: v_dual_mov_b32 v13, s65
; GFX12-NEXT: s_lshr_b32 s50, s0, 16
; GFX12-NEXT: s_lshr_b32 s52, s0, 24
; GFX12-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v12, s64 :: v_dual_mov_b32 v15, s43
-; GFX12-NEXT: s_bfe_i64 s[30:31], s[44:45], 0x80000
-; GFX12-NEXT: v_mov_b32_e32 v14, s42
+; GFX12-NEXT: v_dual_mov_b32 v12, s64 :: v_dual_mov_b32 v15, s45
+; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000
+; GFX12-NEXT: v_mov_b32_e32 v14, s44
; GFX12-NEXT: s_lshr_b32 s54, s0, 8
-; GFX12-NEXT: s_lshr_b32 s28, s7, 16
-; GFX12-NEXT: s_lshr_b32 s56, s7, 8
-; GFX12-NEXT: s_lshr_b32 s26, s5, 16
-; GFX12-NEXT: s_lshr_b32 s58, s5, 8
-; GFX12-NEXT: s_lshr_b32 s20, s3, 16
-; GFX12-NEXT: s_lshr_b32 s60, s1, 8
-; GFX12-NEXT: s_bfe_i64 s[62:63], s[2:3], 0x80000
-; GFX12-NEXT: s_ashr_i32 s55, s3, 31
-; GFX12-NEXT: s_ashr_i32 s57, s3, 24
+; GFX12-NEXT: s_bfe_i64 s[30:31], s[2:3], 0x80000
; GFX12-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000
+; GFX12-NEXT: s_lshr_b32 s56, s7, 16
+; GFX12-NEXT: s_lshr_b32 s58, s5, 16
+; GFX12-NEXT: s_lshr_b32 s60, s1, 8
+; GFX12-NEXT: s_mov_b32 s62, s1
+; GFX12-NEXT: s_ashr_i32 s57, s1, 24
+; GFX12-NEXT: s_ashr_i32 s59, s3, 31
+; GFX12-NEXT: s_ashr_i32 s61, s3, 24
+; GFX12-NEXT: s_ashr_i32 s63, s5, 31
; GFX12-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000
; GFX12-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000
; GFX12-NEXT: s_clause 0x3
@@ -9110,39 +9036,40 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:192
; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:144
; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:128
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_dual_mov_b32 v0, s30 :: v_dual_mov_b32 v3, s47
-; GFX12-NEXT: v_dual_mov_b32 v1, s31 :: v_dual_mov_b32 v2, s46
-; GFX12-NEXT: v_mov_b32_e32 v5, s63
+; GFX12-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v3, s47
+; GFX12-NEXT: v_dual_mov_b32 v1, s29 :: v_dual_mov_b32 v2, s46
+; GFX12-NEXT: v_mov_b32_e32 v5, s31
+; GFX12-NEXT: s_lshr_b32 s26, s7, 8
; GFX12-NEXT: s_mov_b32 s24, s7
-; GFX12-NEXT: s_mov_b32 s18, s5
-; GFX12-NEXT: s_lshr_b32 s16, s3, 8
-; GFX12-NEXT: s_mov_b32 s14, s3
-; GFX12-NEXT: s_lshr_b32 s12, s1, 16
-; GFX12-NEXT: s_mov_b32 s10, s1
; GFX12-NEXT: s_bfe_i64 s[22:23], s[0:1], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v4, s30 :: v_dual_mov_b32 v7, s49
+; GFX12-NEXT: v_dual_mov_b32 v6, s48 :: v_dual_mov_b32 v9, s51
+; GFX12-NEXT: s_lshr_b32 s18, s5, 8
+; GFX12-NEXT: s_mov_b32 s20, s5
+; GFX12-NEXT: s_lshr_b32 s16, s3, 16
+; GFX12-NEXT: s_lshr_b32 s12, s3, 8
+; GFX12-NEXT: s_mov_b32 s14, s3
+; GFX12-NEXT: s_lshr_b32 s10, s1, 16
; GFX12-NEXT: s_ashr_i32 s33, s1, 31
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[62:63], 0x80000
; GFX12-NEXT: s_bfe_i64 s[0:1], s[60:61], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[2:3], s[20:21], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[20:21], s[58:59], 0x80000
-; GFX12-NEXT: s_ashr_i32 s58, s5, 31
-; GFX12-NEXT: s_ashr_i32 s59, s5, 24
-; GFX12-NEXT: s_bfe_i64 s[4:5], s[26:27], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[26:27], s[56:57], 0x80000
-; GFX12-NEXT: s_ashr_i32 s56, s7, 31
-; GFX12-NEXT: s_ashr_i32 s60, s7, 24
-; GFX12-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[28:29], s[54:55], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v4, s62 :: v_dual_mov_b32 v7, s49
-; GFX12-NEXT: v_dual_mov_b32 v6, s48 :: v_dual_mov_b32 v9, s51
+; GFX12-NEXT: s_ashr_i32 s60, s5, 24
+; GFX12-NEXT: s_bfe_i64 s[4:5], s[58:59], 0x80000
+; GFX12-NEXT: s_ashr_i32 s58, s7, 31
+; GFX12-NEXT: s_ashr_i32 s62, s7, 24
+; GFX12-NEXT: s_bfe_i64 s[6:7], s[56:57], 0x80000
; GFX12-NEXT: v_dual_mov_b32 v8, s50 :: v_dual_mov_b32 v11, s53
; GFX12-NEXT: v_dual_mov_b32 v10, s52 :: v_dual_mov_b32 v13, s23
; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v15, s29
-; GFX12-NEXT: v_dual_mov_b32 v14, s28 :: v_dual_mov_b32 v17, s7
+; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v15, s55
+; GFX12-NEXT: v_dual_mov_b32 v14, s54 :: v_dual_mov_b32 v17, s7
+; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v16, s6 :: v_dual_mov_b32 v19, s56
-; GFX12-NEXT: v_dual_mov_b32 v18, s60 :: v_dual_mov_b32 v21, s25
+; GFX12-NEXT: v_dual_mov_b32 v16, s6 :: v_dual_mov_b32 v19, s58
+; GFX12-NEXT: v_dual_mov_b32 v18, s62 :: v_dual_mov_b32 v21, s25
+; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
; GFX12-NEXT: v_dual_mov_b32 v20, s24 :: v_dual_mov_b32 v23, s27
; GFX12-NEXT: v_mov_b32_e32 v22, s26
; GFX12-NEXT: s_clause 0x5
@@ -9152,22 +9079,21 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9]
; GFX12-NEXT: global_store_b128 v24, v[16:19], s[8:9] offset:240
; GFX12-NEXT: global_store_b128 v24, v[20:23], s[8:9] offset:224
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s58
-; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s59
-; GFX12-NEXT: v_mov_b32_e32 v5, s19
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s63
+; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s60
+; GFX12-NEXT: v_mov_b32_e32 v5, s21
; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v4, s18 :: v_dual_mov_b32 v7, s21
-; GFX12-NEXT: v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v9, s3
; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v11, s55
-; GFX12-NEXT: v_dual_mov_b32 v10, s57 :: v_dual_mov_b32 v13, s15
+; GFX12-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v7, s19
+; GFX12-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s17
; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v12, s14 :: v_dual_mov_b32 v15, s17
-; GFX12-NEXT: v_dual_mov_b32 v14, s16 :: v_dual_mov_b32 v17, s13
-; GFX12-NEXT: v_dual_mov_b32 v16, s12 :: v_dual_mov_b32 v19, s33
-; GFX12-NEXT: v_dual_mov_b32 v18, s45 :: v_dual_mov_b32 v21, s11
-; GFX12-NEXT: v_dual_mov_b32 v20, s10 :: v_dual_mov_b32 v23, s1
+; GFX12-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v11, s59
+; GFX12-NEXT: v_dual_mov_b32 v10, s61 :: v_dual_mov_b32 v13, s15
+; GFX12-NEXT: v_dual_mov_b32 v12, s14 :: v_dual_mov_b32 v15, s13
+; GFX12-NEXT: v_dual_mov_b32 v14, s12 :: v_dual_mov_b32 v17, s11
+; GFX12-NEXT: v_dual_mov_b32 v16, s10 :: v_dual_mov_b32 v19, s33
+; GFX12-NEXT: v_dual_mov_b32 v18, s57 :: v_dual_mov_b32 v21, s3
+; GFX12-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v23, s1
; GFX12-NEXT: v_mov_b32_e32 v22, s0
; GFX12-NEXT: s_clause 0x5
; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:176
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
index 7d0fa88e95792..cb17f01853221 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
@@ -6267,29 +6267,26 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out,
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s10, v1
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s4, v1
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s5, v0
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s10, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s4, s5, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s5, 24
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s5, 8
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s10, 8
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[10:11], 0x80000
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s4, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s5, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s5, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s5, 8
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s4, 8
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[4:5], 0x80000
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s15, s10, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s18, s10, 24
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[14:15], 0x80000
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s15, s4, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s18, s4, 24
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s18
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s15
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s16
@@ -6298,17 +6295,17 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out,
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s7
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:48
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s12
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s13
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s10
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s11
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s10
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s11
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s8
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s9
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s5
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:32
; GCN-NOHSA-SI-NEXT: s_endpgm
;
@@ -6322,48 +6319,47 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out,
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GCN-HSA-NEXT: s_mov_b32 s3, 0
-; GCN-HSA-NEXT: s_mov_b32 s5, s3
-; GCN-HSA-NEXT: s_mov_b32 s7, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
-; GCN-HSA-NEXT: v_readfirstlane_b32 s6, v1
-; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v0
-; GCN-HSA-NEXT: s_lshr_b32 s2, s6, 16
-; GCN-HSA-NEXT: s_lshr_b32 s8, s4, 16
-; GCN-HSA-NEXT: s_lshr_b32 s10, s4, 24
-; GCN-HSA-NEXT: s_lshr_b32 s12, s4, 8
-; GCN-HSA-NEXT: s_lshr_b32 s4, s6, 8
-; GCN-HSA-NEXT: s_ashr_i32 s13, s6, 31
+; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v1
+; GCN-HSA-NEXT: v_readfirstlane_b32 s3, v0
+; GCN-HSA-NEXT: s_lshr_b32 s4, s2, 16
+; GCN-HSA-NEXT: s_lshr_b32 s6, s3, 16
+; GCN-HSA-NEXT: s_lshr_b32 s8, s3, 24
+; GCN-HSA-NEXT: s_lshr_b32 s10, s3, 8
+; GCN-HSA-NEXT: s_lshr_b32 s12, s2, 8
+; GCN-HSA-NEXT: s_ashr_i32 s13, s2, 31
; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[6:7], 0x80000
-; GCN-HSA-NEXT: s_ashr_i32 s16, s6, 24
-; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[12:13], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[2:3], 0x80000
+; GCN-HSA-NEXT: s_ashr_i32 s16, s2, 24
+; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[12:13], 0x80000
; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 48
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
+; GCN-HSA-NEXT: s_add_u32 s4, s0, 48
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
+; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v6, s16
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v7, s13
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, s8
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s9
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, s10
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, s11
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, s8
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, s9
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s0, s0, 32
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
@@ -6371,8 +6367,6 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out,
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s14
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s15
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
; GCN-HSA-NEXT: s_endpgm
@@ -6388,27 +6382,25 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out,
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s8, v1
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v1
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v0
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s8, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s5, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s4, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s5, 16
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s5, 24
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s5, 8
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s8, 8
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s15, s8, 31
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s18, s8, 24
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s4, 8
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s15, s4, 31
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s18, s4, 24
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[8:9], 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[14:15], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[4:5], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s18
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s15
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6
@@ -6416,14 +6408,14 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out,
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s16
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s17
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s9
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s10
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s11
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s13
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s8
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s9
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s5
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
@@ -6939,83 +6931,77 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s13, 0
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s13
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, s13
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s15, s13
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s17, s13
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s13
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s4, v2
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s5, v3
-; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s18, v0
-; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s19, v1
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s4, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s4, 24
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s4, 8
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s18, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s18, 24
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s18, 8
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s5, 16
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s6, v0
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s7, v1
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s4, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s4, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s4, 8
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s6, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s6, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s6, 8
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s5, 16
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s5, 8
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, s5
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s19, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s19, 8
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s19
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[18:19], 0x80000
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s26, s5
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s7, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s7, 8
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s7
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[4:5], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s19, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s19, 24
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[6:7], 0x80000
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s7, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s7, 24
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s5, 31
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s38, s5, 24
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[16:17], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[14:15], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[30:31], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[28:29], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s37
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s34
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s35
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s38
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s33
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s18
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s19
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s20
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s21
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s22
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s23
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s26
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s27
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s12
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s13
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s14
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s15
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s31
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s29
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s25
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s26
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s27
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s16
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s17
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s21
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s9
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s14
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s15
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s22
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s23
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s12
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s13
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s24
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s25
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s10
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s11
@@ -7039,115 +7025,109 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; GCN-HSA-NEXT: s_mov_b32 s3, 0
-; GCN-HSA-NEXT: s_mov_b32 s5, s3
-; GCN-HSA-NEXT: s_mov_b32 s7, s3
-; GCN-HSA-NEXT: s_mov_b32 s9, s3
-; GCN-HSA-NEXT: s_mov_b32 s11, s3
-; GCN-HSA-NEXT: s_mov_b32 s13, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
-; GCN-HSA-NEXT: v_readfirstlane_b32 s14, v2
-; GCN-HSA-NEXT: v_readfirstlane_b32 s15, v3
-; GCN-HSA-NEXT: v_readfirstlane_b32 s16, v0
-; GCN-HSA-NEXT: v_readfirstlane_b32 s17, v1
-; GCN-HSA-NEXT: s_lshr_b32 s18, s14, 16
-; GCN-HSA-NEXT: s_lshr_b32 s20, s14, 24
-; GCN-HSA-NEXT: s_lshr_b32 s22, s14, 8
-; GCN-HSA-NEXT: s_lshr_b32 s2, s15, 16
-; GCN-HSA-NEXT: s_lshr_b32 s4, s15, 8
-; GCN-HSA-NEXT: s_mov_b32 s6, s15
-; GCN-HSA-NEXT: s_ashr_i32 s8, s15, 31
-; GCN-HSA-NEXT: s_ashr_i32 s10, s15, 24
-; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15
-; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[16:17], 0x80000
-; GCN-HSA-NEXT: v_mov_b32_e32 v6, s10
-; GCN-HSA-NEXT: s_ashr_i32 s10, s17, 31
-; GCN-HSA-NEXT: v_mov_b32_e32 v7, s8
-; GCN-HSA-NEXT: s_ashr_i32 s8, s17, 24
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14
-; GCN-HSA-NEXT: s_lshr_b32 s14, s16, 16
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s8
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s10
-; GCN-HSA-NEXT: s_lshr_b32 s24, s16, 24
-; GCN-HSA-NEXT: s_lshr_b32 s8, s17, 16
-; GCN-HSA-NEXT: s_lshr_b32 s10, s17, 8
-; GCN-HSA-NEXT: s_mov_b32 s12, s17
-; GCN-HSA-NEXT: s_lshr_b32 s16, s16, 8
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s15
-; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
+; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v2
+; GCN-HSA-NEXT: v_readfirstlane_b32 s3, v3
+; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v0
+; GCN-HSA-NEXT: v_readfirstlane_b32 s5, v1
+; GCN-HSA-NEXT: s_lshr_b32 s6, s2, 16
+; GCN-HSA-NEXT: s_lshr_b32 s8, s2, 24
+; GCN-HSA-NEXT: s_lshr_b32 s10, s2, 8
+; GCN-HSA-NEXT: s_lshr_b32 s18, s3, 16
+; GCN-HSA-NEXT: s_lshr_b32 s20, s3, 8
+; GCN-HSA-NEXT: s_mov_b32 s22, s3
+; GCN-HSA-NEXT: s_ashr_i32 s7, s3, 31
+; GCN-HSA-NEXT: s_ashr_i32 s9, s3, 24
; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
+; GCN-HSA-NEXT: s_lshr_b32 s12, s4, 16
+; GCN-HSA-NEXT: s_lshr_b32 s14, s4, 24
+; GCN-HSA-NEXT: s_lshr_b32 s16, s4, 8
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000
+; GCN-HSA-NEXT: s_ashr_i32 s4, s5, 24
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
+; GCN-HSA-NEXT: s_lshr_b32 s2, s5, 16
+; GCN-HSA-NEXT: s_ashr_i32 s3, s5, 31
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4
+; GCN-HSA-NEXT: s_lshr_b32 s4, s5, 8
+; GCN-HSA-NEXT: s_mov_b32 s24, s5
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, s9
+; GCN-HSA-NEXT: v_mov_b32_e32 v7, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3
; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17
-; GCN-HSA-NEXT: s_add_u32 s16, s0, 0x50
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6
+; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x50
; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[2:5]
-; GCN-HSA-NEXT: s_addc_u32 s17, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, s18
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s19
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, s20
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, s21
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[12:15]
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 64
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7
+; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7
+; GCN-HSA-NEXT: s_add_u32 s6, s0, 64
+; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s6
+; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, s8
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, s9
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, s14
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s15
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, s24
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, s25
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[12:15]
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s12
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s13
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, s14
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, s15
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s18
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s23
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[4:7]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s22
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s21
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 48
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, s8
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s9
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s0, s0, 32
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, s12
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s13
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, s10
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, s11
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s24
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s25
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
; GCN-HSA-NEXT: s_endpgm
@@ -7164,69 +7144,65 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s15, 0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s15
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s15
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, s15
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v2
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s12, v0
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s4, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s4, 24
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v0
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s4, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s4, 24
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v3
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s13, v1
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s4, 8
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s12, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s12, 24
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s12, 8
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v1
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s4, 8
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s6, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s6, 24
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s6, 8
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s5, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s5, 8
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s5
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s13, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s13, 8
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s13
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[12:13], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s5, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s5, 8
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s12, s5
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s7, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s7, 8
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s30, s7
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[4:5], 0x80000
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s29, s13, 31
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s31, s13, 24
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s29, s7, 31
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s31, s7, 24
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s17
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s18
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s19
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s14
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s15
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s17
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s33, s5, 31
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s38, s5, 24
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[30:31], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[28:29], 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[10:11], 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[28:29], 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s37
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s34
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s35
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s20
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s21
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s18
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s19
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:80
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, s22
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, s23
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, s24
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, s25
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s26
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s27
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, s20
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, s21
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, s22
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, s23
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s24
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s25
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s38
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s33
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s31
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s29
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s14
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s15
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s26
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s27
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s8
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12
@@ -8195,184 +8171,170 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s37, s7
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s13, s7
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s17, s7
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s15, s7
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s19, s7
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s21, s7
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s23, s7
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s25, s7
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s27, s7
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1)
-; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s28, v2
-; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s29, v3
-; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s34, v0
-; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s35, v1
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s18, v2
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s19, v3
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s22, v0
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s23, v1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s30, v6
-; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s31, v7
-; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s4, v4
-; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s5, v5
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s28, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s28, 24
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[38:39], s[28:29], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[34:35], 0x80000
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s38
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s39
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[38:39], s[30:31], 0x80000
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s40
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s41
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[4:5], 0x80000
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s38
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s39
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s28, 8
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s40
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s41
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s34, 16
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[38:39], s[8:9], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[6:7], 0x80000
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s38
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s39
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s34, 24
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s40
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s41
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s34, 8
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s36, s29
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s24, v6
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s25, v7
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s12, v4
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s13, v5
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s18, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s18, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s18, 8
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s22, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s22, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s22, 8
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s4, s24, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s24, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s24, 8
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s12, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s12, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s12, 8
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s19, 16
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[12:13], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[18:19], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[44:45], s[22:23], 0x80000
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s42
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s43
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[24:25], 0x80000
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s44
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s45
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s19, 8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s42
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s43
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s19
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s40
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s41
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s23, 16
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[38:39], s[10:11], 0x80000
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:208
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s36
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s37
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s30, 16
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s38
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s39
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s30, 24
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s29, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s29, 8
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s35, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s35, 8
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s16, s35
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s31, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s31, 8
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s20, s31
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[22:23], 0x80000
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:192
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s38
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s39
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s23, 8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s36
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s37
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s22, s23
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[12:13], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:208
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s36
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s37
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s31, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s41, s35, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s44, s35, 24
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s35, s29, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s29, 24
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s45, s31, 24
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s30, 8
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[24:25], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[38:39], s[26:27], 0x80000
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s36
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s37
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s4, 16
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s38
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s39
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s4, 24
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s5, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s5, 8
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s26, s5
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:144
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s36
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s37
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s25, 16
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s34
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s25, 8
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s29
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s5, 31
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s35
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s46, s5, 24
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s4, 8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s30
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s25, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s23, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s41, s23, 24
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s5, s19, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s7, s19, 24
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s44, s25, 24
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s25
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[16:17], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[24:25], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[22:23], 0x80000
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s28
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s29
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s13, 16
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s26
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s27
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s13, 8
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s45, s13, 31
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s46, s13, 24
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s30, s13
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[12:13], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[22:23], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[26:27], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[28:29], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[36:37], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[38:39], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[40:41], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[40:41], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[38:39], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s28
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s29
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s34
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:128
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s44
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s41
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s30
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s31
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s5
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s6
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s7
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s36
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s37
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s24
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s25
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s41
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s39
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s9
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s10
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s11
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s42
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s43
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s22
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s23
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s14
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s15
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s45
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s33
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s6
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s7
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:240
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s42
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s43
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s16
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s17
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s44
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s33
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s20
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s21
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s8
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s9
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:224
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s34
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s35
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s18
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s19
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:224
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s46
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s45
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s36
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s37
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:176
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s46
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s39
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s11
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s30
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s31
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s28
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s29
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:160
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s26
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s27
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s26
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s27
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s25
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s22
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s23
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s12
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s13
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:160
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s14
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s15
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s18
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s19
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:96
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s16
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s17
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_sextload_v32i8_to_v32i64:
@@ -8390,172 +8352,174 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; GCN-HSA-NEXT: s_mov_b32 s3, 0
-; GCN-HSA-NEXT: s_mov_b32 s19, s3
-; GCN-HSA-NEXT: s_mov_b32 s5, s3
-; GCN-HSA-NEXT: s_mov_b32 s7, s3
-; GCN-HSA-NEXT: s_mov_b32 s29, s3
-; GCN-HSA-NEXT: s_mov_b32 s15, s3
-; GCN-HSA-NEXT: s_mov_b32 s31, s3
-; GCN-HSA-NEXT: s_mov_b32 s13, s3
-; GCN-HSA-NEXT: s_mov_b32 s9, s3
-; GCN-HSA-NEXT: s_mov_b32 s11, s3
; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
-; GCN-HSA-NEXT: v_readfirstlane_b32 s16, v6
-; GCN-HSA-NEXT: v_readfirstlane_b32 s17, v7
-; GCN-HSA-NEXT: v_readfirstlane_b32 s20, v4
-; GCN-HSA-NEXT: v_readfirstlane_b32 s21, v5
-; GCN-HSA-NEXT: s_lshr_b32 s18, s16, 24
-; GCN-HSA-NEXT: s_lshr_b32 s14, s16, 16
-; GCN-HSA-NEXT: s_lshr_b32 s26, s20, 16
-; GCN-HSA-NEXT: s_lshr_b32 s28, s20, 24
-; GCN-HSA-NEXT: s_lshr_b32 s30, s20, 8
-; GCN-HSA-NEXT: s_lshr_b32 s4, s17, 8
-; GCN-HSA-NEXT: s_mov_b32 s6, s17
-; GCN-HSA-NEXT: s_lshr_b32 s12, s21, 16
-; GCN-HSA-NEXT: s_lshr_b32 s8, s21, 8
-; GCN-HSA-NEXT: s_mov_b32 s10, s21
-; GCN-HSA-NEXT: s_ashr_i32 s37, s21, 31
-; GCN-HSA-NEXT: s_ashr_i32 s38, s21, 24
-; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[20:21], 0x80000
-; GCN-HSA-NEXT: s_lshr_b32 s34, s16, 8
-; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[18:19], 0x80000
+; GCN-HSA-NEXT: v_readfirstlane_b32 s6, v6
+; GCN-HSA-NEXT: v_readfirstlane_b32 s8, v4
+; GCN-HSA-NEXT: v_readfirstlane_b32 s9, v5
+; GCN-HSA-NEXT: v_readfirstlane_b32 s7, v7
+; GCN-HSA-NEXT: s_lshr_b32 s20, s6, 16
+; GCN-HSA-NEXT: s_lshr_b32 s18, s6, 24
+; GCN-HSA-NEXT: s_lshr_b32 s10, s8, 16
+; GCN-HSA-NEXT: s_lshr_b32 s2, s8, 24
+; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[8:9], 0x80000
+; GCN-HSA-NEXT: s_lshr_b32 s16, s6, 8
+; GCN-HSA-NEXT: s_lshr_b32 s4, s8, 8
+; GCN-HSA-NEXT: s_lshr_b32 s12, s7, 16
+; GCN-HSA-NEXT: s_lshr_b32 s14, s7, 8
+; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x80000
+; GCN-HSA-NEXT: s_lshr_b32 s6, s9, 16
+; GCN-HSA-NEXT: s_mov_b32 s28, s9
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s26
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s27
+; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[18:19], 0x80000
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
-; GCN-HSA-NEXT: v_readfirstlane_b32 s48, v2
-; GCN-HSA-NEXT: v_readfirstlane_b32 s49, v3
-; GCN-HSA-NEXT: v_readfirstlane_b32 s50, v0
-; GCN-HSA-NEXT: v_readfirstlane_b32 s51, v1
-; GCN-HSA-NEXT: s_lshr_b32 s2, s17, 16
-; GCN-HSA-NEXT: s_ashr_i32 s41, s17, 31
-; GCN-HSA-NEXT: s_ashr_i32 s42, s17, 24
-; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[16:17], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[14:15], 0x80000
-; GCN-HSA-NEXT: v_mov_b32_e32 v6, s20
-; GCN-HSA-NEXT: v_mov_b32_e32 v7, s21
-; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[6:7], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[4:5], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[52:53], s[30:31], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[54:55], s[28:29], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[56:57], s[26:27], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[58:59], s[34:35], 0x80000
-; GCN-HSA-NEXT: s_lshr_b32 s18, s49, 16
-; GCN-HSA-NEXT: s_lshr_b32 s28, s49, 8
-; GCN-HSA-NEXT: s_mov_b32 s14, s49
-; GCN-HSA-NEXT: s_lshr_b32 s30, s51, 16
-; GCN-HSA-NEXT: s_lshr_b32 s6, s51, 8
-; GCN-HSA-NEXT: s_mov_b32 s4, s51
-; GCN-HSA-NEXT: s_lshr_b32 s34, s50, 16
-; GCN-HSA-NEXT: s_lshr_b32 s60, s50, 24
-; GCN-HSA-NEXT: s_lshr_b32 s62, s50, 8
-; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[50:51], 0x80000
-; GCN-HSA-NEXT: s_lshr_b32 s50, s48, 16
-; GCN-HSA-NEXT: s_lshr_b32 s64, s48, 24
-; GCN-HSA-NEXT: s_lshr_b32 s66, s48, 8
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17
-; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
+; GCN-HSA-NEXT: v_readfirstlane_b32 s40, v2
+; GCN-HSA-NEXT: v_readfirstlane_b32 s41, v3
+; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[10:11], 0x80000
+; GCN-HSA-NEXT: v_readfirstlane_b32 s44, v0
+; GCN-HSA-NEXT: v_readfirstlane_b32 s45, v1
+; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
+; GCN-HSA-NEXT: s_mov_b32 s22, s7
+; GCN-HSA-NEXT: s_lshr_b32 s8, s9, 8
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s24
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s25
+; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[16:17], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[28:29], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[6:7], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[14:15], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[12:13], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[4:5], 0x80000
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s20
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, s26
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, s27
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s42
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2
+; GCN-HSA-NEXT: s_lshr_b32 s42, s44, 16
+; GCN-HSA-NEXT: s_lshr_b32 s48, s44, 24
+; GCN-HSA-NEXT: s_lshr_b32 s28, s44, 8
+; GCN-HSA-NEXT: s_lshr_b32 s6, s45, 16
+; GCN-HSA-NEXT: s_lshr_b32 s2, s45, 8
+; GCN-HSA-NEXT: s_mov_b32 s4, s45
+; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[44:45], 0x80000
+; GCN-HSA-NEXT: s_lshr_b32 s44, s40, 16
+; GCN-HSA-NEXT: s_lshr_b32 s50, s40, 24
+; GCN-HSA-NEXT: s_lshr_b32 s52, s40, 8
+; GCN-HSA-NEXT: s_lshr_b32 s20, s41, 16
+; GCN-HSA-NEXT: s_lshr_b32 s12, s41, 8
+; GCN-HSA-NEXT: s_mov_b32 s14, s41
+; GCN-HSA-NEXT: s_ashr_i32 s33, s9, 31
+; GCN-HSA-NEXT: s_ashr_i32 s37, s7, 31
+; GCN-HSA-NEXT: s_ashr_i32 s38, s7, 24
+; GCN-HSA-NEXT: s_ashr_i32 s34, s9, 24
; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[12:13], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[2:3], 0x80000
-; GCN-HSA-NEXT: s_ashr_i32 s33, s51, 31
-; GCN-HSA-NEXT: s_ashr_i32 s36, s51, 24
-; GCN-HSA-NEXT: s_ashr_i32 s39, s49, 31
-; GCN-HSA-NEXT: s_ashr_i32 s40, s49, 24
-; GCN-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s21
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s30
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s31
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s43
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
+; GCN-HSA-NEXT: s_ashr_i32 s30, s45, 31
+; GCN-HSA-NEXT: s_ashr_i32 s31, s45, 24
+; GCN-HSA-NEXT: s_ashr_i32 s35, s41, 31
+; GCN-HSA-NEXT: s_ashr_i32 s36, s41, 24
+; GCN-HSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000
; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[6:7], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[30:31], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[28:29], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[62:63], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[60:61], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[60:61], s[66:67], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[62:63], s[64:65], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000
; GCN-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000
-; GCN-HSA-NEXT: s_add_u32 s64, s0, 0x50
-; GCN-HSA-NEXT: s_addc_u32 s65, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s46
-; GCN-HSA-NEXT: s_add_u32 s46, s0, 64
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s47
-; GCN-HSA-NEXT: s_addc_u32 s47, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v26, s46
-; GCN-HSA-NEXT: v_mov_b32_e32 v24, s64
-; GCN-HSA-NEXT: v_mov_b32_e32 v27, s47
-; GCN-HSA-NEXT: s_add_u32 s46, s0, 16
-; GCN-HSA-NEXT: v_mov_b32_e32 v25, s65
-; GCN-HSA-NEXT: s_addc_u32 s47, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7]
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s58
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s34
-; GCN-HSA-NEXT: s_add_u32 s34, s0, 0xd0
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s35
-; GCN-HSA-NEXT: s_addc_u32 s35, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v24, s34
-; GCN-HSA-NEXT: v_mov_b32_e32 v25, s35
-; GCN-HSA-NEXT: s_add_u32 s34, s0, 0xc0
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s59
-; GCN-HSA-NEXT: s_addc_u32 s35, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v28, s46
-; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3]
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, s56
+; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000
+; GCN-HSA-NEXT: s_add_u32 s54, s0, 0x50
+; GCN-HSA-NEXT: s_addc_u32 s55, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v24, s54
+; GCN-HSA-NEXT: v_mov_b32_e32 v25, s55
+; GCN-HSA-NEXT: s_add_u32 s54, s0, 64
+; GCN-HSA-NEXT: s_addc_u32 s55, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v26, s54
+; GCN-HSA-NEXT: v_mov_b32_e32 v27, s55
+; GCN-HSA-NEXT: s_add_u32 s54, s0, 16
+; GCN-HSA-NEXT: s_addc_u32 s55, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v20, s40
+; GCN-HSA-NEXT: s_add_u32 s40, s0, 0xd0
+; GCN-HSA-NEXT: v_mov_b32_e32 v21, s41
+; GCN-HSA-NEXT: s_addc_u32 s41, s1, 0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11]
+; GCN-HSA-NEXT: v_mov_b32_e32 v28, s54
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s40
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s41
+; GCN-HSA-NEXT: s_add_u32 s40, s0, 0xc0
+; GCN-HSA-NEXT: v_mov_b32_e32 v29, s55
+; GCN-HSA-NEXT: s_addc_u32 s41, s1, 0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15]
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26
; GCN-HSA-NEXT: s_add_u32 s26, s0, 0x90
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s57
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s54
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s55
-; GCN-HSA-NEXT: v_mov_b32_e32 v29, s47
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27
; GCN-HSA-NEXT: s_addc_u32 s27, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[8:11]
-; GCN-HSA-NEXT: v_mov_b32_e32 v29, s27
-; GCN-HSA-NEXT: v_mov_b32_e32 v28, s26
+; GCN-HSA-NEXT: v_mov_b32_e32 v24, s26
+; GCN-HSA-NEXT: v_mov_b32_e32 v25, s27
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1
; GCN-HSA-NEXT: s_add_u32 s26, s0, 0x80
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, s46
+; GCN-HSA-NEXT: v_mov_b32_e32 v7, s47
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0
; GCN-HSA-NEXT: s_addc_u32 s27, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, s24
+; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[4:7]
+; GCN-HSA-NEXT: v_mov_b32_e32 v16, s44
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s24
; GCN-HSA-NEXT: s_add_u32 s24, s0, 0x70
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, s44
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s45
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, s52
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, s53
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s25
+; GCN-HSA-NEXT: v_mov_b32_e32 v17, s45
+; GCN-HSA-NEXT: v_mov_b32_e32 v18, s50
+; GCN-HSA-NEXT: v_mov_b32_e32 v19, s51
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s40
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s25
; GCN-HSA-NEXT: s_addc_u32 s25, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v6, s30
-; GCN-HSA-NEXT: v_mov_b32_e32 v7, s31
-; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[12:15]
-; GCN-HSA-NEXT: v_mov_b32_e32 v31, s27
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, s20
-; GCN-HSA-NEXT: s_add_u32 s20, s0, 0x60
+; GCN-HSA-NEXT: v_mov_b32_e32 v22, s52
+; GCN-HSA-NEXT: v_mov_b32_e32 v23, s53
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s42
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s43
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s41
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, s48
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, s49
+; GCN-HSA-NEXT: v_mov_b32_e32 v26, s26
+; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[16:19]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23]
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s18
+; GCN-HSA-NEXT: s_add_u32 s18, s0, 0x60
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s29
-; GCN-HSA-NEXT: v_mov_b32_e32 v30, s26
-; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[4:7]
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, s21
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s24
-; GCN-HSA-NEXT: s_addc_u32 s21, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v6, s20
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, s22
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s42
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s41
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s23
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s25
-; GCN-HSA-NEXT: v_mov_b32_e32 v7, s21
-; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[0:3]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[8:11]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[12:15]
+; GCN-HSA-NEXT: v_mov_b32_e32 v27, s27
+; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15]
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s19
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s24
+; GCN-HSA-NEXT: s_addc_u32 s19, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, s18
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s22
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, s38
+; GCN-HSA-NEXT: v_mov_b32_e32 v7, s37
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s23
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s25
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, s19
+; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11]
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16
; GCN-HSA-NEXT: s_add_u32 s16, s0, 48
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17
; GCN-HSA-NEXT: s_addc_u32 s17, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s38
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s37
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: v_mov_b32_e32 v16, s50
+; GCN-HSA-NEXT: s_nop 0
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8
; GCN-HSA-NEXT: s_add_u32 s8, s0, 32
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9
@@ -8568,10 +8532,10 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s40
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s36
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s35
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9
; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xe0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
@@ -8583,34 +8547,24 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: v_mov_b32_e32 v17, s51
+; GCN-HSA-NEXT: s_nop 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6
; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xb0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7
; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6
; GCN-HSA-NEXT: s_add_u32 s0, s0, 0xa0
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s36
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s31
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s30
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v18, s62
-; GCN-HSA-NEXT: v_mov_b32_e32 v19, s63
-; GCN-HSA-NEXT: v_mov_b32_e32 v26, s34
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
-; GCN-HSA-NEXT: v_mov_b32_e32 v20, s48
-; GCN-HSA-NEXT: v_mov_b32_e32 v21, s49
-; GCN-HSA-NEXT: v_mov_b32_e32 v22, s60
-; GCN-HSA-NEXT: v_mov_b32_e32 v23, s61
-; GCN-HSA-NEXT: v_mov_b32_e32 v27, s35
-; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[20:23]
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_endpgm
;
@@ -8628,161 +8582,151 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s31, 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s25, s31
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s23, s31
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s17, s31
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s15, s31
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s31
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s31
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, s31
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1)
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v2
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s40, s4, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s4, 24
+; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s9, v7
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s11, v5
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v3
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s12, v0
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s4, 8
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v0
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s10, v4
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s38, s4, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s40, s4, 24
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s4, 8
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s9, 8
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s39, s11, 24
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v1
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s8, v6
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s6, 8
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s10, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s10, 24
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s36, s10, 8
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s11, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s11, 8
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s66, s11
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[10:11], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[68:69], s[4:5], 0x80000
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s33, s11, 31
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[60:61], 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s13, v1
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s12, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s12, 24
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[70:71], s[4:5], 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s40
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s41
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[60:61], s[38:39], 0x80000
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s8, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s8, 24
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s68
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s69
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s60
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s61
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s40
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s41
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s42
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s43
-; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s18, v6
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s12, 8
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s18, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s70
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s71
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s44
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s45
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s18, 24
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[12:13], 0x80000
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s8, 8
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s19, v7
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s46
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s47
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[8:9], 0x80000
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s34
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s35
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s48
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s49
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s26, v4
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s18, 8
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s26, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s34
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s35
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s50
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s51
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s26, 24
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[18:19], 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s27, v5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s52
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s53
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s54
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s55
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s36, s26, 8
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s50
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s51
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s52
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s53
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s5, 16
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s56
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s57
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[26:27], 0x80000
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s54
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s55
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s5, 8
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s58
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s59
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s60
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s61
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s24, s5
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s63, s19, 24
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s69, s5, 31
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s5, 8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s56
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s57
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s58
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s26, s5
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s71, s5, 31
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s72, s5, 24
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s13, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s7, 16
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s21
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s36
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s37
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s27, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s68, s27, 8
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s27
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s33, s27, 31
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s38, s27, 24
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[24:25], 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[62:63], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s13, 8
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s7, 8
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s72
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s69
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s16, s13
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s65, s13, 31
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s67, s13, 24
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s71
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s18, s7
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s67, s7, 31
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s70, s7, 24
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s19, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s9, 16
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s27
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s24
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s25
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s19, 8
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s19
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s39, s19, 31
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[16:17], 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[64:65], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s12, s9
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s67
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s65
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s70
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s67
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s63, s9, 31
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s65, s9, 24
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[10:11], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s19
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s16
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s17
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[66:67], 0x80000
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s6, 16
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s15
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s63
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s39
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s65
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s63
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s6, 24
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[62:63], 0x80000
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[66:67], 0x80000
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s13
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[68:69], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[64:65], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
-; GCN-NOHSA-VI-NEXT: s_nop 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s44
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s9
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s38
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s39
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s33
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s45
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s46
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s47
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT: s_nop 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:144
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4
diff --git a/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll b/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll
index e439dfda08e0a..ba31b353ee1f6 100644
--- a/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll
@@ -26,7 +26,7 @@ define <2 x i32> @stest_f64i32(<2 x double> %x) {
; CHECK-NEXT: vmov.32 d9[1], r5
; CHECK-NEXT: sbcs r5, r5, #0
; CHECK-NEXT: mov r5, #0
-; CHECK-NEXT: mvn r12, #0
+; CHECK-NEXT: mvn r4, #0
; CHECK-NEXT: movwlt r5, #1
; CHECK-NEXT: cmp r5, #0
; CHECK-NEXT: mvnne r5, #0
@@ -34,6 +34,7 @@ define <2 x i32> @stest_f64i32(<2 x double> %x) {
; CHECK-NEXT: sbcs r0, r1, #0
; CHECK-NEXT: vmov.32 d8[1], r1
; CHECK-NEXT: mov r0, #0
+; CHECK-NEXT: vmov.i32 q10, #0x80000000
; CHECK-NEXT: movwlt r0, #1
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: vdup.32 d19, r5
@@ -42,24 +43,22 @@ define <2 x i32> @stest_f64i32(<2 x double> %x) {
; CHECK-NEXT: mov r2, #0
; CHECK-NEXT: vdup.32 d18, r0
; CHECK-NEXT: vbit q8, q4, q9
-; CHECK-NEXT: adr r4, .LCPI0_1
-; CHECK-NEXT: vld1.64 {d18, d19}, [r4:128]
; CHECK-NEXT: vmov r0, r1, d17
; CHECK-NEXT: vmov r3, r5, d16
; CHECK-NEXT: rsbs r0, r0, #-2147483648
-; CHECK-NEXT: sbcs r0, r12, r1
+; CHECK-NEXT: sbcs r0, r4, r1
; CHECK-NEXT: mov r0, #0
; CHECK-NEXT: movwlt r0, #1
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: mvnne r0, #0
; CHECK-NEXT: rsbs r1, r3, #-2147483648
-; CHECK-NEXT: sbcs r1, r12, r5
-; CHECK-NEXT: vdup.32 d21, r0
+; CHECK-NEXT: sbcs r1, r4, r5
+; CHECK-NEXT: vdup.32 d19, r0
; CHECK-NEXT: movwlt r2, #1
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: mvnne r2, #0
-; CHECK-NEXT: vdup.32 d20, r2
-; CHECK-NEXT: vbif q8, q9, q10
+; CHECK-NEXT: vdup.32 d18, r2
+; CHECK-NEXT: vbif q8, q10, q9
; CHECK-NEXT: vmovn.i64 d0, q8
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: pop {r4, r5, r11, pc}
@@ -70,11 +69,6 @@ define <2 x i32> @stest_f64i32(<2 x double> %x) {
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 2147483647 @ 0x7fffffff
; CHECK-NEXT: .long 0 @ 0x0
-; CHECK-NEXT: .LCPI0_1:
-; CHECK-NEXT: .long 2147483648 @ 0x80000000
-; CHECK-NEXT: .long 4294967295 @ 0xffffffff
-; CHECK-NEXT: .long 2147483648 @ 0x80000000
-; CHECK-NEXT: .long 4294967295 @ 0xffffffff
entry:
%conv = fptosi <2 x double> %x to <2 x i64>
%0 = icmp slt <2 x i64> %conv, <i64 2147483647, i64 2147483647>
@@ -100,22 +94,20 @@ define <2 x i32> @utest_f64i32(<2 x double> %x) {
; CHECK-NEXT: vmov r0, r1, d8
; CHECK-NEXT: vmov.32 d9[0], r4
; CHECK-NEXT: bl __aeabi_d2ulz
-; CHECK-NEXT: vmov.32 d8[0], r0
; CHECK-NEXT: mvn r3, #0
; CHECK-NEXT: subs r4, r4, r3
-; CHECK-NEXT: mov r2, #0
-; CHECK-NEXT: vmov.32 d9[1], r5
; CHECK-NEXT: sbcs r5, r5, #0
+; CHECK-NEXT: vmov.32 d8[0], r0
; CHECK-NEXT: mov r5, #0
+; CHECK-NEXT: mov r2, #0
; CHECK-NEXT: movwlo r5, #1
; CHECK-NEXT: cmp r5, #0
; CHECK-NEXT: mvnne r5, #0
; CHECK-NEXT: subs r0, r0, r3
; CHECK-NEXT: sbcs r0, r1, #0
-; CHECK-NEXT: vmov.32 d8[1], r1
+; CHECK-NEXT: vdup.32 d17, r5
; CHECK-NEXT: movwlo r2, #1
; CHECK-NEXT: cmp r2, #0
-; CHECK-NEXT: vdup.32 d17, r5
; CHECK-NEXT: mvnne r2, #0
; CHECK-NEXT: vdup.32 d16, r2
; CHECK-NEXT: vand q9, q4, q8
@@ -177,11 +169,11 @@ define <2 x i32> @ustest_f64i32(<2 x double> %x) {
; CHECK-NEXT: mvnne r0, #0
; CHECK-NEXT: rsbs r1, r3, #0
; CHECK-NEXT: rscs r1, r5, #0
-; CHECK-NEXT: vdup.32 d19, r0
+; CHECK-NEXT: vmov.32 d19[0], r0
; CHECK-NEXT: movwlt r2, #1
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: mvnne r2, #0
-; CHECK-NEXT: vdup.32 d18, r2
+; CHECK-NEXT: vmov.32 d18[0], r2
; CHECK-NEXT: vand q8, q9, q8
; CHECK-NEXT: vmovn.i64 d0, q8
; CHECK-NEXT: vpop {d8, d9}
@@ -332,61 +324,57 @@ define <4 x i32> @utest_f32i32(<4 x float> %x) {
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: vorr q4, q0, q0
-; CHECK-NEXT: vmov r0, s19
+; CHECK-NEXT: vmov r0, s17
; CHECK-NEXT: bl __aeabi_f2ulz
-; CHECK-NEXT: mov r8, r0
+; CHECK-NEXT: mov r10, r0
; CHECK-NEXT: vmov r0, s16
-; CHECK-NEXT: mov r9, r1
-; CHECK-NEXT: vmov r4, s17
-; CHECK-NEXT: vmov r6, s18
-; CHECK-NEXT: vmov.32 d9[0], r8
+; CHECK-NEXT: mov r8, r1
+; CHECK-NEXT: vmov r5, s19
+; CHECK-NEXT: vmov r7, s18
+; CHECK-NEXT: vmov.32 d9[0], r10
; CHECK-NEXT: bl __aeabi_f2ulz
-; CHECK-NEXT: mov r10, r0
-; CHECK-NEXT: vmov.32 d10[0], r0
-; CHECK-NEXT: mov r0, r4
-; CHECK-NEXT: mov r7, r1
+; CHECK-NEXT: mov r6, r0
+; CHECK-NEXT: vmov.32 d8[0], r0
+; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: mov r9, r1
; CHECK-NEXT: bl __aeabi_f2ulz
-; CHECK-NEXT: mov r4, r0
+; CHECK-NEXT: mov r5, r0
; CHECK-NEXT: vmov.32 d11[0], r0
-; CHECK-NEXT: mov r0, r6
-; CHECK-NEXT: mov r5, r1
+; CHECK-NEXT: mov r0, r7
+; CHECK-NEXT: mov r4, r1
; CHECK-NEXT: bl __aeabi_f2ulz
-; CHECK-NEXT: vmov.32 d8[0], r0
; CHECK-NEXT: mvn r3, #0
+; CHECK-NEXT: vmov.32 d10[0], r0
; CHECK-NEXT: subs r0, r0, r3
; CHECK-NEXT: mov r2, #0
; CHECK-NEXT: sbcs r0, r1, #0
-; CHECK-NEXT: vmov.32 d9[1], r9
; CHECK-NEXT: mov r0, #0
; CHECK-NEXT: movwlo r0, #1
; CHECK-NEXT: cmp r0, #0
-; CHECK-NEXT: vmov.32 d8[1], r1
; CHECK-NEXT: mvnne r0, #0
-; CHECK-NEXT: subs r1, r8, r3
-; CHECK-NEXT: sbcs r1, r9, #0
-; CHECK-NEXT: vmov.32 d11[1], r5
+; CHECK-NEXT: subs r1, r5, r3
+; CHECK-NEXT: sbcs r1, r4, #0
; CHECK-NEXT: mov r1, #0
; CHECK-NEXT: movwlo r1, #1
; CHECK-NEXT: cmp r1, #0
; CHECK-NEXT: mvnne r1, #0
-; CHECK-NEXT: subs r6, r4, r3
-; CHECK-NEXT: sbcs r6, r5, #0
+; CHECK-NEXT: subs r7, r10, r3
+; CHECK-NEXT: sbcs r7, r8, #0
; CHECK-NEXT: vdup.32 d19, r1
-; CHECK-NEXT: mov r6, #0
+; CHECK-NEXT: mov r7, #0
; CHECK-NEXT: vdup.32 d18, r0
-; CHECK-NEXT: movwlo r6, #1
-; CHECK-NEXT: cmp r6, #0
-; CHECK-NEXT: mvnne r6, #0
-; CHECK-NEXT: subs r3, r10, r3
-; CHECK-NEXT: sbcs r3, r7, #0
-; CHECK-NEXT: vmov.32 d10[1], r7
+; CHECK-NEXT: movwlo r7, #1
+; CHECK-NEXT: cmp r7, #0
+; CHECK-NEXT: mvnne r7, #0
+; CHECK-NEXT: subs r3, r6, r3
+; CHECK-NEXT: sbcs r3, r9, #0
+; CHECK-NEXT: vdup.32 d17, r7
; CHECK-NEXT: movwlo r2, #1
; CHECK-NEXT: cmp r2, #0
-; CHECK-NEXT: vdup.32 d17, r6
; CHECK-NEXT: mvnne r2, #0
-; CHECK-NEXT: vand q10, q4, q9
+; CHECK-NEXT: vand q10, q5, q9
; CHECK-NEXT: vdup.32 d16, r2
-; CHECK-NEXT: vand q11, q5, q8
+; CHECK-NEXT: vand q11, q4, q8
; CHECK-NEXT: vorn q9, q10, q9
; CHECK-NEXT: vorn q8, q11, q8
; CHECK-NEXT: vmovn.i64 d1, q9
@@ -409,46 +397,45 @@ define <4 x i32> @ustest_f32i32(<4 x float> %x) {
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vorr q4, q0, q0
-; CHECK-NEXT: vmov r0, s18
-; CHECK-NEXT: bl __aeabi_f2lz
-; CHECK-NEXT: mov r5, r0
; CHECK-NEXT: vmov r0, s19
-; CHECK-NEXT: mov r6, r1
; CHECK-NEXT: bl __aeabi_f2lz
-; CHECK-NEXT: mov r2, r0
-; CHECK-NEXT: vmov r0, s17
-; CHECK-NEXT: vmov.32 d17[0], r2
-; CHECK-NEXT: mvn r4, #0
-; CHECK-NEXT: subs r2, r2, r4
+; CHECK-NEXT: mov r6, r0
+; CHECK-NEXT: vmov r0, s18
+; CHECK-NEXT: mov r7, r1
+; CHECK-NEXT: vmov r5, s17
; CHECK-NEXT: vmov r8, s16
-; CHECK-NEXT: vmov.32 d16[0], r5
-; CHECK-NEXT: vmov.i64 q5, #0xffffffff
-; CHECK-NEXT: mov r7, #0
-; CHECK-NEXT: vmov.32 d17[1], r1
-; CHECK-NEXT: sbcs r1, r1, #0
-; CHECK-NEXT: mov r1, #0
-; CHECK-NEXT: movwlt r1, #1
-; CHECK-NEXT: cmp r1, #0
-; CHECK-NEXT: mvnne r1, #0
-; CHECK-NEXT: subs r2, r5, r4
-; CHECK-NEXT: sbcs r2, r6, #0
-; CHECK-NEXT: vdup.32 d19, r1
+; CHECK-NEXT: vmov.32 d9[0], r6
+; CHECK-NEXT: bl __aeabi_f2lz
+; CHECK-NEXT: mvn r9, #0
+; CHECK-NEXT: subs r2, r6, r9
+; CHECK-NEXT: sbcs r2, r7, #0
+; CHECK-NEXT: vmov.32 d8[0], r0
; CHECK-NEXT: mov r2, #0
-; CHECK-NEXT: vmov.32 d16[1], r6
+; CHECK-NEXT: vmov.i64 q5, #0xffffffff
; CHECK-NEXT: movwlt r2, #1
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: mvnne r2, #0
-; CHECK-NEXT: vdup.32 d18, r2
-; CHECK-NEXT: vorr q4, q9, q9
-; CHECK-NEXT: vbsl q4, q8, q5
-; CHECK-NEXT: vmov r10, r9, d8
+; CHECK-NEXT: subs r0, r0, r9
+; CHECK-NEXT: sbcs r0, r1, #0
+; CHECK-NEXT: vmov.32 d9[1], r7
+; CHECK-NEXT: mov r0, #0
+; CHECK-NEXT: mov r4, #0
+; CHECK-NEXT: movwlt r0, #1
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: vmov.32 d8[1], r1
+; CHECK-NEXT: mvnne r0, #0
+; CHECK-NEXT: vdup.32 d17, r2
+; CHECK-NEXT: vdup.32 d16, r0
+; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: vbif q4, q5, q8
; CHECK-NEXT: bl __aeabi_f2lz
; CHECK-NEXT: mov r5, r0
; CHECK-NEXT: vmov.32 d13[0], r0
; CHECK-NEXT: mov r0, r8
; CHECK-NEXT: mov r6, r1
+; CHECK-NEXT: vmov r7, r10, d8
; CHECK-NEXT: bl __aeabi_f2lz
-; CHECK-NEXT: subs r2, r5, r4
+; CHECK-NEXT: subs r2, r5, r9
; CHECK-NEXT: vmov.32 d12[0], r0
; CHECK-NEXT: sbcs r2, r6, #0
; CHECK-NEXT: mov r2, #0
@@ -456,25 +443,25 @@ define <4 x i32> @ustest_f32i32(<4 x float> %x) {
; CHECK-NEXT: movwlt r2, #1
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: mvnne r2, #0
-; CHECK-NEXT: subs r0, r0, r4
+; CHECK-NEXT: subs r0, r0, r9
; CHECK-NEXT: sbcs r0, r1, #0
-; CHECK-NEXT: vmov.32 d12[1], r1
+; CHECK-NEXT: vdup.32 d17, r2
; CHECK-NEXT: mov r0, #0
+; CHECK-NEXT: vmov.32 d12[1], r1
; CHECK-NEXT: movwlt r0, #1
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: mvnne r0, #0
-; CHECK-NEXT: vdup.32 d17, r2
+; CHECK-NEXT: vmov r2, r3, d9
; CHECK-NEXT: vdup.32 d16, r0
-; CHECK-NEXT: vmov r0, r1, d9
+; CHECK-NEXT: rsbs r7, r7, #0
; CHECK-NEXT: vbsl q8, q6, q5
-; CHECK-NEXT: rsbs r6, r10, #0
-; CHECK-NEXT: rscs r6, r9, #0
-; CHECK-NEXT: mov r6, #0
-; CHECK-NEXT: vmov r2, r3, d17
-; CHECK-NEXT: movwlt r6, #1
-; CHECK-NEXT: vmov r5, r4, d16
-; CHECK-NEXT: cmp r6, #0
-; CHECK-NEXT: mvnne r6, #0
+; CHECK-NEXT: rscs r7, r10, #0
+; CHECK-NEXT: mov r7, #0
+; CHECK-NEXT: movwlt r7, #1
+; CHECK-NEXT: cmp r7, #0
+; CHECK-NEXT: vmov r0, r1, d17
+; CHECK-NEXT: mvnne r7, #0
+; CHECK-NEXT: vmov r6, r5, d16
; CHECK-NEXT: rsbs r0, r0, #0
; CHECK-NEXT: rscs r0, r1, #0
; CHECK-NEXT: mov r0, #0
@@ -483,20 +470,20 @@ define <4 x i32> @ustest_f32i32(<4 x float> %x) {
; CHECK-NEXT: mvnne r0, #0
; CHECK-NEXT: rsbs r1, r2, #0
; CHECK-NEXT: rscs r1, r3, #0
-; CHECK-NEXT: vdup.32 d21, r0
+; CHECK-NEXT: vmov.32 d19[0], r0
; CHECK-NEXT: mov r1, #0
-; CHECK-NEXT: vdup.32 d20, r6
; CHECK-NEXT: movwlt r1, #1
; CHECK-NEXT: cmp r1, #0
; CHECK-NEXT: mvnne r1, #0
-; CHECK-NEXT: rsbs r2, r5, #0
-; CHECK-NEXT: rscs r2, r4, #0
-; CHECK-NEXT: vdup.32 d19, r1
-; CHECK-NEXT: movwlt r7, #1
-; CHECK-NEXT: cmp r7, #0
-; CHECK-NEXT: mvnne r7, #0
+; CHECK-NEXT: rsbs r0, r6, #0
+; CHECK-NEXT: rscs r0, r5, #0
+; CHECK-NEXT: vmov.32 d21[0], r1
+; CHECK-NEXT: movwlt r4, #1
+; CHECK-NEXT: cmp r4, #0
+; CHECK-NEXT: vmov.32 d20[0], r7
+; CHECK-NEXT: mvnne r4, #0
+; CHECK-NEXT: vmov.32 d18[0], r4
; CHECK-NEXT: vand q10, q10, q4
-; CHECK-NEXT: vdup.32 d18, r7
; CHECK-NEXT: vand q8, q9, q8
; CHECK-NEXT: vmovn.i64 d1, q10
; CHECK-NEXT: vmovn.i64 d0, q8
@@ -764,151 +751,141 @@ entry:
define <4 x i32> @utest_f16i32(<4 x half> %x) {
; CHECK-NEON-LABEL: utest_f16i32:
; CHECK-NEON: @ %bb.0: @ %entry
-; CHECK-NEON-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
-; CHECK-NEON-NEXT: push {r4, r5, r6, r7, r8, r9, r10, lr}
-; CHECK-NEON-NEXT: .vsave {d8, d9, d10, d11}
-; CHECK-NEON-NEXT: vpush {d8, d9, d10, d11}
+; CHECK-NEON-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr}
+; CHECK-NEON-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr}
+; CHECK-NEON-NEXT: .vsave {d12, d13}
+; CHECK-NEON-NEXT: vpush {d12, d13}
+; CHECK-NEON-NEXT: .vsave {d8, d9, d10}
+; CHECK-NEON-NEXT: vpush {d8, d9, d10}
; CHECK-NEON-NEXT: vmov r0, s3
; CHECK-NEON-NEXT: vmov.f32 s16, s2
; CHECK-NEON-NEXT: vmov.f32 s18, s1
; CHECK-NEON-NEXT: vmov.f32 s20, s0
; CHECK-NEON-NEXT: bl __aeabi_h2f
; CHECK-NEON-NEXT: bl __aeabi_f2ulz
-; CHECK-NEON-NEXT: mov r8, r0
+; CHECK-NEON-NEXT: mov r4, r0
+; CHECK-NEON-NEXT: vmov r0, s18
+; CHECK-NEON-NEXT: mov r8, r1
+; CHECK-NEON-NEXT: bl __aeabi_h2f
+; CHECK-NEON-NEXT: bl __aeabi_f2ulz
+; CHECK-NEON-NEXT: mov r6, r0
+; CHECK-NEON-NEXT: vmov.32 d13[0], r0
; CHECK-NEON-NEXT: vmov r0, s20
; CHECK-NEON-NEXT: mov r9, r1
-; CHECK-NEON-NEXT: vmov r4, s18
-; CHECK-NEON-NEXT: vmov r6, s16
-; CHECK-NEON-NEXT: vmov.32 d9[0], r8
; CHECK-NEON-NEXT: bl __aeabi_h2f
; CHECK-NEON-NEXT: bl __aeabi_f2ulz
-; CHECK-NEON-NEXT: mov r10, r0
-; CHECK-NEON-NEXT: vmov.32 d10[0], r0
-; CHECK-NEON-NEXT: mov r0, r4
+; CHECK-NEON-NEXT: mov r5, r0
+; CHECK-NEON-NEXT: vmov.32 d12[0], r0
+; CHECK-NEON-NEXT: vmov r0, s16
; CHECK-NEON-NEXT: mov r7, r1
; CHECK-NEON-NEXT: bl __aeabi_h2f
+; CHECK-NEON-NEXT: vmov.32 d9[0], r4
; CHECK-NEON-NEXT: bl __aeabi_f2ulz
-; CHECK-NEON-NEXT: mov r4, r0
-; CHECK-NEON-NEXT: vmov.32 d11[0], r0
-; CHECK-NEON-NEXT: mov r0, r6
-; CHECK-NEON-NEXT: mov r5, r1
-; CHECK-NEON-NEXT: bl __aeabi_h2f
-; CHECK-NEON-NEXT: bl __aeabi_f2ulz
-; CHECK-NEON-NEXT: vmov.32 d8[0], r0
; CHECK-NEON-NEXT: mvn r3, #0
+; CHECK-NEON-NEXT: vmov.32 d8[0], r0
; CHECK-NEON-NEXT: subs r0, r0, r3
; CHECK-NEON-NEXT: mov r2, #0
; CHECK-NEON-NEXT: sbcs r0, r1, #0
-; CHECK-NEON-NEXT: vmov.32 d9[1], r9
; CHECK-NEON-NEXT: mov r0, #0
; CHECK-NEON-NEXT: movwlo r0, #1
; CHECK-NEON-NEXT: cmp r0, #0
-; CHECK-NEON-NEXT: vmov.32 d8[1], r1
; CHECK-NEON-NEXT: mvnne r0, #0
-; CHECK-NEON-NEXT: subs r1, r8, r3
-; CHECK-NEON-NEXT: sbcs r1, r9, #0
-; CHECK-NEON-NEXT: vmov.32 d11[1], r5
+; CHECK-NEON-NEXT: subs r1, r4, r3
+; CHECK-NEON-NEXT: sbcs r1, r8, #0
; CHECK-NEON-NEXT: mov r1, #0
; CHECK-NEON-NEXT: movwlo r1, #1
; CHECK-NEON-NEXT: cmp r1, #0
; CHECK-NEON-NEXT: mvnne r1, #0
-; CHECK-NEON-NEXT: subs r6, r4, r3
-; CHECK-NEON-NEXT: sbcs r6, r5, #0
+; CHECK-NEON-NEXT: subs r6, r6, r3
+; CHECK-NEON-NEXT: sbcs r6, r9, #0
; CHECK-NEON-NEXT: vdup.32 d19, r1
; CHECK-NEON-NEXT: mov r6, #0
; CHECK-NEON-NEXT: vdup.32 d18, r0
; CHECK-NEON-NEXT: movwlo r6, #1
; CHECK-NEON-NEXT: cmp r6, #0
; CHECK-NEON-NEXT: mvnne r6, #0
-; CHECK-NEON-NEXT: subs r3, r10, r3
+; CHECK-NEON-NEXT: subs r3, r5, r3
; CHECK-NEON-NEXT: sbcs r3, r7, #0
-; CHECK-NEON-NEXT: vmov.32 d10[1], r7
+; CHECK-NEON-NEXT: vdup.32 d17, r6
; CHECK-NEON-NEXT: movwlo r2, #1
; CHECK-NEON-NEXT: cmp r2, #0
-; CHECK-NEON-NEXT: vdup.32 d17, r6
; CHECK-NEON-NEXT: mvnne r2, #0
; CHECK-NEON-NEXT: vand q10, q4, q9
; CHECK-NEON-NEXT: vdup.32 d16, r2
-; CHECK-NEON-NEXT: vand q11, q5, q8
+; CHECK-NEON-NEXT: vand q11, q6, q8
; CHECK-NEON-NEXT: vorn q9, q10, q9
; CHECK-NEON-NEXT: vorn q8, q11, q8
; CHECK-NEON-NEXT: vmovn.i64 d1, q9
; CHECK-NEON-NEXT: vmovn.i64 d0, q8
-; CHECK-NEON-NEXT: vpop {d8, d9, d10, d11}
-; CHECK-NEON-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, pc}
+; CHECK-NEON-NEXT: vpop {d8, d9, d10}
+; CHECK-NEON-NEXT: vpop {d12, d13}
+; CHECK-NEON-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc}
;
; CHECK-FP16-LABEL: utest_f16i32:
; CHECK-FP16: @ %bb.0: @ %entry
; CHECK-FP16-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr}
; CHECK-FP16-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr}
-; CHECK-FP16-NEXT: .vsave {d10, d11, d12, d13}
-; CHECK-FP16-NEXT: vpush {d10, d11, d12, d13}
-; CHECK-FP16-NEXT: .vsave {d8}
-; CHECK-FP16-NEXT: vpush {d8}
-; CHECK-FP16-NEXT: vmov.u16 r0, d0[3]
+; CHECK-FP16-NEXT: .vsave {d8, d9, d10, d11}
+; CHECK-FP16-NEXT: vpush {d8, d9, d10, d11}
+; CHECK-FP16-NEXT: vmov.u16 r0, d0[1]
; CHECK-FP16-NEXT: vorr d8, d0, d0
-; CHECK-FP16-NEXT: vmov.u16 r4, d0[1]
+; CHECK-FP16-NEXT: vmov.u16 r5, d0[3]
; CHECK-FP16-NEXT: vmov s0, r0
; CHECK-FP16-NEXT: bl __fixunshfdi
-; CHECK-FP16-NEXT: mov r6, r0
+; CHECK-FP16-NEXT: mov r4, r0
; CHECK-FP16-NEXT: vmov.u16 r0, d8[0]
-; CHECK-FP16-NEXT: mov r7, r1
-; CHECK-FP16-NEXT: vmov.32 d11[0], r6
+; CHECK-FP16-NEXT: mov r8, r1
+; CHECK-FP16-NEXT: vmov.32 d11[0], r4
; CHECK-FP16-NEXT: vmov s0, r0
; CHECK-FP16-NEXT: bl __fixunshfdi
-; CHECK-FP16-NEXT: vmov s0, r4
-; CHECK-FP16-NEXT: mov r8, r0
+; CHECK-FP16-NEXT: vmov s0, r5
+; CHECK-FP16-NEXT: mov r6, r0
; CHECK-FP16-NEXT: mov r9, r1
-; CHECK-FP16-NEXT: vmov.32 d12[0], r0
+; CHECK-FP16-NEXT: vmov.32 d10[0], r0
; CHECK-FP16-NEXT: bl __fixunshfdi
-; CHECK-FP16-NEXT: mov r4, r0
+; CHECK-FP16-NEXT: mov r5, r0
; CHECK-FP16-NEXT: vmov.u16 r0, d8[2]
-; CHECK-FP16-NEXT: mov r5, r1
-; CHECK-FP16-NEXT: vmov.32 d13[0], r4
+; CHECK-FP16-NEXT: mov r7, r1
+; CHECK-FP16-NEXT: vmov.32 d9[0], r5
; CHECK-FP16-NEXT: vmov s0, r0
; CHECK-FP16-NEXT: bl __fixunshfdi
-; CHECK-FP16-NEXT: vmov.32 d10[0], r0
; CHECK-FP16-NEXT: mvn r3, #0
+; CHECK-FP16-NEXT: vmov.32 d8[0], r0
; CHECK-FP16-NEXT: subs r0, r0, r3
; CHECK-FP16-NEXT: mov r2, #0
; CHECK-FP16-NEXT: sbcs r0, r1, #0
-; CHECK-FP16-NEXT: vmov.32 d11[1], r7
; CHECK-FP16-NEXT: mov r0, #0
; CHECK-FP16-NEXT: movwlo r0, #1
; CHECK-FP16-NEXT: cmp r0, #0
-; CHECK-FP16-NEXT: vmov.32 d10[1], r1
; CHECK-FP16-NEXT: mvnne r0, #0
-; CHECK-FP16-NEXT: subs r1, r6, r3
+; CHECK-FP16-NEXT: subs r1, r5, r3
; CHECK-FP16-NEXT: sbcs r1, r7, #0
-; CHECK-FP16-NEXT: vmov.32 d13[1], r5
; CHECK-FP16-NEXT: mov r1, #0
; CHECK-FP16-NEXT: movwlo r1, #1
; CHECK-FP16-NEXT: cmp r1, #0
; CHECK-FP16-NEXT: mvnne r1, #0
; CHECK-FP16-NEXT: subs r7, r4, r3
-; CHECK-FP16-NEXT: sbcs r7, r5, #0
+; CHECK-FP16-NEXT: sbcs r7, r8, #0
; CHECK-FP16-NEXT: vdup.32 d19, r1
; CHECK-FP16-NEXT: mov r7, #0
; CHECK-FP16-NEXT: vdup.32 d18, r0
; CHECK-FP16-NEXT: movwlo r7, #1
; CHECK-FP16-NEXT: cmp r7, #0
; CHECK-FP16-NEXT: mvnne r7, #0
-; CHECK-FP16-NEXT: subs r3, r8, r3
+; CHECK-FP16-NEXT: subs r3, r6, r3
; CHECK-FP16-NEXT: sbcs r3, r9, #0
-; CHECK-FP16-NEXT: vmov.32 d12[1], r9
+; CHECK-FP16-NEXT: vdup.32 d17, r7
; CHECK-FP16-NEXT: movwlo r2, #1
; CHECK-FP16-NEXT: cmp r2, #0
-; CHECK-FP16-NEXT: vdup.32 d17, r7
; CHECK-FP16-NEXT: mvnne r2, #0
-; CHECK-FP16-NEXT: vand q10, q5, q9
+; CHECK-FP16-NEXT: vand q10, q4, q9
; CHECK-FP16-NEXT: vdup.32 d16, r2
-; CHECK-FP16-NEXT: vand q11, q6, q8
+; CHECK-FP16-NEXT: vand q11, q5, q8
; CHECK-FP16-NEXT: vorn q9, q10, q9
; CHECK-FP16-NEXT: vorn q8, q11, q8
; CHECK-FP16-NEXT: vmovn.i64 d1, q9
; CHECK-FP16-NEXT: vmovn.i64 d0, q8
-; CHECK-FP16-NEXT: vpop {d8}
-; CHECK-FP16-NEXT: vpop {d10, d11, d12, d13}
+; CHECK-FP16-NEXT: vpop {d8, d9, d10, d11}
; CHECK-FP16-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc}
entry:
%conv = fptoui <4 x half> %x to <4 x i64>
@@ -925,8 +902,8 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
; CHECK-NEON-NEXT: push {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEON-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEON-NEXT: vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEON-NEXT: vmov r0, s2
-; CHECK-NEON-NEXT: vmov.f32 s16, s3
+; CHECK-NEON-NEXT: vmov r0, s3
+; CHECK-NEON-NEXT: vmov.f32 s16, s2
; CHECK-NEON-NEXT: vmov.f32 s18, s1
; CHECK-NEON-NEXT: vmov.f32 s20, s0
; CHECK-NEON-NEXT: bl __aeabi_h2f
@@ -935,43 +912,42 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
; CHECK-NEON-NEXT: vmov r0, s16
; CHECK-NEON-NEXT: mov r6, r1
; CHECK-NEON-NEXT: bl __aeabi_h2f
+; CHECK-NEON-NEXT: vmov r8, s20
+; CHECK-NEON-NEXT: vmov.32 d13[0], r5
; CHECK-NEON-NEXT: bl __aeabi_f2lz
-; CHECK-NEON-NEXT: mov r2, r0
-; CHECK-NEON-NEXT: vmov r0, s18
-; CHECK-NEON-NEXT: vmov.32 d17[0], r2
-; CHECK-NEON-NEXT: mvn r8, #0
-; CHECK-NEON-NEXT: subs r2, r2, r8
-; CHECK-NEON-NEXT: vmov r4, s20
-; CHECK-NEON-NEXT: vmov.32 d16[0], r5
-; CHECK-NEON-NEXT: vmov.i64 q5, #0xffffffff
+; CHECK-NEON-NEXT: vmov r2, s18
+; CHECK-NEON-NEXT: vmov.32 d12[0], r0
+; CHECK-NEON-NEXT: mvn r9, #0
+; CHECK-NEON-NEXT: subs r0, r0, r9
+; CHECK-NEON-NEXT: sbcs r0, r1, #0
+; CHECK-NEON-NEXT: vmov.32 d13[1], r6
+; CHECK-NEON-NEXT: mov r0, #0
; CHECK-NEON-NEXT: mov r7, #0
-; CHECK-NEON-NEXT: vmov.32 d17[1], r1
-; CHECK-NEON-NEXT: sbcs r1, r1, #0
+; CHECK-NEON-NEXT: movwlt r0, #1
+; CHECK-NEON-NEXT: cmp r0, #0
+; CHECK-NEON-NEXT: vmov.32 d12[1], r1
+; CHECK-NEON-NEXT: mvnne r0, #0
+; CHECK-NEON-NEXT: subs r1, r5, r9
+; CHECK-NEON-NEXT: sbcs r1, r6, #0
; CHECK-NEON-NEXT: mov r1, #0
; CHECK-NEON-NEXT: movwlt r1, #1
; CHECK-NEON-NEXT: cmp r1, #0
; CHECK-NEON-NEXT: mvnne r1, #0
-; CHECK-NEON-NEXT: subs r2, r5, r8
-; CHECK-NEON-NEXT: sbcs r2, r6, #0
-; CHECK-NEON-NEXT: vdup.32 d19, r1
-; CHECK-NEON-NEXT: mov r2, #0
-; CHECK-NEON-NEXT: vmov.32 d16[1], r6
-; CHECK-NEON-NEXT: movwlt r2, #1
-; CHECK-NEON-NEXT: cmp r2, #0
-; CHECK-NEON-NEXT: mvnne r2, #0
-; CHECK-NEON-NEXT: vdup.32 d18, r2
-; CHECK-NEON-NEXT: vorr q4, q9, q9
-; CHECK-NEON-NEXT: vbsl q4, q8, q5
-; CHECK-NEON-NEXT: vmov r10, r9, d8
+; CHECK-NEON-NEXT: vdup.32 d9, r1
+; CHECK-NEON-NEXT: vdup.32 d8, r0
+; CHECK-NEON-NEXT: mov r0, r2
; CHECK-NEON-NEXT: bl __aeabi_h2f
+; CHECK-NEON-NEXT: vmov.i64 q5, #0xffffffff
+; CHECK-NEON-NEXT: vbsl q4, q6, q5
; CHECK-NEON-NEXT: bl __aeabi_f2lz
; CHECK-NEON-NEXT: mov r5, r0
; CHECK-NEON-NEXT: vmov.32 d13[0], r0
-; CHECK-NEON-NEXT: mov r0, r4
+; CHECK-NEON-NEXT: mov r0, r8
; CHECK-NEON-NEXT: mov r6, r1
+; CHECK-NEON-NEXT: vmov r4, r10, d8
; CHECK-NEON-NEXT: bl __aeabi_h2f
; CHECK-NEON-NEXT: bl __aeabi_f2lz
-; CHECK-NEON-NEXT: subs r2, r5, r8
+; CHECK-NEON-NEXT: subs r2, r5, r9
; CHECK-NEON-NEXT: vmov.32 d12[0], r0
; CHECK-NEON-NEXT: sbcs r2, r6, #0
; CHECK-NEON-NEXT: mov r2, #0
@@ -979,25 +955,25 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
; CHECK-NEON-NEXT: movwlt r2, #1
; CHECK-NEON-NEXT: cmp r2, #0
; CHECK-NEON-NEXT: mvnne r2, #0
-; CHECK-NEON-NEXT: subs r0, r0, r8
+; CHECK-NEON-NEXT: subs r0, r0, r9
; CHECK-NEON-NEXT: sbcs r0, r1, #0
-; CHECK-NEON-NEXT: vmov.32 d12[1], r1
+; CHECK-NEON-NEXT: vdup.32 d17, r2
; CHECK-NEON-NEXT: mov r0, #0
+; CHECK-NEON-NEXT: vmov.32 d12[1], r1
; CHECK-NEON-NEXT: movwlt r0, #1
; CHECK-NEON-NEXT: cmp r0, #0
; CHECK-NEON-NEXT: mvnne r0, #0
-; CHECK-NEON-NEXT: vdup.32 d17, r2
+; CHECK-NEON-NEXT: vmov r2, r3, d9
; CHECK-NEON-NEXT: vdup.32 d16, r0
-; CHECK-NEON-NEXT: vmov r0, r1, d9
+; CHECK-NEON-NEXT: rsbs r6, r4, #0
; CHECK-NEON-NEXT: vbsl q8, q6, q5
-; CHECK-NEON-NEXT: rsbs r6, r10, #0
-; CHECK-NEON-NEXT: rscs r6, r9, #0
+; CHECK-NEON-NEXT: rscs r6, r10, #0
; CHECK-NEON-NEXT: mov r6, #0
-; CHECK-NEON-NEXT: vmov r2, r3, d17
; CHECK-NEON-NEXT: movwlt r6, #1
-; CHECK-NEON-NEXT: vmov r5, r4, d16
; CHECK-NEON-NEXT: cmp r6, #0
+; CHECK-NEON-NEXT: vmov r0, r1, d17
; CHECK-NEON-NEXT: mvnne r6, #0
+; CHECK-NEON-NEXT: vmov r5, r4, d16
; CHECK-NEON-NEXT: rsbs r0, r0, #0
; CHECK-NEON-NEXT: rscs r0, r1, #0
; CHECK-NEON-NEXT: mov r0, #0
@@ -1006,20 +982,20 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
; CHECK-NEON-NEXT: mvnne r0, #0
; CHECK-NEON-NEXT: rsbs r1, r2, #0
; CHECK-NEON-NEXT: rscs r1, r3, #0
-; CHECK-NEON-NEXT: vdup.32 d21, r0
+; CHECK-NEON-NEXT: vmov.32 d19[0], r0
; CHECK-NEON-NEXT: mov r1, #0
-; CHECK-NEON-NEXT: vdup.32 d20, r6
; CHECK-NEON-NEXT: movwlt r1, #1
; CHECK-NEON-NEXT: cmp r1, #0
; CHECK-NEON-NEXT: mvnne r1, #0
-; CHECK-NEON-NEXT: rsbs r2, r5, #0
-; CHECK-NEON-NEXT: rscs r2, r4, #0
-; CHECK-NEON-NEXT: vdup.32 d19, r1
+; CHECK-NEON-NEXT: rsbs r0, r5, #0
+; CHECK-NEON-NEXT: rscs r0, r4, #0
+; CHECK-NEON-NEXT: vmov.32 d21[0], r1
; CHECK-NEON-NEXT: movwlt r7, #1
; CHECK-NEON-NEXT: cmp r7, #0
+; CHECK-NEON-NEXT: vmov.32 d20[0], r6
; CHECK-NEON-NEXT: mvnne r7, #0
+; CHECK-NEON-NEXT: vmov.32 d18[0], r7
; CHECK-NEON-NEXT: vand q10, q10, q4
-; CHECK-NEON-NEXT: vdup.32 d18, r7
; CHECK-NEON-NEXT: vand q8, q9, q8
; CHECK-NEON-NEXT: vmovn.i64 d1, q10
; CHECK-NEON-NEXT: vmovn.i64 d0, q8
@@ -1028,78 +1004,78 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
;
; CHECK-FP16-LABEL: ustest_f16i32:
; CHECK-FP16: @ %bb.0: @ %entry
-; CHECK-FP16-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr}
-; CHECK-FP16-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr}
+; CHECK-FP16-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-FP16-NEXT: push {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-FP16-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-FP16-NEXT: vpush {d8, d9, d10, d11, d12, d13}
; CHECK-FP16-NEXT: vmov.u16 r0, d0[3]
; CHECK-FP16-NEXT: vorr d8, d0, d0
-; CHECK-FP16-NEXT: vmov.u16 r8, d0[1]
+; CHECK-FP16-NEXT: vmov.u16 r8, d0[0]
+; CHECK-FP16-NEXT: vmov.u16 r9, d0[1]
; CHECK-FP16-NEXT: vmov s0, r0
; CHECK-FP16-NEXT: bl __fixhfdi
; CHECK-FP16-NEXT: mov r4, r0
; CHECK-FP16-NEXT: vmov.u16 r0, d8[2]
; CHECK-FP16-NEXT: mov r5, r1
-; CHECK-FP16-NEXT: vmov.32 d11[0], r4
+; CHECK-FP16-NEXT: vmov.32 d9[0], r4
; CHECK-FP16-NEXT: vmov s0, r0
; CHECK-FP16-NEXT: bl __fixhfdi
-; CHECK-FP16-NEXT: mvn r7, #0
-; CHECK-FP16-NEXT: subs r2, r4, r7
+; CHECK-FP16-NEXT: mvn r10, #0
+; CHECK-FP16-NEXT: subs r2, r4, r10
; CHECK-FP16-NEXT: sbcs r2, r5, #0
-; CHECK-FP16-NEXT: vmov.32 d10[0], r0
+; CHECK-FP16-NEXT: vmov.32 d8[0], r0
; CHECK-FP16-NEXT: mov r2, #0
-; CHECK-FP16-NEXT: vmov.i64 q6, #0xffffffff
+; CHECK-FP16-NEXT: vmov s0, r9
; CHECK-FP16-NEXT: movwlt r2, #1
; CHECK-FP16-NEXT: cmp r2, #0
; CHECK-FP16-NEXT: mvnne r2, #0
-; CHECK-FP16-NEXT: subs r0, r0, r7
+; CHECK-FP16-NEXT: subs r0, r0, r10
; CHECK-FP16-NEXT: sbcs r0, r1, #0
-; CHECK-FP16-NEXT: vmov.32 d11[1], r5
+; CHECK-FP16-NEXT: vmov.32 d9[1], r5
; CHECK-FP16-NEXT: mov r0, #0
-; CHECK-FP16-NEXT: vmov s0, r8
+; CHECK-FP16-NEXT: vmov.i64 q5, #0xffffffff
; CHECK-FP16-NEXT: movwlt r0, #1
; CHECK-FP16-NEXT: cmp r0, #0
-; CHECK-FP16-NEXT: vmov.32 d10[1], r1
+; CHECK-FP16-NEXT: vmov.32 d8[1], r1
; CHECK-FP16-NEXT: mvnne r0, #0
; CHECK-FP16-NEXT: mov r6, #0
; CHECK-FP16-NEXT: vdup.32 d17, r2
; CHECK-FP16-NEXT: vdup.32 d16, r0
-; CHECK-FP16-NEXT: vbif q5, q6, q8
-; CHECK-FP16-NEXT: vmov r9, r8, d10
+; CHECK-FP16-NEXT: vbif q4, q5, q8
; CHECK-FP16-NEXT: bl __fixhfdi
+; CHECK-FP16-NEXT: vmov s0, r8
; CHECK-FP16-NEXT: mov r4, r0
-; CHECK-FP16-NEXT: vmov.u16 r0, d8[0]
; CHECK-FP16-NEXT: mov r5, r1
-; CHECK-FP16-NEXT: vmov.32 d9[0], r4
-; CHECK-FP16-NEXT: vmov s0, r0
+; CHECK-FP16-NEXT: vmov.32 d13[0], r0
+; CHECK-FP16-NEXT: vmov r7, r8, d8
; CHECK-FP16-NEXT: bl __fixhfdi
-; CHECK-FP16-NEXT: subs r2, r4, r7
-; CHECK-FP16-NEXT: vmov.32 d8[0], r0
+; CHECK-FP16-NEXT: subs r2, r4, r10
+; CHECK-FP16-NEXT: vmov.32 d12[0], r0
; CHECK-FP16-NEXT: sbcs r2, r5, #0
; CHECK-FP16-NEXT: mov r2, #0
-; CHECK-FP16-NEXT: vmov.32 d9[1], r5
+; CHECK-FP16-NEXT: vmov.32 d13[1], r5
; CHECK-FP16-NEXT: movwlt r2, #1
; CHECK-FP16-NEXT: cmp r2, #0
; CHECK-FP16-NEXT: mvnne r2, #0
-; CHECK-FP16-NEXT: subs r0, r0, r7
+; CHECK-FP16-NEXT: subs r0, r0, r10
; CHECK-FP16-NEXT: sbcs r0, r1, #0
-; CHECK-FP16-NEXT: vmov.32 d8[1], r1
+; CHECK-FP16-NEXT: vdup.32 d17, r2
; CHECK-FP16-NEXT: mov r0, #0
+; CHECK-FP16-NEXT: vmov.32 d12[1], r1
; CHECK-FP16-NEXT: movwlt r0, #1
; CHECK-FP16-NEXT: cmp r0, #0
; CHECK-FP16-NEXT: mvnne r0, #0
-; CHECK-FP16-NEXT: vdup.32 d17, r2
+; CHECK-FP16-NEXT: vmov r2, r3, d9
; CHECK-FP16-NEXT: vdup.32 d16, r0
-; CHECK-FP16-NEXT: vmov r0, r1, d11
-; CHECK-FP16-NEXT: vbsl q8, q4, q6
-; CHECK-FP16-NEXT: rsbs r7, r9, #0
+; CHECK-FP16-NEXT: rsbs r7, r7, #0
+; CHECK-FP16-NEXT: vbsl q8, q6, q5
; CHECK-FP16-NEXT: rscs r7, r8, #0
; CHECK-FP16-NEXT: mov r7, #0
-; CHECK-FP16-NEXT: vmov r2, r3, d17
; CHECK-FP16-NEXT: movwlt r7, #1
-; CHECK-FP16-NEXT: vmov r5, r4, d16
; CHECK-FP16-NEXT: cmp r7, #0
+; CHECK-FP16-NEXT: vmov r0, r1, d17
; CHECK-FP16-NEXT: mvnne r7, #0
+; CHECK-FP16-NEXT: vmov r5, r4, d16
; CHECK-FP16-NEXT: rsbs r0, r0, #0
; CHECK-FP16-NEXT: rscs r0, r1, #0
; CHECK-FP16-NEXT: mov r0, #0
@@ -1108,25 +1084,25 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
; CHECK-FP16-NEXT: mvnne r0, #0
; CHECK-FP16-NEXT: rsbs r1, r2, #0
; CHECK-FP16-NEXT: rscs r1, r3, #0
-; CHECK-FP16-NEXT: vdup.32 d21, r0
+; CHECK-FP16-NEXT: vmov.32 d19[0], r0
; CHECK-FP16-NEXT: mov r1, #0
-; CHECK-FP16-NEXT: vdup.32 d20, r7
; CHECK-FP16-NEXT: movwlt r1, #1
; CHECK-FP16-NEXT: cmp r1, #0
; CHECK-FP16-NEXT: mvnne r1, #0
-; CHECK-FP16-NEXT: rsbs r2, r5, #0
-; CHECK-FP16-NEXT: rscs r2, r4, #0
-; CHECK-FP16-NEXT: vdup.32 d19, r1
+; CHECK-FP16-NEXT: rsbs r0, r5, #0
+; CHECK-FP16-NEXT: rscs r0, r4, #0
+; CHECK-FP16-NEXT: vmov.32 d21[0], r1
; CHECK-FP16-NEXT: movwlt r6, #1
; CHECK-FP16-NEXT: cmp r6, #0
+; CHECK-FP16-NEXT: vmov.32 d20[0], r7
; CHECK-FP16-NEXT: mvnne r6, #0
-; CHECK-FP16-NEXT: vand q10, q10, q5
-; CHECK-FP16-NEXT: vdup.32 d18, r6
+; CHECK-FP16-NEXT: vmov.32 d18[0], r6
+; CHECK-FP16-NEXT: vand q10, q10, q4
; CHECK-FP16-NEXT: vand q8, q9, q8
; CHECK-FP16-NEXT: vmovn.i64 d1, q10
; CHECK-FP16-NEXT: vmovn.i64 d0, q8
; CHECK-FP16-NEXT: vpop {d8, d9, d10, d11, d12, d13}
-; CHECK-FP16-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc}
+; CHECK-FP16-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, pc}
entry:
%conv = fptosi <4 x half> %x to <4 x i64>
%0 = icmp slt <4 x i64> %conv, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
diff --git a/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll b/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll
index 0134ee48ad421..742f2a75a1aa8 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll
@@ -279,42 +279,40 @@ define arm_aapcs_vfpcc <4 x i32> @ustest_f16i32(<4 x half> %x) {
; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vmov.u16 r0, q0[2]
+; CHECK-NEXT: vmov.u16 r0, q0[3]
; CHECK-NEXT: vmov q4, q0
; CHECK-NEXT: bl __fixhfdi
; CHECK-NEXT: mov r4, r0
-; CHECK-NEXT: vmov.u16 r0, q4[3]
+; CHECK-NEXT: vmov.u16 r0, q4[2]
; CHECK-NEXT: mov r5, r1
; CHECK-NEXT: bl __fixhfdi
-; CHECK-NEXT: rsbs r2, r4, #0
+; CHECK-NEXT: rsbs r2, r0, #0
; CHECK-NEXT: mov.w r6, #0
-; CHECK-NEXT: sbcs.w r2, r6, r5
-; CHECK-NEXT: vmov q0[2], q0[0], r4, r0
-; CHECK-NEXT: csetm r2, lt
-; CHECK-NEXT: rsbs r0, r0, #0
-; CHECK-NEXT: mov.w r3, #0
+; CHECK-NEXT: vmov q0[2], q0[0], r0, r4
; CHECK-NEXT: sbcs.w r0, r6, r1
-; CHECK-NEXT: bfi r3, r2, #0, #8
; CHECK-NEXT: csetm r0, lt
-; CHECK-NEXT: bfi r3, r0, #8, #8
-; CHECK-NEXT: vmov.u16 r0, q4[0]
+; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: bfi r1, r0, #0, #8
+; CHECK-NEXT: rsbs r0, r4, #0
+; CHECK-NEXT: sbcs.w r0, r6, r5
; CHECK-NEXT: vmov.i32 q5, #0x0
-; CHECK-NEXT: vmov q0[3], q0[1], r5, r1
-; CHECK-NEXT: vmsr p0, r3
+; CHECK-NEXT: csetm r0, lt
+; CHECK-NEXT: bfi r1, r0, #8, #8
+; CHECK-NEXT: vmov.u16 r0, q4[1]
+; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpsel q6, q0, q5
; CHECK-NEXT: bl __fixhfdi
; CHECK-NEXT: mov r4, r0
-; CHECK-NEXT: vmov.u16 r0, q4[1]
+; CHECK-NEXT: vmov.u16 r0, q4[0]
; CHECK-NEXT: mov r5, r1
; CHECK-NEXT: bl __fixhfdi
-; CHECK-NEXT: rsbs r2, r4, #0
-; CHECK-NEXT: vmov q0[2], q0[0], r4, r0
-; CHECK-NEXT: sbcs.w r2, r6, r5
-; CHECK-NEXT: vmov q0[3], q0[1], r5, r1
-; CHECK-NEXT: csetm r2, lt
-; CHECK-NEXT: rsbs r0, r0, #0
+; CHECK-NEXT: rsbs r2, r0, #0
+; CHECK-NEXT: vmov q0[2], q0[0], r0, r4
; CHECK-NEXT: sbcs.w r0, r6, r1
-; CHECK-NEXT: bfi r6, r2, #0, #8
+; CHECK-NEXT: csetm r0, lt
+; CHECK-NEXT: rsbs r1, r4, #0
+; CHECK-NEXT: sbcs.w r1, r6, r5
+; CHECK-NEXT: bfi r6, r0, #0, #8
; CHECK-NEXT: csetm r0, lt
; CHECK-NEXT: bfi r6, r0, #8, #8
; CHECK-NEXT: vmsr p0, r6
@@ -1353,42 +1351,40 @@ define arm_aapcs_vfpcc <4 x i32> @ustest_f16i32_mm(<4 x half> %x) {
; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vmov.u16 r0, q0[2]
+; CHECK-NEXT: vmov.u16 r0, q0[3]
; CHECK-NEXT: vmov q4, q0
; CHECK-NEXT: bl __fixhfdi
; CHECK-NEXT: mov r4, r0
-; CHECK-NEXT: vmov.u16 r0, q4[3]
+; CHECK-NEXT: vmov.u16 r0, q4[2]
; CHECK-NEXT: mov r5, r1
; CHECK-NEXT: bl __fixhfdi
-; CHECK-NEXT: rsbs r2, r4, #0
+; CHECK-NEXT: rsbs r2, r0, #0
; CHECK-NEXT: mov.w r6, #0
-; CHECK-NEXT: sbcs.w r2, r6, r5
-; CHECK-NEXT: vmov q0[2], q0[0], r4, r0
-; CHECK-NEXT: csetm r2, lt
-; CHECK-NEXT: rsbs r0, r0, #0
-; CHECK-NEXT: mov.w r3, #0
+; CHECK-NEXT: vmov q0[2], q0[0], r0, r4
; CHECK-NEXT: sbcs.w r0, r6, r1
-; CHECK-NEXT: bfi r3, r2, #0, #8
; CHECK-NEXT: csetm r0, lt
-; CHECK-NEXT: bfi r3, r0, #8, #8
-; CHECK-NEXT: vmov.u16 r0, q4[0]
+; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: bfi r1, r0, #0, #8
+; CHECK-NEXT: rsbs r0, r4, #0
+; CHECK-NEXT: sbcs.w r0, r6, r5
; CHECK-NEXT: vmov.i32 q5, #0x0
-; CHECK-NEXT: vmov q0[3], q0[1], r5, r1
-; CHECK-NEXT: vmsr p0, r3
+; CHECK-NEXT: csetm r0, lt
+; CHECK-NEXT: bfi r1, r0, #8, #8
+; CHECK-NEXT: vmov.u16 r0, q4[1]
+; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpsel q6, q0, q5
; CHECK-NEXT: bl __fixhfdi
; CHECK-NEXT: mov r4, r0
-; CHECK-NEXT: vmov.u16 r0, q4[1]
+; CHECK-NEXT: vmov.u16 r0, q4[0]
; CHECK-NEXT: mov r5, r1
; CHECK-NEXT: bl __fixhfdi
-; CHECK-NEXT: rsbs r2, r4, #0
-; CHECK-NEXT: vmov q0[2], q0[0], r4, r0
-; CHECK-NEXT: sbcs.w r2, r6, r5
-; CHECK-NEXT: vmov q0[3], q0[1], r5, r1
-; CHECK-NEXT: csetm r2, lt
-; CHECK-NEXT: rsbs r0, r0, #0
+; CHECK-NEXT: rsbs r2, r0, #0
+; CHECK-NEXT: vmov q0[2], q0[0], r0, r4
; CHECK-NEXT: sbcs.w r0, r6, r1
-; CHECK-NEXT: bfi r6, r2, #0, #8
+; CHECK-NEXT: csetm r0, lt
+; CHECK-NEXT: rsbs r1, r4, #0
+; CHECK-NEXT: sbcs.w r1, r6, r5
+; CHECK-NEXT: bfi r6, r0, #0, #8
; CHECK-NEXT: csetm r0, lt
; CHECK-NEXT: bfi r6, r0, #8, #8
; CHECK-NEXT: vmsr p0, r6
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll
index 5ac88c581f33c..b4a2aa7a1ed1b 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll
@@ -33,7 +33,12 @@ define arm_aapcs_vfpcc <2 x i8> @unscaled_v2i8_i8(ptr %base, ptr %offptr) {
; CHECK-LABEL: unscaled_v2i8_i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldrb r2, [r1]
+; CHECK-NEXT: vmov.i32 q0, #0xff
; CHECK-NEXT: ldrb r1, [r1, #1]
+; CHECK-NEXT: vmov q1[2], q1[0], r2, r1
+; CHECK-NEXT: vand q0, q1, q0
+; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: ldrb r1, [r0, r1]
; CHECK-NEXT: ldrb r0, [r0, r2]
; CHECK-NEXT: vmov q0[2], q0[0], r0, r1
diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
index fe5d7f29f01ff..acbe48f9e5927 100644
--- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
@@ -302,37 +302,35 @@ define arm_aapcs_vfpcc <4 x i32> @ext_ops_trunc_i32(<4 x i32> %a, <4 x i32> %b)
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
-; CHECK-NEXT: vmov.f32 s12, s2
+; CHECK-NEXT: vmov.f32 s8, s2
; CHECK-NEXT: vmov.f32 s2, s3
; CHECK-NEXT: vmov.f32 s10, s7
+; CHECK-NEXT: vmov r10, s8
; CHECK-NEXT: vmov.f32 s8, s6
; CHECK-NEXT: vmov.f32 s6, s5
-; CHECK-NEXT: vmov r10, s2
-; CHECK-NEXT: vmov r2, s10
-; CHECK-NEXT: vmov.f32 s2, s1
-; CHECK-NEXT: adds.w r6, r10, r2
+; CHECK-NEXT: vmov r2, s8
; CHECK-NEXT: asr.w r0, r10, #31
+; CHECK-NEXT: adds.w r6, r10, r2
+; CHECK-NEXT: eor.w r7, r10, r2
; CHECK-NEXT: adc r3, r0, #0
-; CHECK-NEXT: eor.w r1, r10, r2
; CHECK-NEXT: asrl r6, r3, r2
; CHECK-NEXT: subs r0, r6, r2
-; CHECK-NEXT: vmov r6, s12
+; CHECK-NEXT: vmov r6, s2
; CHECK-NEXT: sbc lr, r3, #0
-; CHECK-NEXT: vmov r3, s8
+; CHECK-NEXT: vmov r3, s10
+; CHECK-NEXT: vmov.f32 s2, s1
; CHECK-NEXT: umull r0, r8, r0, r2
-; CHECK-NEXT: vmov.i64 q3, #0xffffffff
-; CHECK-NEXT: vand q2, q2, q3
+; CHECK-NEXT: asrs r5, r6, #31
; CHECK-NEXT: adds r4, r6, r3
-; CHECK-NEXT: asr.w r7, r6, #31
-; CHECK-NEXT: adc r5, r7, #0
-; CHECK-NEXT: eor.w r7, r6, r3
+; CHECK-NEXT: adc r5, r5, #0
+; CHECK-NEXT: eor.w r1, r6, r3
; CHECK-NEXT: asrl r4, r5, r3
; CHECK-NEXT: subs r4, r4, r3
; CHECK-NEXT: sbc r5, r5, #0
-; CHECK-NEXT: orrs.w r7, r7, r6, asr #31
+; CHECK-NEXT: orrs.w r7, r7, r10, asr #31
; CHECK-NEXT: umull r4, r12, r4, r3
; CHECK-NEXT: csetm r9, eq
-; CHECK-NEXT: orrs.w r1, r1, r10, asr #31
+; CHECK-NEXT: orrs.w r1, r1, r6, asr #31
; CHECK-NEXT: mov.w r7, #0
; CHECK-NEXT: csetm r1, eq
; CHECK-NEXT: bfi r7, r9, #0, #8
@@ -345,49 +343,47 @@ define arm_aapcs_vfpcc <4 x i32> @ext_ops_trunc_i32(<4 x i32> %a, <4 x i32> %b)
; CHECK-NEXT: rsb.w r1, r10, #0
; CHECK-NEXT: lsll r4, r5, r3
; CHECK-NEXT: lsll r0, r7, r1
+; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: vmov r1, s6
; CHECK-NEXT: lsll r0, r7, r2
+; CHECK-NEXT: vmov q3[2], q3[0], r0, r4
; CHECK-NEXT: mov.w r12, #0
-; CHECK-NEXT: vmov q3[2], q3[0], r4, r0
-; CHECK-NEXT: vmov r0, s2
-; CHECK-NEXT: vmov q3[3], q3[1], r5, r7
; CHECK-NEXT: vpsel q2, q3, q2
-; CHECK-NEXT: adds r4, r0, r1
-; CHECK-NEXT: asr.w r2, r0, #31
-; CHECK-NEXT: adc r3, r2, #0
-; CHECK-NEXT: asrl r4, r3, r1
-; CHECK-NEXT: subs r2, r4, r1
-; CHECK-NEXT: vmov r4, s0
-; CHECK-NEXT: sbc r8, r3, #0
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: umull r2, lr, r2, r1
-; CHECK-NEXT: adds r6, r4, r3
-; CHECK-NEXT: asr.w r5, r4, #31
+; CHECK-NEXT: adds r2, r3, r1
+; CHECK-NEXT: asr.w r0, r3, #31
+; CHECK-NEXT: adc r5, r0, #0
+; CHECK-NEXT: asrl r2, r5, r1
+; CHECK-NEXT: subs r0, r2, r1
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: sbc r8, r5, #0
+; CHECK-NEXT: umull r4, lr, r0, r1
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: asrs r5, r2, #31
+; CHECK-NEXT: adds r6, r2, r0
; CHECK-NEXT: adc r7, r5, #0
; CHECK-NEXT: mla r5, r8, r1, lr
-; CHECK-NEXT: asrl r6, r7, r3
-; CHECK-NEXT: subs.w r8, r6, r3
-; CHECK-NEXT: eor.w r6, r4, r3
+; CHECK-NEXT: asrl r6, r7, r0
+; CHECK-NEXT: subs.w r8, r6, r0
+; CHECK-NEXT: eor.w r6, r2, r0
; CHECK-NEXT: sbc lr, r7, #0
-; CHECK-NEXT: eor.w r7, r0, r1
-; CHECK-NEXT: orrs.w r6, r6, r4, asr #31
-; CHECK-NEXT: orr.w r7, r7, r0, asr #31
+; CHECK-NEXT: eor.w r7, r3, r1
+; CHECK-NEXT: orrs.w r6, r6, r2, asr #31
+; CHECK-NEXT: orr.w r7, r7, r3, asr #31
; CHECK-NEXT: csetm r6, eq
; CHECK-NEXT: cmp r7, #0
; CHECK-NEXT: bfi r12, r6, #0, #8
; CHECK-NEXT: csetm r6, eq
; CHECK-NEXT: bfi r12, r6, #8, #8
-; CHECK-NEXT: umull r6, r7, r8, r3
-; CHECK-NEXT: rsbs r0, r0, #0
-; CHECK-NEXT: lsll r2, r5, r0
-; CHECK-NEXT: rsbs r0, r4, #0
-; CHECK-NEXT: mla r7, lr, r3, r7
-; CHECK-NEXT: lsll r2, r5, r1
+; CHECK-NEXT: umull r6, r7, r8, r0
+; CHECK-NEXT: rsb.w r8, r3, #0
+; CHECK-NEXT: lsll r4, r5, r8
; CHECK-NEXT: vmsr p0, r12
-; CHECK-NEXT: lsll r6, r7, r0
-; CHECK-NEXT: lsll r6, r7, r3
-; CHECK-NEXT: vmov q0[2], q0[0], r6, r2
-; CHECK-NEXT: vmov q0[3], q0[1], r7, r5
+; CHECK-NEXT: mla r3, lr, r0, r7
+; CHECK-NEXT: lsll r4, r5, r1
+; CHECK-NEXT: rsbs r1, r2, #0
+; CHECK-NEXT: lsll r6, r3, r1
+; CHECK-NEXT: lsll r6, r3, r0
+; CHECK-NEXT: vmov q0[2], q0[0], r6, r4
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vmov.f32 s1, s2
; CHECK-NEXT: vmov.f32 s2, s8
diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll b/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll
index b8d9670710a00..0bec2b100911c 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll
@@ -420,7 +420,6 @@ define arm_aapcs_vfpcc <2 x i64> @zext_v2i1_v2f64(<2 x double> %src) {
; CHECK-MVE-NEXT: cmp r0, #0
; CHECK-MVE-NEXT: csetm r0, eq
; CHECK-MVE-NEXT: vmov q0[2], q0[0], r0, r6
-; CHECK-MVE-NEXT: vmov q0[3], q0[1], r0, r6
; CHECK-MVE-NEXT: vand q0, q0, q4
; CHECK-MVE-NEXT: vpop {d8, d9}
; CHECK-MVE-NEXT: pop {r4, r5, r6, pc}
diff --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
index 5972a9a7cf934..29b56639bd769 100644
--- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
@@ -11,63 +11,59 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(ptr nocapture readonly %pSrcA, ptr no
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: beq.w .LBB0_8
; CHECK-NEXT: @ %bb.1: @ %entry
-; CHECK-NEXT: mov r5, r2
+; CHECK-NEXT: mov r11, r2
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: bne .LBB0_3
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: mov r12, r0
-; CHECK-NEXT: mov r11, r1
-; CHECK-NEXT: mov r10, r5
+; CHECK-NEXT: mov r8, r1
+; CHECK-NEXT: mov r10, r11
; CHECK-NEXT: b .LBB0_6
; CHECK-NEXT: .LBB0_3: @ %vector.ph
; CHECK-NEXT: bic r2, r3, #1
; CHECK-NEXT: adr r4, .LCPI0_0
; CHECK-NEXT: subs r7, r2, #2
; CHECK-NEXT: movs r6, #1
-; CHECK-NEXT: vldrw.u32 q0, [r4]
-; CHECK-NEXT: adr r4, .LCPI0_1
; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: add.w r10, r11, r2, lsl #2
; CHECK-NEXT: add.w lr, r6, r7, lsr #1
; CHECK-NEXT: str r2, [sp] @ 4-byte Spill
-; CHECK-NEXT: add.w r10, r5, r2, lsl #2
-; CHECK-NEXT: add.w r11, r1, r2, lsl #2
+; CHECK-NEXT: add.w r8, r1, r2, lsl #2
; CHECK-NEXT: add.w r12, r0, r2, lsl #2
-; CHECK-NEXT: vldrw.u32 q1, [r4]
+; CHECK-NEXT: vldrw.u32 q0, [r4]
+; CHECK-NEXT: vmvn.i32 q1, #0x80000000
; CHECK-NEXT: .LBB0_4: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldrd r4, r2, [r0], #8
-; CHECK-NEXT: mov.w r3, #-1
-; CHECK-NEXT: ldrd r7, r8, [r1], #8
+; CHECK-NEXT: movs r5, #0
+; CHECK-NEXT: ldrd r7, r6, [r1], #8
; CHECK-NEXT: smull r4, r7, r7, r4
; CHECK-NEXT: asrl r4, r7, #31
; CHECK-NEXT: rsbs.w r9, r4, #-2147483648
-; CHECK-NEXT: sbcs r3, r7
-; CHECK-NEXT: mov r9, r5
+; CHECK-NEXT: mov.w r9, #-1
+; CHECK-NEXT: sbcs.w r3, r9, r7
; CHECK-NEXT: csetm r3, lt
-; CHECK-NEXT: movs r5, #0
; CHECK-NEXT: bfi r5, r3, #0, #8
-; CHECK-NEXT: smull r2, r3, r8, r2
+; CHECK-NEXT: smull r2, r3, r6, r2
; CHECK-NEXT: asrl r2, r3, #31
; CHECK-NEXT: rsbs.w r6, r2, #-2147483648
; CHECK-NEXT: vmov q2[2], q2[0], r4, r2
-; CHECK-NEXT: mov.w r6, #-1
+; CHECK-NEXT: sbcs.w r6, r9, r3
; CHECK-NEXT: vmov q2[3], q2[1], r7, r3
-; CHECK-NEXT: sbcs r6, r3
; CHECK-NEXT: csetm r6, lt
; CHECK-NEXT: bfi r5, r6, #8, #8
-; CHECK-NEXT: mvn r6, #-2147483648
; CHECK-NEXT: vmsr p0, r5
-; CHECK-NEXT: mov r5, r9
+; CHECK-NEXT: mvn r5, #-2147483648
; CHECK-NEXT: vpsel q2, q2, q0
; CHECK-NEXT: vmov r2, r3, d4
-; CHECK-NEXT: subs r2, r2, r6
+; CHECK-NEXT: subs r2, r2, r5
; CHECK-NEXT: sbcs r2, r3, #0
; CHECK-NEXT: mov.w r3, #0
; CHECK-NEXT: csetm r2, lt
; CHECK-NEXT: bfi r3, r2, #0, #8
; CHECK-NEXT: vmov r2, r4, d5
-; CHECK-NEXT: subs r2, r2, r6
+; CHECK-NEXT: subs r2, r2, r5
; CHECK-NEXT: sbcs r2, r4, #0
; CHECK-NEXT: csetm r2, lt
; CHECK-NEXT: bfi r3, r2, #8, #8
@@ -75,8 +71,7 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(ptr nocapture readonly %pSrcA, ptr no
; CHECK-NEXT: vpsel q2, q2, q1
; CHECK-NEXT: vmov r2, s10
; CHECK-NEXT: vmov r3, s8
-; CHECK-NEXT: strd r3, r2, [r5]
-; CHECK-NEXT: add.w r5, r9, #8
+; CHECK-NEXT: strd r3, r2, [r11], #8
; CHECK-NEXT: le lr, .LBB0_4
; CHECK-NEXT: @ %bb.5: @ %middle.block
; CHECK-NEXT: ldrd r2, r3, [sp] @ 8-byte Folded Reload
@@ -90,7 +85,7 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(ptr nocapture readonly %pSrcA, ptr no
; CHECK-NEXT: .LBB0_7: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r2, [r12], #4
-; CHECK-NEXT: ldr r4, [r11], #4
+; CHECK-NEXT: ldr r4, [r8], #4
; CHECK-NEXT: smull r2, r5, r4, r2
; CHECK-NEXT: asrl r2, r5, #31
; CHECK-NEXT: subs r4, r1, r2
@@ -112,11 +107,6 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(ptr nocapture readonly %pSrcA, ptr no
; CHECK-NEXT: .long 4294967295 @ 0xffffffff
; CHECK-NEXT: .long 2147483648 @ 0x80000000
; CHECK-NEXT: .long 4294967295 @ 0xffffffff
-; CHECK-NEXT: .LCPI0_1:
-; CHECK-NEXT: .long 2147483647 @ 0x7fffffff
-; CHECK-NEXT: .long 0 @ 0x0
-; CHECK-NEXT: .long 2147483647 @ 0x7fffffff
-; CHECK-NEXT: .long 0 @ 0x0
entry:
switch i32 %N, label %vector.ph [
i32 0, label %for.cond.cleanup
@@ -613,57 +603,56 @@ define arm_aapcs_vfpcc void @usatmul_2_q31(ptr nocapture readonly %pSrcA, ptr no
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT: .pad #8
-; CHECK-NEXT: sub sp, #8
+; CHECK-NEXT: .pad #4
+; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: beq .LBB3_8
; CHECK-NEXT: @ %bb.1: @ %entry
+; CHECK-NEXT: mov r8, r2
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: bne .LBB3_3
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: movs r7, #0
; CHECK-NEXT: mov r12, r0
-; CHECK-NEXT: mov r10, r1
-; CHECK-NEXT: mov r11, r2
+; CHECK-NEXT: mov r11, r1
+; CHECK-NEXT: mov r2, r8
; CHECK-NEXT: b .LBB3_6
; CHECK-NEXT: .LBB3_3: @ %vector.ph
-; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT: bic r3, r3, #1
-; CHECK-NEXT: subs r7, r3, #2
+; CHECK-NEXT: bic r5, r3, #1
; CHECK-NEXT: movs r6, #1
-; CHECK-NEXT: str r3, [sp] @ 4-byte Spill
-; CHECK-NEXT: add.w r11, r2, r3, lsl #2
+; CHECK-NEXT: subs r7, r5, #2
+; CHECK-NEXT: str r5, [sp] @ 4-byte Spill
+; CHECK-NEXT: add.w r2, r8, r5, lsl #2
+; CHECK-NEXT: add.w r11, r1, r5, lsl #2
; CHECK-NEXT: add.w lr, r6, r7, lsr #1
-; CHECK-NEXT: add.w r10, r1, r3, lsl #2
-; CHECK-NEXT: add.w r12, r0, r3, lsl #2
-; CHECK-NEXT: vmov.i64 q0, #0xffffffff
+; CHECK-NEXT: add.w r12, r0, r5, lsl #2
+; CHECK-NEXT: vmov.i8 q0, #0xff
; CHECK-NEXT: .LBB3_4: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldrd r4, r6, [r0], #8
-; CHECK-NEXT: mov.w r8, #0
-; CHECK-NEXT: ldrd r7, r3, [r1], #8
-; CHECK-NEXT: umull r4, r9, r7, r4
-; CHECK-NEXT: lsrl r4, r9, #31
-; CHECK-NEXT: subs.w r5, r4, #-1
-; CHECK-NEXT: sbcs r5, r9, #0
+; CHECK-NEXT: ldrd r4, r9, [r0], #8
+; CHECK-NEXT: ldrd r5, r10, [r1], #8
+; CHECK-NEXT: umull r4, r5, r5, r4
+; CHECK-NEXT: lsrl r4, r5, #31
+; CHECK-NEXT: subs.w r6, r4, #-1
+; CHECK-NEXT: sbcs r5, r5, #0
+; CHECK-NEXT: mov.w r6, #0
; CHECK-NEXT: csetm r5, lo
-; CHECK-NEXT: bfi r8, r5, #0, #8
-; CHECK-NEXT: umull r6, r5, r3, r6
-; CHECK-NEXT: lsrl r6, r5, #31
-; CHECK-NEXT: subs.w r7, r6, #-1
-; CHECK-NEXT: vmov q1[2], q1[0], r4, r6
-; CHECK-NEXT: sbcs r3, r5, #0
-; CHECK-NEXT: vmov q1[3], q1[1], r9, r5
-; CHECK-NEXT: csetm r3, lo
-; CHECK-NEXT: bfi r8, r3, #8, #8
-; CHECK-NEXT: vmsr p0, r8
+; CHECK-NEXT: bfi r6, r5, #0, #8
+; CHECK-NEXT: umull r10, r5, r10, r9
+; CHECK-NEXT: lsrl r10, r5, #31
+; CHECK-NEXT: subs.w r7, r10, #-1
+; CHECK-NEXT: vmov q1[2], q1[0], r4, r10
+; CHECK-NEXT: sbcs r5, r5, #0
+; CHECK-NEXT: csetm r5, lo
+; CHECK-NEXT: bfi r6, r5, #8, #8
+; CHECK-NEXT: vmsr p0, r6
; CHECK-NEXT: vpsel q1, q1, q0
-; CHECK-NEXT: vmov r3, s6
-; CHECK-NEXT: vmov r4, s4
-; CHECK-NEXT: strd r4, r3, [r2], #8
+; CHECK-NEXT: vmov r4, s6
+; CHECK-NEXT: vmov r5, s4
+; CHECK-NEXT: strd r5, r4, [r8], #8
; CHECK-NEXT: le lr, .LBB3_4
; CHECK-NEXT: @ %bb.5: @ %middle.block
-; CHECK-NEXT: ldrd r7, r3, [sp] @ 8-byte Folded Reload
+; CHECK-NEXT: ldr r7, [sp] @ 4-byte Reload
; CHECK-NEXT: cmp r7, r3
; CHECK-NEXT: beq .LBB3_8
; CHECK-NEXT: .LBB3_6: @ %for.body.preheader
@@ -671,17 +660,17 @@ define arm_aapcs_vfpcc void @usatmul_2_q31(ptr nocapture readonly %pSrcA, ptr no
; CHECK-NEXT: .LBB3_7: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r0, [r12], #4
-; CHECK-NEXT: ldr r1, [r10], #4
+; CHECK-NEXT: ldr r1, [r11], #4
; CHECK-NEXT: umull r0, r1, r1, r0
; CHECK-NEXT: lsrl r0, r1, #31
-; CHECK-NEXT: subs.w r2, r0, #-1
+; CHECK-NEXT: subs.w r3, r0, #-1
; CHECK-NEXT: sbcs r1, r1, #0
; CHECK-NEXT: it hs
; CHECK-NEXT: movhs.w r0, #-1
-; CHECK-NEXT: str r0, [r11], #4
+; CHECK-NEXT: str r0, [r2], #4
; CHECK-NEXT: le lr, .LBB3_7
; CHECK-NEXT: .LBB3_8: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #8
+; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
switch i32 %N, label %vector.ph [
@@ -761,78 +750,69 @@ define arm_aapcs_vfpcc void @usatmul_4_q31(ptr nocapture readonly %pSrcA, ptr no
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
-; CHECK-NEXT: .pad #16
-; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: beq.w .LBB4_8
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT: movs r7, #0
+; CHECK-NEXT: mov.w r8, #0
; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: bhi .LBB4_3
; CHECK-NEXT: @ %bb.2:
-; CHECK-NEXT: mov r10, r1
; CHECK-NEXT: mov r12, r0
-; CHECK-NEXT: mov r1, r2
+; CHECK-NEXT: mov r9, r1
+; CHECK-NEXT: mov r11, r2
; CHECK-NEXT: b .LBB4_6
; CHECK-NEXT: .LBB4_3: @ %vector.ph
-; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT: bic r3, r3, #3
-; CHECK-NEXT: subs r7, r3, #4
+; CHECK-NEXT: bic r8, r3, #3
; CHECK-NEXT: movs r6, #1
-; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT: add.w r10, r1, r3, lsl #2
-; CHECK-NEXT: add.w lr, r6, r7, lsr #2
-; CHECK-NEXT: add.w r7, r2, r3, lsl #2
-; CHECK-NEXT: str r7, [sp] @ 4-byte Spill
-; CHECK-NEXT: add.w r12, r0, r3, lsl #2
+; CHECK-NEXT: sub.w r7, r8, #4
; CHECK-NEXT: vmov.i64 q0, #0xffffffff
+; CHECK-NEXT: add.w r11, r2, r8, lsl #2
+; CHECK-NEXT: add.w r9, r1, r8, lsl #2
+; CHECK-NEXT: add.w lr, r6, r7, lsr #2
+; CHECK-NEXT: add.w r12, r0, r8, lsl #2
; CHECK-NEXT: .LBB4_4: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vldrw.u32 q2, [r1], #16
-; CHECK-NEXT: movs r6, #0
-; CHECK-NEXT: str r2, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: vmov.f32 s12, s6
; CHECK-NEXT: vmov.f32 s14, s7
; CHECK-NEXT: vmov.f32 s16, s10
; CHECK-NEXT: vmov.f32 s18, s11
; CHECK-NEXT: vmullb.u32 q5, q4, q3
; CHECK-NEXT: vmov.f32 s6, s5
-; CHECK-NEXT: vmov r4, r9, d10
-; CHECK-NEXT: lsrl r4, r9, #31
+; CHECK-NEXT: vmov r10, r5, d10
+; CHECK-NEXT: lsrl r10, r5, #31
; CHECK-NEXT: vmov.f32 s10, s9
-; CHECK-NEXT: subs.w r5, r4, #-1
-; CHECK-NEXT: sbcs r5, r9, #0
-; CHECK-NEXT: vmullb.u32 q4, q2, q1
+; CHECK-NEXT: subs.w r6, r10, #-1
+; CHECK-NEXT: sbcs r5, r5, #0
+; CHECK-NEXT: mov.w r6, #0
; CHECK-NEXT: csetm r5, lo
+; CHECK-NEXT: vmullb.u32 q4, q2, q1
; CHECK-NEXT: bfi r6, r5, #0, #8
-; CHECK-NEXT: vmov r8, r5, d11
-; CHECK-NEXT: lsrl r8, r5, #31
-; CHECK-NEXT: subs.w r11, r8, #-1
-; CHECK-NEXT: vmov q3[2], q3[0], r4, r8
-; CHECK-NEXT: sbcs r7, r5, #0
-; CHECK-NEXT: vmov q3[3], q3[1], r9, r5
-; CHECK-NEXT: csetm r7, lo
-; CHECK-NEXT: bfi r6, r7, #8, #8
-; CHECK-NEXT: vmov r4, r7, d8
-; CHECK-NEXT: lsrl r4, r7, #31
+; CHECK-NEXT: vmov r4, r5, d11
+; CHECK-NEXT: lsrl r4, r5, #31
+; CHECK-NEXT: subs.w r7, r4, #-1
+; CHECK-NEXT: vmov q3[2], q3[0], r10, r4
+; CHECK-NEXT: sbcs r5, r5, #0
+; CHECK-NEXT: csetm r5, lo
+; CHECK-NEXT: bfi r6, r5, #8, #8
+; CHECK-NEXT: vmov r10, r5, d8
+; CHECK-NEXT: lsrl r10, r5, #31
; CHECK-NEXT: vmsr p0, r6
-; CHECK-NEXT: subs.w r5, r4, #-1
-; CHECK-NEXT: mov.w r6, #0
-; CHECK-NEXT: sbcs r5, r7, #0
+; CHECK-NEXT: subs.w r6, r10, #-1
; CHECK-NEXT: vpsel q3, q3, q0
+; CHECK-NEXT: sbcs r5, r5, #0
+; CHECK-NEXT: mov.w r6, #0
; CHECK-NEXT: csetm r5, lo
; CHECK-NEXT: bfi r6, r5, #0, #8
-; CHECK-NEXT: vmov r2, r5, d9
-; CHECK-NEXT: lsrl r2, r5, #31
-; CHECK-NEXT: subs.w r3, r2, #-1
-; CHECK-NEXT: vmov q1[2], q1[0], r4, r2
-; CHECK-NEXT: sbcs r3, r5, #0
-; CHECK-NEXT: vmov q1[3], q1[1], r7, r5
-; CHECK-NEXT: csetm r3, lo
-; CHECK-NEXT: bfi r6, r3, #8, #8
+; CHECK-NEXT: vmov r4, r5, d9
+; CHECK-NEXT: lsrl r4, r5, #31
+; CHECK-NEXT: subs.w r7, r4, #-1
+; CHECK-NEXT: vmov q1[2], q1[0], r10, r4
+; CHECK-NEXT: sbcs r5, r5, #0
+; CHECK-NEXT: csetm r5, lo
+; CHECK-NEXT: bfi r6, r5, #8, #8
; CHECK-NEXT: vmsr p0, r6
-; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: vpsel q1, q1, q0
; CHECK-NEXT: vmov.f32 s5, s6
; CHECK-NEXT: vmov.f32 s6, s12
@@ -840,26 +820,23 @@ define arm_aapcs_vfpcc void @usatmul_4_q31(ptr nocapture readonly %pSrcA, ptr no
; CHECK-NEXT: vstrb.8 q1, [r2], #16
; CHECK-NEXT: le lr, .LBB4_4
; CHECK-NEXT: @ %bb.5: @ %middle.block
-; CHECK-NEXT: ldrd r7, r3, [sp, #4] @ 8-byte Folded Reload
-; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
-; CHECK-NEXT: cmp r7, r3
+; CHECK-NEXT: cmp r8, r3
; CHECK-NEXT: beq .LBB4_8
; CHECK-NEXT: .LBB4_6: @ %for.body.preheader21
-; CHECK-NEXT: sub.w lr, r3, r7
+; CHECK-NEXT: sub.w lr, r3, r8
; CHECK-NEXT: .LBB4_7: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r0, [r12], #4
-; CHECK-NEXT: ldr r2, [r10], #4
-; CHECK-NEXT: umull r0, r3, r2, r0
-; CHECK-NEXT: lsrl r0, r3, #31
+; CHECK-NEXT: ldr r1, [r9], #4
+; CHECK-NEXT: umull r0, r1, r1, r0
+; CHECK-NEXT: lsrl r0, r1, #31
; CHECK-NEXT: subs.w r2, r0, #-1
-; CHECK-NEXT: sbcs r2, r3, #0
+; CHECK-NEXT: sbcs r1, r1, #0
; CHECK-NEXT: it hs
; CHECK-NEXT: movhs.w r0, #-1
-; CHECK-NEXT: str r0, [r1], #4
+; CHECK-NEXT: str r0, [r11], #4
; CHECK-NEXT: le lr, .LBB4_7
; CHECK-NEXT: .LBB4_8: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #16
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll
index d49973a674a21..87df13787c6c8 100644
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll
@@ -35,10 +35,15 @@ entry:
define arm_aapcs_vfpcc void @unscaled_v2i8_i8(ptr %base, ptr %offptr, <2 x i8> %input) {
; CHECK-LABEL: unscaled_v2i8_i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: ldrb r2, [r1]
+; CHECK-NEXT: vmov.i32 q1, #0xff
; CHECK-NEXT: ldrb r1, [r1, #1]
-; CHECK-NEXT: strb r3, [r0, r2]
+; CHECK-NEXT: vmov q2[2], q2[0], r2, r1
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: vand q1, q2, q1
+; CHECK-NEXT: vmov r1, s4
+; CHECK-NEXT: strb r2, [r0, r1]
+; CHECK-NEXT: vmov r1, s6
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: strb r2, [r0, r1]
; CHECK-NEXT: bx lr
diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
index 1c19e02b7a5fb..c92c2be2834e7 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
@@ -443,7 +443,7 @@ entry:
define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %b) {
; CHECK-LABEL: add_v2i16_v2i64_sext:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.i64 q2, #0xffff
+; CHECK-NEXT: vmov.i32 q2, #0xffff
; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: vand q1, q1, q2
; CHECK-NEXT: vmov r0, s4
@@ -1361,7 +1361,7 @@ entry:
define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %b) {
; CHECK-LABEL: add_v2i8_v2i64_sext:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.i64 q2, #0xff
+; CHECK-NEXT: vmov.i32 q2, #0xff
; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: vand q1, q1, q2
; CHECK-NEXT: vmov r0, s4
@@ -1868,7 +1868,7 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, <2 x i16> %b,
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
-; CHECK-NEXT: vmov.i64 q2, #0xffff
+; CHECK-NEXT: vmov.i32 q2, #0xffff
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: vand q1, q1, q2
; CHECK-NEXT: vmov r2, s4
@@ -2540,7 +2540,7 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, <2 x i8> %b, i6
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
-; CHECK-NEXT: vmov.i64 q2, #0xff
+; CHECK-NEXT: vmov.i32 q2, #0xff
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: vand q1, q1, q2
; CHECK-NEXT: vmov r2, s4
diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
index bdc1322826e59..9f551836c8b17 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
@@ -393,7 +393,7 @@ entry:
define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %y, <2 x i16> %b) {
; CHECK-LABEL: add_v2i16_v2i64_sext:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.i64 q3, #0xffff
+; CHECK-NEXT: vmov.i32 q3, #0xffff
; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: vand q2, q2, q3
; CHECK-NEXT: vmov r2, s4
@@ -1582,7 +1582,7 @@ entry:
define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %y, <2 x i8> %b) {
; CHECK-LABEL: add_v2i8_v2i64_sext:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.i64 q3, #0xff
+; CHECK-NEXT: vmov.i32 q3, #0xff
; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: vand q2, q2, q3
; CHECK-NEXT: vmov r2, s4
@@ -2015,7 +2015,7 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, <2 x i16> %y,
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
-; CHECK-NEXT: vmov.i64 q3, #0xffff
+; CHECK-NEXT: vmov.i32 q3, #0xffff
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: vand q2, q2, q3
; CHECK-NEXT: vmov r2, s8
@@ -2905,7 +2905,7 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, <2 x i8> %y, <2
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
-; CHECK-NEXT: vmov.i64 q3, #0xff
+; CHECK-NEXT: vmov.i32 q3, #0xff
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: vand q2, q2, q3
; CHECK-NEXT: vmov r2, s8
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
index 5a0a9b7c3c25c..6755ee0b174df 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -5638,10 +5638,7 @@ define <8 x i64> @test_mask_mul_epu32_rmb(<16 x i32> %a, ptr %ptr_b) {
; X86-LABEL: test_mask_mul_epu32_rmb:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT: vpbroadcastd (%eax), %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x08]
-; X86-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0xf5,0x58,0xdb,0x0d,A,A,A,A]
-; X86-NEXT: ## fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
-; X86-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xf4,0xc1]
+; X86-NEXT: vpmuludq (%eax){1to8}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x58,0xf4,0x00]
; X86-NEXT: retl ## encoding: [0xc3]
;
; X64-LABEL: test_mask_mul_epu32_rmb:
@@ -5660,12 +5657,9 @@ define <8 x i64> @test_mask_mul_epu32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64>
; X86-LABEL: test_mask_mul_epu32_rmbk:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT: vpbroadcastd (%eax), %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x10]
-; X86-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0xed,0x58,0xdb,0x15,A,A,A,A]
-; X86-NEXT: ## fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT: vpmuludq %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xf4,0xca]
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT: kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
+; X86-NEXT: vpmuludq (%eax){1to8}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x59,0xf4,0x08]
; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
; X86-NEXT: retl ## encoding: [0xc3]
;
@@ -5687,12 +5681,9 @@ define <8 x i64> @test_mask_mul_epu32_rmbkz(<16 x i32> %a, ptr %ptr_b, i8 %mask)
; X86-LABEL: test_mask_mul_epu32_rmbkz:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT: vpbroadcastd (%eax), %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x08]
-; X86-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0xf5,0x58,0xdb,0x0d,A,A,A,A]
-; X86-NEXT: ## fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xf4,0xc1]
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT: kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
+; X86-NEXT: vpmuludq (%eax){1to8}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xd9,0xf4,0x00]
; X86-NEXT: retl ## encoding: [0xc3]
;
; X64-LABEL: test_mask_mul_epu32_rmbkz:
@@ -7386,10 +7377,7 @@ define <8 x i64> @test_mul_epu32_rmb(<16 x i32> %a, ptr %ptr_b) {
; X86-LABEL: test_mul_epu32_rmb:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT: vpbroadcastd (%eax), %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x08]
-; X86-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0xf5,0x58,0xdb,0x0d,A,A,A,A]
-; X86-NEXT: ## fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
-; X86-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xf4,0xc1]
+; X86-NEXT: vpmuludq (%eax){1to8}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x58,0xf4,0x00]
; X86-NEXT: retl ## encoding: [0xc3]
;
; X64-LABEL: test_mul_epu32_rmb:
@@ -7408,12 +7396,9 @@ define <8 x i64> @test_mul_epu32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %pass
; X86-LABEL: test_mul_epu32_rmbk:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT: vpbroadcastd (%eax), %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x10]
-; X86-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0xed,0x58,0xdb,0x15,A,A,A,A]
-; X86-NEXT: ## fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT: vpmuludq %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xf4,0xca]
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT: kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
+; X86-NEXT: vpmuludq (%eax){1to8}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x59,0xf4,0x08]
; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
; X86-NEXT: retl ## encoding: [0xc3]
;
@@ -7437,12 +7422,9 @@ define <8 x i64> @test_mul_epu32_rmbkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) {
; X86-LABEL: test_mul_epu32_rmbkz:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT: vpbroadcastd (%eax), %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x08]
-; X86-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0xf5,0x58,0xdb,0x0d,A,A,A,A]
-; X86-NEXT: ## fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xf4,0xc1]
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT: kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
+; X86-NEXT: vpmuludq (%eax){1to8}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xd9,0xf4,0x00]
; X86-NEXT: retl ## encoding: [0xc3]
;
; X64-LABEL: test_mul_epu32_rmbkz:
diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
index c8504ed7151fb..99f666072fbbc 100644
--- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
@@ -11548,11 +11548,7 @@ define < 2 x i64> @test_mask_mul_epu32_rmb_128(< 4 x i32> %a, ptr %ptr_b) {
; X86-LABEL: test_mask_mul_epu32_rmb_128:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT: vpbroadcastd (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x08]
-; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; X86-NEXT: vpblendd $10, %xmm2, %xmm1, %xmm1 # encoding: [0xc4,0xe3,0x71,0x02,0xca,0x0a]
-; X86-NEXT: # xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf4,0xc1]
+; X86-NEXT: vpmuludq (%eax){1to2}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfd,0x18,0xf4,0x00]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mask_mul_epu32_rmb_128:
@@ -11571,13 +11567,9 @@ define < 2 x i64> @test_mask_mul_epu32_rmbk_128(< 4 x i32> %a, ptr %ptr_b, < 2 x
; X86-LABEL: test_mask_mul_epu32_rmbk_128:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT: vpbroadcastd (%eax), %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x10]
-; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb]
-; X86-NEXT: vpblendd $10, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x02,0xd3,0x0a]
-; X86-NEXT: # xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3]
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT: vpmuludq %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0xf4,0xca]
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
+; X86-NEXT: vpmuludq (%eax){1to2}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x19,0xf4,0x08]
; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
; X86-NEXT: retl # encoding: [0xc3]
;
@@ -11599,13 +11591,9 @@ define < 2 x i64> @test_mask_mul_epu32_rmbkz_128(< 4 x i32> %a, ptr %ptr_b, i8 %
; X86-LABEL: test_mask_mul_epu32_rmbkz_128:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT: vpbroadcastd (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x08]
-; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; X86-NEXT: vpblendd $10, %xmm2, %xmm1, %xmm1 # encoding: [0xc4,0xe3,0x71,0x02,0xca,0x0a]
-; X86-NEXT: # xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0x89,0xf4,0xc1]
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
+; X86-NEXT: vpmuludq (%eax){1to2}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0x99,0xf4,0x00]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mask_mul_epu32_rmbkz_128:
@@ -11728,11 +11716,7 @@ define < 4 x i64> @test_mask_mul_epu32_rmb_256(< 8 x i32> %a, ptr %ptr_b) {
; X86-LABEL: test_mask_mul_epu32_rmb_256:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT: vpbroadcastd (%eax), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x58,0x08]
-; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; X86-NEXT: vpblendd $170, %ymm2, %ymm1, %ymm1 # encoding: [0xc4,0xe3,0x75,0x02,0xca,0xaa]
-; X86-NEXT: # ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
-; X86-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf4,0xc1]
+; X86-NEXT: vpmuludq (%eax){1to4}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0xfd,0x38,0xf4,0x00]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mask_mul_epu32_rmb_256:
@@ -11751,13 +11735,9 @@ define < 4 x i64> @test_mask_mul_epu32_rmbk_256(< 8 x i32> %a, ptr %ptr_b, < 4 x
; X86-LABEL: test_mask_mul_epu32_rmbk_256:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT: vpbroadcastd (%eax), %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x58,0x10]
-; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb]
-; X86-NEXT: vpblendd $170, %ymm3, %ymm2, %ymm2 # encoding: [0xc4,0xe3,0x6d,0x02,0xd3,0xaa]
-; X86-NEXT: # ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT: vpmuludq %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0xf4,0xca]
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
+; X86-NEXT: vpmuludq (%eax){1to4}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x39,0xf4,0x08]
; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
; X86-NEXT: retl # encoding: [0xc3]
;
@@ -11779,13 +11759,9 @@ define < 4 x i64> @test_mask_mul_epu32_rmbkz_256(< 8 x i32> %a, ptr %ptr_b, i8 %
; X86-LABEL: test_mask_mul_epu32_rmbkz_256:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT: vpbroadcastd (%eax), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x58,0x08]
-; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; X86-NEXT: vpblendd $170, %ymm2, %ymm1, %ymm1 # encoding: [0xc4,0xe3,0x75,0x02,0xca,0xaa]
-; X86-NEXT: # ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0xa9,0xf4,0xc1]
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
+; X86-NEXT: vpmuludq (%eax){1to4}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0xb9,0xf4,0x00]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mask_mul_epu32_rmbkz_256:
diff --git a/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll b/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
index 384a4c8f889ad..6a2550ede2fb7 100644
--- a/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
@@ -2698,9 +2698,8 @@ define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b)
; NoVLX-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $14, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: andl $3, %eax
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -2724,9 +2723,8 @@ define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask_mem(<2 x i64> %__a, ptr %__b) lo
; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $14, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: andl $3, %eax
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -2753,9 +2751,8 @@ define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i64
; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $14, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: andl $3, %eax
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -2784,9 +2781,8 @@ define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $14, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: andl $3, %eax
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -2814,9 +2810,8 @@ define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, ptr %__b)
; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; NoVLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $14, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: andl $3, %eax
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -2843,9 +2838,8 @@ define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <2
; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $14, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: andl $3, %eax
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -7502,9 +7496,8 @@ define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b)
; NoVLX-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $14, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: andl $3, %eax
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -7528,9 +7521,8 @@ define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask_mem(<2 x i64> %__a, ptr %__b) l
; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $14, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: andl $3, %eax
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -7557,9 +7549,8 @@ define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i6
; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $14, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: andl $3, %eax
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -7588,9 +7579,8 @@ define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $14, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: andl $3, %eax
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -7618,9 +7608,8 @@ define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, ptr %__b)
; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; NoVLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $14, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: andl $3, %eax
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -7647,9 +7636,8 @@ define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <
; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $14, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: andl $3, %eax
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -12366,9 +12354,8 @@ define zeroext i4 @test_vpcmpsgeq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b)
; NoVLX-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; NoVLX-NEXT: vpcmpnltq %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $14, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: andl $3, %eax
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -12392,9 +12379,8 @@ define zeroext i4 @test_vpcmpsgeq_v2i1_v4i1_mask_mem(<2 x i64> %__a, ptr %__b) l
; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
; NoVLX-NEXT: vpcmpnltq %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $14, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: andl $3, %eax
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -12421,9 +12407,8 @@ define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i6
; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpnltq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $14, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: andl $3, %eax
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -12452,9 +12437,8 @@ define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpnltq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $14, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: andl $3, %eax
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -12482,9 +12466,8 @@ define zeroext i4 @test_vpcmpsgeq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, ptr %__b)
; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; NoVLX-NEXT: vpcmpnltq (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $14, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: andl $3, %eax
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -12511,9 +12494,8 @@ define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <
; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpnltq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $14, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: andl $3, %eax
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -17250,9 +17232,8 @@ define zeroext i4 @test_vpcmpultq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b)
; NoVLX-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $14, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: andl $3, %eax
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -17276,9 +17257,8 @@ define zeroext i4 @test_vpcmpultq_v2i1_v4i1_mask_mem(<2 x i64> %__a, ptr %__b) l
; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $14, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: andl $3, %eax
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -17305,9 +17285,8 @@ define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i6
; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $14, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: andl $3, %eax
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -17336,9 +17315,8 @@ define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $14, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: andl $3, %eax
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -17366,9 +17344,8 @@ define zeroext i4 @test_vpcmpultq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, ptr %__b)
; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; NoVLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $14, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: andl $3, %eax
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -17395,9 +17372,8 @@ define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <
; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $14, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: andl $3, %eax
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -21081,9 +21057,8 @@ define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b)
; NoVLX-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $14, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: andl $3, %eax
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -21107,9 +21082,8 @@ define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask_mem(<2 x i64> %__a, ptr %__b) l
; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; NoVLX-NEXT: vmovapd (%rdi), %xmm1
; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $14, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: andl $3, %eax
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -21133,9 +21107,8 @@ define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, ptr %__b)
; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; NoVLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $14, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: andl $3, %eax
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -21163,9 +21136,8 @@ define zeroext i4 @test_masked_vcmpoeqpd_v2i1_v4i1_mask(i2 zeroext %__u, <2 x i6
; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $14, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: andl $3, %eax
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -21193,9 +21165,8 @@ define zeroext i4 @test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem(i2 zeroext %__u, <2
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vmovapd (%rsi), %xmm1
; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $14, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: andl $3, %eax
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -21223,9 +21194,8 @@ define zeroext i4 @test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem_b(i2 zeroext %__u, <
; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vcmpeqpd (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $14, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: andl $3, %eax
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
diff --git a/llvm/test/CodeGen/X86/buildvec-widen-dotproduct.ll b/llvm/test/CodeGen/X86/buildvec-widen-dotproduct.ll
index 07c81c6f8b7a4..56957645dd9fc 100644
--- a/llvm/test/CodeGen/X86/buildvec-widen-dotproduct.ll
+++ b/llvm/test/CodeGen/X86/buildvec-widen-dotproduct.ll
@@ -267,8 +267,8 @@ define i64 @dot_ext_v2i8_v2i64(ptr %a, i64 %a_stride, ptr %b) nounwind {
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
; SSE2-NEXT: psrad $24, %xmm1
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE2-NEXT: pmuludq %xmm0, %xmm1
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE2-NEXT: pmuludq %xmm0, %xmm3
@@ -463,15 +463,15 @@ define i64 @dot_ext_v2i32_v2i64(ptr %a, i64 %a_stride, ptr %b) nounwind {
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE2-NEXT: pmuludq %xmm1, %xmm3
; SSE2-NEXT: psllq $32, %xmm3
-; SSE2-NEXT: paddq %xmm0, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: paddq %xmm3, %xmm0
-; SSE2-NEXT: movq %xmm0, %rax
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE2-NEXT: paddq %xmm0, %xmm1
+; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: retq
;
; SSE4-LABEL: dot_ext_v2i32_v2i64:
diff --git a/llvm/test/CodeGen/X86/combine-pmuldq.ll b/llvm/test/CodeGen/X86/combine-pmuldq.ll
index 69c6d8d4cf2d6..da9e4a32f48a3 100644
--- a/llvm/test/CodeGen/X86/combine-pmuldq.ll
+++ b/llvm/test/CodeGen/X86/combine-pmuldq.ll
@@ -44,43 +44,13 @@ define <2 x i64> @combine_shuffle_zext_pmuludq(<4 x i32> %a0, <4 x i32> %a1) {
define <2 x i64> @combine_shuffle_zero_pmuludq(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: combine_shuffle_zero_pmuludq:
; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm2, %xmm2
-; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; SSE-NEXT: pmuludq %xmm2, %xmm0
+; SSE-NEXT: pmuludq %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX1-LABEL: combine_shuffle_zero_pmuludq:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: combine_shuffle_zero_pmuludq:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: combine_shuffle_zero_pmuludq:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQVL-LABEL: combine_shuffle_zero_pmuludq:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512DQVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512DQVL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512DQVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: retq
+; AVX-LABEL: combine_shuffle_zero_pmuludq:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
%2 = shufflevector <4 x i32> %a1, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
%3 = bitcast <4 x i32> %1 to <2 x i64>
@@ -92,13 +62,8 @@ define <2 x i64> @combine_shuffle_zero_pmuludq(<4 x i32> %a0, <4 x i32> %a1) {
define <4 x i64> @combine_shuffle_zero_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) {
; SSE-LABEL: combine_shuffle_zero_pmuludq_256:
; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm4, %xmm4
-; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
-; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
-; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
+; SSE-NEXT: pmuludq %xmm2, %xmm0
; SSE-NEXT: pmuludq %xmm3, %xmm1
-; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
-; SSE-NEXT: pmuludq %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_shuffle_zero_pmuludq_256:
@@ -115,25 +80,16 @@ define <4 x i64> @combine_shuffle_zero_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1)
;
; AVX2-LABEL: combine_shuffle_zero_pmuludq_256:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: combine_shuffle_zero_pmuludq_256:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; AVX512VL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQVL-LABEL: combine_shuffle_zero_pmuludq_256:
; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512DQVL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
-; AVX512DQVL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; AVX512DQVL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: retq
%1 = shufflevector <8 x i32> %a0, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
@@ -164,16 +120,15 @@ define <8 x i64> @combine_zext_pmuludq_256(<8 x i32> %a) {
; AVX1-LABEL: combine_zext_pmuludq_256:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,3,3]
; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [715827883,715827883]
; AVX1-NEXT: # xmm4 = mem[0,0]
-; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
index 906c6eeea04ad..f3d08c9d7f023 100644
--- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -143,18 +143,17 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: subq $120, %rsp
; CHECK-SSE-NEXT: .cfi_def_cfa_offset 128
-; CHECK-SSE-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; CHECK-SSE-NEXT: pslld $23, %xmm2
-; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
-; CHECK-SSE-NEXT: paddd %xmm3, %xmm2
-; CHECK-SSE-NEXT: cvttps2dq %xmm2, %xmm2
-; CHECK-SSE-NEXT: pslld $16, %xmm2
-; CHECK-SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill
-; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-SSE-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; CHECK-SSE-NEXT: pslld $23, %xmm1
+; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216]
+; CHECK-SSE-NEXT: paddd %xmm2, %xmm1
+; CHECK-SSE-NEXT: cvttps2dq %xmm1, %xmm1
+; CHECK-SSE-NEXT: pslld $16, %xmm1
+; CHECK-SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
+; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; CHECK-SSE-NEXT: pslld $23, %xmm0
-; CHECK-SSE-NEXT: paddd %xmm3, %xmm0
+; CHECK-SSE-NEXT: paddd %xmm2, %xmm0
; CHECK-SSE-NEXT: cvttps2dq %xmm0, %xmm0
; CHECK-SSE-NEXT: pslld $16, %xmm0
; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1105,8 +1104,7 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: subq $40, %rsp
-; CHECK-SSE-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; CHECK-SSE-NEXT: pslld $23, %xmm0
; CHECK-SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE-NEXT: cvttps2dq %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
index 3a16c2dd7e801..7001bf7f28071 100644
--- a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
+++ b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
@@ -17,32 +17,32 @@ define <16 x i8> @var_fshl_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt) nou
; GFNISSE-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; GFNISSE-NEXT: pmovzxwd {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; GFNISSE-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; GFNISSE-NEXT: pslld $23, %xmm2
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm7 = [1065353216,1065353216,1065353216,1065353216]
-; GFNISSE-NEXT: paddd %xmm7, %xmm2
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
+; GFNISSE-NEXT: paddd %xmm6, %xmm2
; GFNISSE-NEXT: cvttps2dq %xmm2, %xmm2
-; GFNISSE-NEXT: pslld $23, %xmm6
-; GFNISSE-NEXT: paddd %xmm7, %xmm6
-; GFNISSE-NEXT: cvttps2dq %xmm6, %xmm6
-; GFNISSE-NEXT: packusdw %xmm2, %xmm6
-; GFNISSE-NEXT: movdqa %xmm1, %xmm8
-; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15]
-; GFNISSE-NEXT: pmullw %xmm6, %xmm8
-; GFNISSE-NEXT: psrlw $8, %xmm8
+; GFNISSE-NEXT: pslld $23, %xmm3
+; GFNISSE-NEXT: paddd %xmm6, %xmm3
+; GFNISSE-NEXT: cvttps2dq %xmm3, %xmm3
+; GFNISSE-NEXT: packusdw %xmm2, %xmm3
+; GFNISSE-NEXT: movdqa %xmm1, %xmm7
+; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
+; GFNISSE-NEXT: pmullw %xmm3, %xmm7
+; GFNISSE-NEXT: psrlw $8, %xmm7
; GFNISSE-NEXT: pslld $23, %xmm4
-; GFNISSE-NEXT: paddd %xmm7, %xmm4
+; GFNISSE-NEXT: paddd %xmm6, %xmm4
; GFNISSE-NEXT: cvttps2dq %xmm4, %xmm2
-; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
+; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
; GFNISSE-NEXT: pslld $23, %xmm5
-; GFNISSE-NEXT: paddd %xmm7, %xmm5
+; GFNISSE-NEXT: paddd %xmm6, %xmm5
; GFNISSE-NEXT: cvttps2dq %xmm5, %xmm3
; GFNISSE-NEXT: packusdw %xmm3, %xmm2
; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; GFNISSE-NEXT: pmullw %xmm1, %xmm2
; GFNISSE-NEXT: psrlw $8, %xmm2
-; GFNISSE-NEXT: packuswb %xmm8, %xmm2
+; GFNISSE-NEXT: packuswb %xmm7, %xmm2
; GFNISSE-NEXT: movdqa %xmm2, %xmm0
; GFNISSE-NEXT: retq
;
@@ -50,34 +50,34 @@ define <16 x i8> @var_fshl_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt) nou
; GFNIAVX1: # %bb.0:
; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; GFNIAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; GFNIAVX1-NEXT: vpslld $23, %xmm5, %xmm5
-; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
-; GFNIAVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm5
-; GFNIAVX1-NEXT: vcvttps2dq %xmm5, %xmm5
-; GFNIAVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
+; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4,4,5,5,6,6,7,7]
; GFNIAVX1-NEXT: vpslld $23, %xmm4, %xmm4
-; GFNIAVX1-NEXT: vpaddd %xmm6, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
+; GFNIAVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vcvttps2dq %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; GFNIAVX1-NEXT: vpslld $23, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vpaddd %xmm5, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vcvttps2dq %xmm3, %xmm3
+; GFNIAVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; GFNIAVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; GFNIAVX1-NEXT: vpslld $23, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
; GFNIAVX1-NEXT: vcvttps2dq %xmm4, %xmm4
-; GFNIAVX1-NEXT: vpackusdw %xmm5, %xmm4, %xmm4
-; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; GFNIAVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4
-; GFNIAVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; GFNIAVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; GFNIAVX1-NEXT: vpslld $23, %xmm5, %xmm5
-; GFNIAVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm5
-; GFNIAVX1-NEXT: vcvttps2dq %xmm5, %xmm5
; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; GFNIAVX1-NEXT: vpslld $23, %xmm2, %xmm2
-; GFNIAVX1-NEXT: vpaddd %xmm6, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2
; GFNIAVX1-NEXT: vcvttps2dq %xmm2, %xmm2
-; GFNIAVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2
+; GFNIAVX1-NEXT: vpackusdw %xmm2, %xmm4, %xmm2
; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; GFNIAVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; GFNIAVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; GFNIAVX1-NEXT: vpackuswb %xmm4, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; GFNIAVX1-NEXT: retq
;
; GFNIAVX2-LABEL: var_fshl_v16i8:
@@ -541,7 +541,7 @@ define <32 x i8> @var_fshl_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nou
; GFNISSE-NEXT: movdqa %xmm4, %xmm10
; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm7[8],xmm10[9],xmm7[9],xmm10[10],xmm7[10],xmm10[11],xmm7[11],xmm10[12],xmm7[12],xmm10[13],xmm7[13],xmm10[14],xmm7[14],xmm10[15],xmm7[15]
; GFNISSE-NEXT: pmovzxwd {{.*#+}} xmm11 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
-; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
+; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4,4,5,5,6,6,7,7]
; GFNISSE-NEXT: pslld $23, %xmm10
; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
; GFNISSE-NEXT: paddd %xmm4, %xmm10
@@ -557,7 +557,7 @@ define <32 x i8> @var_fshl_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nou
; GFNISSE-NEXT: pslld $23, %xmm0
; GFNISSE-NEXT: paddd %xmm4, %xmm0
; GFNISSE-NEXT: cvttps2dq %xmm0, %xmm0
-; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
+; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7]
; GFNISSE-NEXT: pslld $23, %xmm9
; GFNISSE-NEXT: paddd %xmm4, %xmm9
; GFNISSE-NEXT: cvttps2dq %xmm9, %xmm9
@@ -570,23 +570,23 @@ define <32 x i8> @var_fshl_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nou
; GFNISSE-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
-; GFNISSE-NEXT: pmovzxwd {{.*#+}} xmm8 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
-; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
+; GFNISSE-NEXT: pmovzxwd {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
+; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
; GFNISSE-NEXT: pslld $23, %xmm5
; GFNISSE-NEXT: paddd %xmm4, %xmm5
; GFNISSE-NEXT: cvttps2dq %xmm5, %xmm5
-; GFNISSE-NEXT: pslld $23, %xmm8
-; GFNISSE-NEXT: paddd %xmm4, %xmm8
-; GFNISSE-NEXT: cvttps2dq %xmm8, %xmm8
-; GFNISSE-NEXT: packusdw %xmm5, %xmm8
+; GFNISSE-NEXT: pslld $23, %xmm7
+; GFNISSE-NEXT: paddd %xmm4, %xmm7
+; GFNISSE-NEXT: cvttps2dq %xmm7, %xmm7
+; GFNISSE-NEXT: packusdw %xmm5, %xmm7
; GFNISSE-NEXT: movdqa %xmm3, %xmm5
; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
-; GFNISSE-NEXT: pmullw %xmm8, %xmm5
+; GFNISSE-NEXT: pmullw %xmm7, %xmm5
; GFNISSE-NEXT: psrlw $8, %xmm5
; GFNISSE-NEXT: pslld $23, %xmm2
; GFNISSE-NEXT: paddd %xmm4, %xmm2
; GFNISSE-NEXT: cvttps2dq %xmm2, %xmm2
-; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
+; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7]
; GFNISSE-NEXT: pslld $23, %xmm6
; GFNISSE-NEXT: paddd %xmm4, %xmm6
; GFNISSE-NEXT: cvttps2dq %xmm6, %xmm4
@@ -601,17 +601,17 @@ define <32 x i8> @var_fshl_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nou
; GFNIAVX1-LABEL: var_fshl_v32i8:
; GFNIAVX1: # %bb.0:
; GFNIAVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; GFNIAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
-; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
-; GFNIAVX1-NEXT: vpslld $23, %xmm4, %xmm7
-; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
-; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm7, %xmm7
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; GFNIAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
+; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4,4,5,5,6,6,7,7]
+; GFNIAVX1-NEXT: vpslld $23, %xmm3, %xmm7
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; GFNIAVX1-NEXT: vpaddd %xmm3, %xmm7, %xmm7
; GFNIAVX1-NEXT: vcvttps2dq %xmm7, %xmm7
; GFNIAVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
; GFNIAVX1-NEXT: vpslld $23, %xmm6, %xmm6
-; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm6, %xmm6
+; GFNIAVX1-NEXT: vpaddd %xmm3, %xmm6, %xmm6
; GFNIAVX1-NEXT: vcvttps2dq %xmm6, %xmm6
; GFNIAVX1-NEXT: vpackusdw %xmm7, %xmm6, %xmm6
; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
@@ -619,48 +619,48 @@ define <32 x i8> @var_fshl_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nou
; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15]
; GFNIAVX1-NEXT: vpmullw %xmm6, %xmm9, %xmm6
; GFNIAVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
-; GFNIAVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
+; GFNIAVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
; GFNIAVX1-NEXT: vpslld $23, %xmm9, %xmm9
-; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm9, %xmm9
+; GFNIAVX1-NEXT: vpaddd %xmm3, %xmm9, %xmm9
; GFNIAVX1-NEXT: vcvttps2dq %xmm9, %xmm9
-; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
-; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
+; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
+; GFNIAVX1-NEXT: vpslld $23, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vcvttps2dq %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpackusdw %xmm4, %xmm9, %xmm4
+; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
+; GFNIAVX1-NEXT: vpmullw %xmm4, %xmm7, %xmm4
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpackuswb %xmm6, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
+; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4,4,5,5,6,6,7,7]
+; GFNIAVX1-NEXT: vpslld $23, %xmm6, %xmm6
+; GFNIAVX1-NEXT: vpaddd %xmm3, %xmm6, %xmm6
+; GFNIAVX1-NEXT: vcvttps2dq %xmm6, %xmm6
+; GFNIAVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
; GFNIAVX1-NEXT: vpslld $23, %xmm5, %xmm5
-; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm5
+; GFNIAVX1-NEXT: vpaddd %xmm3, %xmm5, %xmm5
; GFNIAVX1-NEXT: vcvttps2dq %xmm5, %xmm5
-; GFNIAVX1-NEXT: vpackusdw %xmm5, %xmm9, %xmm5
-; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
-; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm7, %xmm5
+; GFNIAVX1-NEXT: vpackusdw %xmm6, %xmm5, %xmm5
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm6, %xmm5
; GFNIAVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
-; GFNIAVX1-NEXT: vpackuswb %xmm6, %xmm5, %xmm5
-; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
-; GFNIAVX1-NEXT: vpslld $23, %xmm7, %xmm7
-; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm7, %xmm7
-; GFNIAVX1-NEXT: vcvttps2dq %xmm7, %xmm7
-; GFNIAVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
+; GFNIAVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; GFNIAVX1-NEXT: vpslld $23, %xmm6, %xmm6
-; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm6, %xmm6
+; GFNIAVX1-NEXT: vpaddd %xmm3, %xmm6, %xmm6
; GFNIAVX1-NEXT: vcvttps2dq %xmm6, %xmm6
-; GFNIAVX1-NEXT: vpackusdw %xmm7, %xmm6, %xmm6
-; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; GFNIAVX1-NEXT: vpmullw %xmm6, %xmm7, %xmm6
-; GFNIAVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
-; GFNIAVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; GFNIAVX1-NEXT: vpslld $23, %xmm7, %xmm7
-; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm7, %xmm7
-; GFNIAVX1-NEXT: vcvttps2dq %xmm7, %xmm7
; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; GFNIAVX1-NEXT: vpslld $23, %xmm2, %xmm2
-; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; GFNIAVX1-NEXT: vcvttps2dq %xmm2, %xmm2
-; GFNIAVX1-NEXT: vpackusdw %xmm2, %xmm7, %xmm2
+; GFNIAVX1-NEXT: vpackusdw %xmm2, %xmm6, %xmm2
; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; GFNIAVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; GFNIAVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; GFNIAVX1-NEXT: vpackuswb %xmm6, %xmm0, %xmm0
-; GFNIAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; GFNIAVX1-NEXT: vpackuswb %xmm5, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; GFNIAVX1-NEXT: retq
;
; GFNIAVX2-LABEL: var_fshl_v32i8:
@@ -1365,15 +1365,15 @@ define <64 x i8> @var_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou
; GFNISSE: # %bb.0:
; GFNISSE-NEXT: movdqa %xmm1, %xmm8
; GFNISSE-NEXT: movdqa %xmm0, %xmm1
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm10 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm9 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; GFNISSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0
-; GFNISSE-NEXT: pand %xmm10, %xmm0
-; GFNISSE-NEXT: pxor %xmm9, %xmm9
+; GFNISSE-NEXT: pand %xmm9, %xmm0
+; GFNISSE-NEXT: pxor %xmm10, %xmm10
; GFNISSE-NEXT: pmovzxbd {{.*#+}} xmm12 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm13 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15]
+; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15]
; GFNISSE-NEXT: pmovzxwd {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
+; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; GFNISSE-NEXT: pslld $23, %xmm0
; GFNISSE-NEXT: movdqa {{.*#+}} xmm11 = [1065353216,1065353216,1065353216,1065353216]
; GFNISSE-NEXT: paddd %xmm11, %xmm0
@@ -1389,7 +1389,7 @@ define <64 x i8> @var_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou
; GFNISSE-NEXT: pslld $23, %xmm12
; GFNISSE-NEXT: paddd %xmm11, %xmm12
; GFNISSE-NEXT: cvttps2dq %xmm12, %xmm0
-; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7]
+; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4,4,5,5,6,6,7,7]
; GFNISSE-NEXT: pslld $23, %xmm13
; GFNISSE-NEXT: paddd %xmm11, %xmm13
; GFNISSE-NEXT: cvttps2dq %xmm13, %xmm12
@@ -1399,12 +1399,12 @@ define <64 x i8> @var_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou
; GFNISSE-NEXT: psrlw $8, %xmm0
; GFNISSE-NEXT: packuswb %xmm15, %xmm0
; GFNISSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1
-; GFNISSE-NEXT: pand %xmm10, %xmm1
+; GFNISSE-NEXT: pand %xmm9, %xmm1
; GFNISSE-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm12 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
+; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15]
; GFNISSE-NEXT: pmovzxwd {{.*#+}} xmm13 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
+; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; GFNISSE-NEXT: pslld $23, %xmm1
; GFNISSE-NEXT: paddd %xmm11, %xmm1
; GFNISSE-NEXT: cvttps2dq %xmm1, %xmm1
@@ -1419,7 +1419,7 @@ define <64 x i8> @var_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou
; GFNISSE-NEXT: pslld $23, %xmm4
; GFNISSE-NEXT: paddd %xmm11, %xmm4
; GFNISSE-NEXT: cvttps2dq %xmm4, %xmm1
-; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7]
+; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4,4,5,5,6,6,7,7]
; GFNISSE-NEXT: pslld $23, %xmm12
; GFNISSE-NEXT: paddd %xmm11, %xmm12
; GFNISSE-NEXT: cvttps2dq %xmm12, %xmm4
@@ -1429,12 +1429,12 @@ define <64 x i8> @var_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou
; GFNISSE-NEXT: psrlw $8, %xmm1
; GFNISSE-NEXT: packuswb %xmm14, %xmm1
; GFNISSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4
-; GFNISSE-NEXT: pand %xmm10, %xmm4
+; GFNISSE-NEXT: pand %xmm9, %xmm4
; GFNISSE-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm8 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
-; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
+; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
; GFNISSE-NEXT: pmovzxwd {{.*#+}} xmm12 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
-; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
+; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
; GFNISSE-NEXT: pslld $23, %xmm4
; GFNISSE-NEXT: paddd %xmm11, %xmm4
; GFNISSE-NEXT: cvttps2dq %xmm4, %xmm4
@@ -1449,7 +1449,7 @@ define <64 x i8> @var_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou
; GFNISSE-NEXT: pslld $23, %xmm5
; GFNISSE-NEXT: paddd %xmm11, %xmm5
; GFNISSE-NEXT: cvttps2dq %xmm5, %xmm4
-; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
+; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7]
; GFNISSE-NEXT: pslld $23, %xmm8
; GFNISSE-NEXT: paddd %xmm11, %xmm8
; GFNISSE-NEXT: cvttps2dq %xmm8, %xmm5
@@ -1458,15 +1458,15 @@ define <64 x i8> @var_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou
; GFNISSE-NEXT: pmullw %xmm6, %xmm4
; GFNISSE-NEXT: psrlw $8, %xmm4
; GFNISSE-NEXT: packuswb %xmm13, %xmm4
-; GFNISSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm10
-; GFNISSE-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero,xmm10[2],zero,zero,zero,xmm10[3],zero,zero,zero
-; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero,xmm10[4],zero,xmm10[5],zero,xmm10[6],zero,xmm10[7],zero
-; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
-; GFNISSE-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
-; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
-; GFNISSE-NEXT: pslld $23, %xmm10
-; GFNISSE-NEXT: paddd %xmm11, %xmm10
-; GFNISSE-NEXT: cvttps2dq %xmm10, %xmm8
+; GFNISSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm9
+; GFNISSE-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero
+; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero,xmm9[4],zero,xmm9[5],zero,xmm9[6],zero,xmm9[7],zero
+; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15]
+; GFNISSE-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero
+; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7]
+; GFNISSE-NEXT: pslld $23, %xmm9
+; GFNISSE-NEXT: paddd %xmm11, %xmm9
+; GFNISSE-NEXT: cvttps2dq %xmm9, %xmm8
; GFNISSE-NEXT: pslld $23, %xmm5
; GFNISSE-NEXT: paddd %xmm11, %xmm5
; GFNISSE-NEXT: cvttps2dq %xmm5, %xmm5
@@ -1478,7 +1478,7 @@ define <64 x i8> @var_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou
; GFNISSE-NEXT: pslld $23, %xmm2
; GFNISSE-NEXT: paddd %xmm11, %xmm2
; GFNISSE-NEXT: cvttps2dq %xmm2, %xmm5
-; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
+; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7]
; GFNISSE-NEXT: pslld $23, %xmm6
; GFNISSE-NEXT: paddd %xmm11, %xmm6
; GFNISSE-NEXT: cvttps2dq %xmm6, %xmm2
@@ -1496,16 +1496,16 @@ define <64 x i8> @var_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou
; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; GFNIAVX1-NEXT: vandps %ymm7, %ymm4, %ymm8
; GFNIAVX1-NEXT: vextractf128 $1, %ymm8, %xmm9
-; GFNIAVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4
-; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15]
-; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
-; GFNIAVX1-NEXT: vpslld $23, %xmm6, %xmm11
-; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
-; GFNIAVX1-NEXT: vpaddd %xmm6, %xmm11, %xmm11
+; GFNIAVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15]
+; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4,4,5,5,6,6,7,7]
+; GFNIAVX1-NEXT: vpslld $23, %xmm4, %xmm11
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
+; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm11, %xmm11
; GFNIAVX1-NEXT: vcvttps2dq %xmm11, %xmm11
; GFNIAVX1-NEXT: vpmovzxwd {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
; GFNIAVX1-NEXT: vpslld $23, %xmm10, %xmm10
-; GFNIAVX1-NEXT: vpaddd %xmm6, %xmm10, %xmm10
+; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm10, %xmm10
; GFNIAVX1-NEXT: vcvttps2dq %xmm10, %xmm10
; GFNIAVX1-NEXT: vpackusdw %xmm11, %xmm10, %xmm10
; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm11
@@ -1515,26 +1515,26 @@ define <64 x i8> @var_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou
; GFNIAVX1-NEXT: vpsrlw $8, %xmm10, %xmm10
; GFNIAVX1-NEXT: vpmovzxbd {{.*#+}} xmm13 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero
; GFNIAVX1-NEXT: vpslld $23, %xmm13, %xmm13
-; GFNIAVX1-NEXT: vpaddd %xmm6, %xmm13, %xmm13
+; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm13, %xmm13
; GFNIAVX1-NEXT: vcvttps2dq %xmm13, %xmm13
; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero,xmm9[4],zero,xmm9[5],zero,xmm9[6],zero,xmm9[7],zero
-; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
+; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7]
; GFNIAVX1-NEXT: vpslld $23, %xmm9, %xmm9
-; GFNIAVX1-NEXT: vpaddd %xmm6, %xmm9, %xmm9
+; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm9, %xmm9
; GFNIAVX1-NEXT: vcvttps2dq %xmm9, %xmm9
; GFNIAVX1-NEXT: vpackusdw %xmm9, %xmm13, %xmm9
; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
; GFNIAVX1-NEXT: vpmullw %xmm9, %xmm11, %xmm9
; GFNIAVX1-NEXT: vpsrlw $8, %xmm9, %xmm9
; GFNIAVX1-NEXT: vpackuswb %xmm10, %xmm9, %xmm9
-; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15]
-; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15]
+; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm10[4,4,5,5,6,6,7,7]
; GFNIAVX1-NEXT: vpslld $23, %xmm11, %xmm11
-; GFNIAVX1-NEXT: vpaddd %xmm6, %xmm11, %xmm11
+; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm11, %xmm11
; GFNIAVX1-NEXT: vcvttps2dq %xmm11, %xmm11
; GFNIAVX1-NEXT: vpmovzxwd {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
; GFNIAVX1-NEXT: vpslld $23, %xmm10, %xmm10
-; GFNIAVX1-NEXT: vpaddd %xmm6, %xmm10, %xmm10
+; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm10, %xmm10
; GFNIAVX1-NEXT: vcvttps2dq %xmm10, %xmm10
; GFNIAVX1-NEXT: vpackusdw %xmm11, %xmm10, %xmm10
; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
@@ -1542,12 +1542,12 @@ define <64 x i8> @var_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou
; GFNIAVX1-NEXT: vpsrlw $8, %xmm10, %xmm10
; GFNIAVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero
; GFNIAVX1-NEXT: vpslld $23, %xmm11, %xmm11
-; GFNIAVX1-NEXT: vpaddd %xmm6, %xmm11, %xmm11
+; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm11, %xmm11
; GFNIAVX1-NEXT: vcvttps2dq %xmm11, %xmm11
; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero
-; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
+; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7]
; GFNIAVX1-NEXT: vpslld $23, %xmm8, %xmm8
-; GFNIAVX1-NEXT: vpaddd %xmm6, %xmm8, %xmm8
+; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm8, %xmm8
; GFNIAVX1-NEXT: vcvttps2dq %xmm8, %xmm8
; GFNIAVX1-NEXT: vpackusdw %xmm8, %xmm11, %xmm8
; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
@@ -1557,14 +1557,14 @@ define <64 x i8> @var_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0
; GFNIAVX1-NEXT: vandps %ymm7, %ymm5, %ymm2
; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
-; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
+; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4,4,5,5,6,6,7,7]
; GFNIAVX1-NEXT: vpslld $23, %xmm8, %xmm8
-; GFNIAVX1-NEXT: vpaddd %xmm6, %xmm8, %xmm8
+; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm8, %xmm8
; GFNIAVX1-NEXT: vcvttps2dq %xmm8, %xmm8
; GFNIAVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
; GFNIAVX1-NEXT: vpslld $23, %xmm7, %xmm7
-; GFNIAVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm7
+; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm7, %xmm7
; GFNIAVX1-NEXT: vcvttps2dq %xmm7, %xmm7
; GFNIAVX1-NEXT: vpackusdw %xmm8, %xmm7, %xmm7
; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm8
@@ -1574,45 +1574,45 @@ define <64 x i8> @var_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou
; GFNIAVX1-NEXT: vpsrlw $8, %xmm7, %xmm7
; GFNIAVX1-NEXT: vpmovzxbd {{.*#+}} xmm10 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
; GFNIAVX1-NEXT: vpslld $23, %xmm10, %xmm10
-; GFNIAVX1-NEXT: vpaddd %xmm6, %xmm10, %xmm10
+; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm10, %xmm10
; GFNIAVX1-NEXT: vcvttps2dq %xmm10, %xmm10
; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
-; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
; GFNIAVX1-NEXT: vpslld $23, %xmm5, %xmm5
-; GFNIAVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm5
+; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm5
; GFNIAVX1-NEXT: vcvttps2dq %xmm5, %xmm5
; GFNIAVX1-NEXT: vpackusdw %xmm5, %xmm10, %xmm5
; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm8, %xmm5
; GFNIAVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
; GFNIAVX1-NEXT: vpackuswb %xmm7, %xmm5, %xmm5
-; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
-; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
-; GFNIAVX1-NEXT: vpslld $23, %xmm8, %xmm8
-; GFNIAVX1-NEXT: vpaddd %xmm6, %xmm8, %xmm8
-; GFNIAVX1-NEXT: vcvttps2dq %xmm8, %xmm8
-; GFNIAVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15]
+; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4,4,5,5,6,6,7,7]
; GFNIAVX1-NEXT: vpslld $23, %xmm7, %xmm7
-; GFNIAVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm7
+; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm7, %xmm7
+; GFNIAVX1-NEXT: vcvttps2dq %xmm7, %xmm7
+; GFNIAVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
+; GFNIAVX1-NEXT: vpslld $23, %xmm6, %xmm6
+; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm6, %xmm6
+; GFNIAVX1-NEXT: vcvttps2dq %xmm6, %xmm6
+; GFNIAVX1-NEXT: vpackusdw %xmm7, %xmm6, %xmm6
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; GFNIAVX1-NEXT: vpmullw %xmm6, %xmm7, %xmm6
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
+; GFNIAVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; GFNIAVX1-NEXT: vpslld $23, %xmm7, %xmm7
+; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm7, %xmm7
; GFNIAVX1-NEXT: vcvttps2dq %xmm7, %xmm7
-; GFNIAVX1-NEXT: vpackusdw %xmm8, %xmm7, %xmm7
-; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
-; GFNIAVX1-NEXT: vpmullw %xmm7, %xmm8, %xmm7
-; GFNIAVX1-NEXT: vpsrlw $8, %xmm7, %xmm7
-; GFNIAVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; GFNIAVX1-NEXT: vpslld $23, %xmm8, %xmm8
-; GFNIAVX1-NEXT: vpaddd %xmm6, %xmm8, %xmm8
-; GFNIAVX1-NEXT: vcvttps2dq %xmm8, %xmm8
; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; GFNIAVX1-NEXT: vpslld $23, %xmm2, %xmm2
-; GFNIAVX1-NEXT: vpaddd %xmm6, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
; GFNIAVX1-NEXT: vcvttps2dq %xmm2, %xmm2
-; GFNIAVX1-NEXT: vpackusdw %xmm2, %xmm8, %xmm2
+; GFNIAVX1-NEXT: vpackusdw %xmm2, %xmm7, %xmm2
; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; GFNIAVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; GFNIAVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
-; GFNIAVX1-NEXT: vpackuswb %xmm7, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpackuswb %xmm6, %xmm1, %xmm1
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
; GFNIAVX1-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
index 8fd8cd92e9b13..e7089bb8f5a06 100644
--- a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
+++ b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
@@ -542,14 +542,16 @@ define <4 x i1> @vec_4xi32_nonsplat_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
define <4 x i1> @vec_4xi32_nonsplat_undef0_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-SSE2-LABEL: vec_4xi32_nonsplat_undef0_eq:
; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: movl $1, %eax
+; X86-SSE2-NEXT: movd %eax, %xmm2
; X86-SSE2-NEXT: pslld $23, %xmm1
; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [1,1,u,1]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X86-SSE2-NEXT: pand %xmm1, %xmm0
+; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
; X86-SSE2-NEXT: pxor %xmm1, %xmm1
; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; X86-SSE2-NEXT: retl
@@ -565,14 +567,16 @@ define <4 x i1> @vec_4xi32_nonsplat_undef0_eq(<4 x i32> %x, <4 x i32> %y) nounwi
;
; X64-SSE2-LABEL: vec_4xi32_nonsplat_undef0_eq:
; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: movl $1, %eax
+; X64-SSE2-NEXT: movd %eax, %xmm2
; X64-SSE2-NEXT: pslld $23, %xmm1
; X64-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
-; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,1,u,1]
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X64-SSE2-NEXT: pand %xmm1, %xmm0
+; X64-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
; X64-SSE2-NEXT: pxor %xmm1, %xmm1
; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; X64-SSE2-NEXT: retq
@@ -618,14 +622,16 @@ define <4 x i1> @vec_4xi32_nonsplat_undef1_eq(<4 x i32> %x, <4 x i32> %y) nounwi
define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-SSE2-LABEL: vec_4xi32_nonsplat_undef2_eq:
; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: movl $1, %eax
+; X86-SSE2-NEXT: movd %eax, %xmm2
; X86-SSE2-NEXT: pslld $23, %xmm1
; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [1,1,u,1]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X86-SSE2-NEXT: pand %xmm1, %xmm0
+; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
; X86-SSE2-NEXT: pxor %xmm1, %xmm1
; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; X86-SSE2-NEXT: retl
@@ -641,14 +647,16 @@ define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwi
;
; X64-SSE2-LABEL: vec_4xi32_nonsplat_undef2_eq:
; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: movl $1, %eax
+; X64-SSE2-NEXT: movd %eax, %xmm2
; X64-SSE2-NEXT: pslld $23, %xmm1
; X64-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
-; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,1,u,1]
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X64-SSE2-NEXT: pand %xmm1, %xmm0
+; X64-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
; X64-SSE2-NEXT: pxor %xmm1, %xmm1
; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; X64-SSE2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/known-never-zero.ll b/llvm/test/CodeGen/X86/known-never-zero.ll
index f6b8839afb2d7..63336ffa7c6c8 100644
--- a/llvm/test/CodeGen/X86/known-never-zero.ll
+++ b/llvm/test/CodeGen/X86/known-never-zero.ll
@@ -1243,8 +1243,7 @@ define i32 @mul_maybe_zero(i32 %x, i32 %y) {
define i32 @bitcast_known_nonzero(<2 x i16> %xx) {
; X86-LABEL: bitcast_known_nonzero:
; X86: # %bb.0:
-; X86-NEXT: pxor %xmm1, %xmm1
-; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X86-NEXT: pslld $23, %xmm0
; X86-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-NEXT: cvttps2dq %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/known-signbits-shl.ll b/llvm/test/CodeGen/X86/known-signbits-shl.ll
index ccda9a8b6ea7b..57d557dec11b9 100644
--- a/llvm/test/CodeGen/X86/known-signbits-shl.ll
+++ b/llvm/test/CodeGen/X86/known-signbits-shl.ll
@@ -70,8 +70,7 @@ define void @computeNumSignBits_shl_zext_vec_1(<2 x i8> %x, ptr %p) nounwind {
; X64-NEXT: movdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; X64-NEXT: pxor %xmm1, %xmm0
; X64-NEXT: psubb %xmm1, %xmm0
-; X64-NEXT: pxor %xmm1, %xmm1
-; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2048,8192,u,u,u,u,u,u]
; X64-NEXT: movd %xmm0, (%rdi)
; X64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll
index 5368934fa5bf1..45b61155fe626 100644
--- a/llvm/test/CodeGen/X86/known-signbits-vector.ll
+++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll
@@ -192,10 +192,8 @@ define float @signbits_ashr_shl_extract_sitofp(<2 x i64> %a0) nounwind {
; X86-LABEL: signbits_ashr_shl_extract_sitofp:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
-; X86-NEXT: vpsrad $31, %xmm0, %xmm1
; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X86-NEXT: vpsrad $29, %xmm0, %xmm0
-; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; X86-NEXT: vpsllq $20, %xmm0, %xmm0
; X86-NEXT: vcvtdq2ps %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
@@ -203,25 +201,13 @@ define float @signbits_ashr_shl_extract_sitofp(<2 x i64> %a0) nounwind {
; X86-NEXT: popl %eax
; X86-NEXT: retl
;
-; X64-AVX1-LABEL: signbits_ashr_shl_extract_sitofp:
-; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X64-AVX1-NEXT: vpsrad $29, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; X64-AVX1-NEXT: vpsllq $20, %xmm0, %xmm0
-; X64-AVX1-NEXT: vcvtdq2ps %xmm0, %xmm0
-; X64-AVX1-NEXT: retq
-;
-; X64-AVX2-LABEL: signbits_ashr_shl_extract_sitofp:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vpsrad $31, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X64-AVX2-NEXT: vpsrad $29, %xmm0, %xmm0
-; X64-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; X64-AVX2-NEXT: vpsllq $20, %xmm0, %xmm0
-; X64-AVX2-NEXT: vcvtdq2ps %xmm0, %xmm0
-; X64-AVX2-NEXT: retq
+; X64-LABEL: signbits_ashr_shl_extract_sitofp:
+; X64: # %bb.0:
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-NEXT: vpsrad $29, %xmm0, %xmm0
+; X64-NEXT: vpsllq $20, %xmm0, %xmm0
+; X64-NEXT: vcvtdq2ps %xmm0, %xmm0
+; X64-NEXT: retq
%1 = ashr <2 x i64> %a0, <i64 61, i64 60>
%2 = shl <2 x i64> %1, <i64 20, i64 16>
%3 = extractelement <2 x i64> %2, i32 0
@@ -473,10 +459,8 @@ define <4 x float> @signbits_ashr_sext_select_shuffle_sitofp(<4 x i64> %a0, <4 x
;
; X64-AVX2-LABEL: signbits_ashr_sext_select_shuffle_sitofp:
; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vpsrad $31, %ymm2, %ymm4
; X64-AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
; X64-AVX2-NEXT: vpsrad $1, %ymm2, %ymm2
-; X64-AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4],ymm4[5],ymm2[6],ymm4[7]
; X64-AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
; X64-AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm3, %ymm0
diff --git a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
index 1b0fd127a9ffa..53dbeccbb0e26 100644
--- a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
+++ b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
@@ -134,7 +134,7 @@ define <4 x i1> @p5_vector_urem_by_const__nonsplat(<4 x i32> %x, <4 x i32> %y) {
; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,954437177,954437177]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,1,2147483648,1]
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,2147483648,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE2-NEXT: psrlq $32, %xmm1
@@ -149,7 +149,7 @@ define <4 x i1> @p5_vector_urem_by_const__nonsplat(<4 x i32> %x, <4 x i32> %y) {
; SSE4: # %bb.0:
; SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2863311531,3435973837,2863311531,954437177]
-; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [1,1,2147483648,1]
+; SSE4-NEXT: pmovzxdq {{.*#+}} xmm1 = [1,2147483648]
; SSE4-NEXT: pmuludq %xmm0, %xmm1
; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; SSE4-NEXT: psrlq $32, %xmm1
diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll
index 85497833a95f3..ead7110ae5790 100644
--- a/llvm/test/CodeGen/X86/pmulh.ll
+++ b/llvm/test/CodeGen/X86/pmulh.ll
@@ -54,9 +54,9 @@ define <4 x i16> @and_mulhuw_v4i16(<4 x i64> %a, <4 x i64> %b) {
;
; AVX2-LABEL: and_mulhuw_v4i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX2-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
@@ -65,8 +65,6 @@ define <4 x i16> @and_mulhuw_v4i16(<4 x i64> %a, <4 x i64> %b) {
;
; AVX512-LABEL: and_mulhuw_v4i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX512-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
@@ -2381,9 +2379,9 @@ define <4 x i16> @and_mulhuw_v4i16_shift17(<4 x i64> %a, <4 x i64> %b) {
;
; AVX2-LABEL: and_mulhuw_v4i16_shift17:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX2-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
@@ -2393,8 +2391,6 @@ define <4 x i16> @and_mulhuw_v4i16_shift17(<4 x i64> %a, <4 x i64> %b) {
;
; AVX512-LABEL: and_mulhuw_v4i16_shift17:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX512-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vpsrlw $1, %xmm0, %xmm0
@@ -2441,9 +2437,9 @@ define <4 x i16> @and_mulhuw_v4i16_shift24(<4 x i64> %a, <4 x i64> %b) {
;
; AVX2-LABEL: and_mulhuw_v4i16_shift24:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX2-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
@@ -2453,8 +2449,6 @@ define <4 x i16> @and_mulhuw_v4i16_shift24(<4 x i64> %a, <4 x i64> %b) {
;
; AVX512-LABEL: and_mulhuw_v4i16_shift24:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX512-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm0
@@ -2501,9 +2495,9 @@ define <4 x i16> @and_mulhuw_v4i16_shift31(<4 x i64> %a, <4 x i64> %b) {
;
; AVX2-LABEL: and_mulhuw_v4i16_shift31:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX2-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
@@ -2513,8 +2507,6 @@ define <4 x i16> @and_mulhuw_v4i16_shift31(<4 x i64> %a, <4 x i64> %b) {
;
; AVX512-LABEL: and_mulhuw_v4i16_shift31:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX512-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vpsrlw $15, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/pr42727.ll b/llvm/test/CodeGen/X86/pr42727.ll
index f77497853ea16..18e884bee0f3d 100644
--- a/llvm/test/CodeGen/X86/pr42727.ll
+++ b/llvm/test/CodeGen/X86/pr42727.ll
@@ -7,7 +7,7 @@ define void @_ZN14simd_test_avx216c_imm_v256_alignILi1EEE6c_v256S1_S1_(ptr byval
; CHECK-LABEL: _ZN14simd_test_avx216c_imm_v256_alignILi1EEE6c_v256S1_S1_:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vmovdqu {{[0-9]+}}(%esp), %xmm0
-; CHECK-NEXT: vpbroadcastd (%eax), %xmm1
+; CHECK-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT: vpsllq $56, %ymm0, %ymm0
; CHECK-NEXT: vmovdqu %ymm0, (%eax)
diff --git a/llvm/test/CodeGen/X86/rotate-extract-vector.ll b/llvm/test/CodeGen/X86/rotate-extract-vector.ll
index fdf43f5ab11f9..7d0ec6435d863 100644
--- a/llvm/test/CodeGen/X86/rotate-extract-vector.ll
+++ b/llvm/test/CodeGen/X86/rotate-extract-vector.ll
@@ -220,12 +220,10 @@ define <2 x i64> @no_extract_udiv(<2 x i64> %i) nounwind {
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $3, {{[0-9]+}}(%esp)
; X86-NEXT: vmovd %eax, %xmm0
-; X86-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0
; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: calll __udivdi3
; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
-; X86-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0
; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp)
diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll
index 8f344490b66b7..a912301985132 100644
--- a/llvm/test/CodeGen/X86/shrink_vmul.ll
+++ b/llvm/test/CodeGen/X86/shrink_vmul.ll
@@ -1864,9 +1864,11 @@ define void @mul_2xi16_varconst3(ptr nocapture readonly %a, i64 %index) {
; X86-SSE-NEXT: movl c, %edx
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [65536,65536,u,u]
-; X86-SSE-NEXT: psllq $32, %xmm0
-; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
+; X86-SSE-NEXT: movl $65536, %ecx # imm = 0x10000
+; X86-SSE-NEXT: movd %ecx, %xmm1
+; X86-SSE-NEXT: pmuludq %xmm0, %xmm1
+; X86-SSE-NEXT: psllq $32, %xmm1
+; X86-SSE-NEXT: movq %xmm1, (%edx,%eax,4)
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_2xi16_varconst3:
@@ -1885,9 +1887,11 @@ define void @mul_2xi16_varconst3(ptr nocapture readonly %a, i64 %index) {
; X64-SSE-NEXT: movq c(%rip), %rax
; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; X64-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [65536,65536,u,u]
-; X64-SSE-NEXT: psllq $32, %xmm0
-; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
+; X64-SSE-NEXT: movl $65536, %ecx # imm = 0x10000
+; X64-SSE-NEXT: movd %ecx, %xmm1
+; X64-SSE-NEXT: pmuludq %xmm0, %xmm1
+; X64-SSE-NEXT: psllq $32, %xmm1
+; X64-SSE-NEXT: movq %xmm1, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi16_varconst3:
@@ -1923,9 +1927,11 @@ define void @mul_2xi16_varconst4(ptr nocapture readonly %a, i64 %index) {
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-SSE-NEXT: psrad $16, %xmm0
-; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [32768,32768,u,u]
-; X86-SSE-NEXT: psllq $32, %xmm0
-; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
+; X86-SSE-NEXT: movl $32768, %ecx # imm = 0x8000
+; X86-SSE-NEXT: movd %ecx, %xmm1
+; X86-SSE-NEXT: pmuludq %xmm0, %xmm1
+; X86-SSE-NEXT: psllq $32, %xmm1
+; X86-SSE-NEXT: movq %xmm1, (%edx,%eax,4)
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: mul_2xi16_varconst4:
@@ -1945,9 +1951,11 @@ define void @mul_2xi16_varconst4(ptr nocapture readonly %a, i64 %index) {
; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-SSE-NEXT: psrad $16, %xmm0
-; X64-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32768,32768,u,u]
-; X64-SSE-NEXT: psllq $32, %xmm0
-; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
+; X64-SSE-NEXT: movl $32768, %ecx # imm = 0x8000
+; X64-SSE-NEXT: movd %ecx, %xmm1
+; X64-SSE-NEXT: pmuludq %xmm0, %xmm1
+; X64-SSE-NEXT: psllq $32, %xmm1
+; X64-SSE-NEXT: movq %xmm1, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi16_varconst4:
diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
index 00d954c13ace0..3f54ea3c167d1 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
@@ -534,14 +534,14 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,1,3435973837]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,u,1,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,1,268435456,1]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,268435456,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; CHECK-SSE2-NEXT: psrlq $32, %xmm0
@@ -555,7 +555,7 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,1,3435973837]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,268435456,1]
+; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [1,268435456]
; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-SSE41-NEXT: psrlq $32, %xmm1
@@ -570,7 +570,7 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,1,3435973837]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1,1,268435456,1]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1,u,268435456,u]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -611,7 +611,7 @@ define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_even_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,3067833783,1,3067833783]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,u,1,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,3067833783,3067833783,3067833783]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
@@ -1018,7 +1018,7 @@ define <4 x i32> @test_srem_odd_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pand %xmm0, %xmm2
; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm2
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,1,3435973837]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,u,1,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
@@ -1098,7 +1098,7 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_even_INT_MIN:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,3067833783,1,3067833783]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,u,1,u]
; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
@@ -1340,7 +1340,7 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,1,268435456,1]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,268435456,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; CHECK-SSE2-NEXT: psrlq $32, %xmm0
@@ -1354,7 +1354,7 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,0,1,3435973837]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,268435456,1]
+; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [1,268435456]
; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-SSE41-NEXT: psrlq $32, %xmm1
@@ -1369,7 +1369,7 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,0,1,3435973837]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1,1,268435456,1]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1,u,268435456,u]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -2067,13 +2067,12 @@ define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_allones_and_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,0,1,0]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,0,4294967295,0]
-; CHECK-SSE2-NEXT: pand %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,u,1,u]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,1,268435456,1]
-; CHECK-SSE2-NEXT: pand %xmm0, %xmm1
-; CHECK-SSE2-NEXT: psrlq $32, %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,268435456,u]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT: psrlq $32, %xmm1
+; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: por %xmm1, %xmm0
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -2082,9 +2081,9 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
;
; CHECK-SSE41-LABEL: test_srem_odd_allones_and_poweroftwo_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,0,1,0]
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,u,1,u]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,1,268435456,1]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,268435456,u]
; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: psrlq $32, %xmm0
@@ -2097,9 +2096,9 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
;
; CHECK-AVX1-LABEL: test_srem_odd_allones_and_poweroftwo_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,0,1,0]
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,u,1,u]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,1,268435456,1]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,u,268435456,u]
; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
@@ -2139,13 +2138,12 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
define <4 x i32> @test_srem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_even_allones_and_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,0,1,0]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,0,4294967295,0]
-; CHECK-SSE2-NEXT: pand %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,u,1,u]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,1,268435456,1]
-; CHECK-SSE2-NEXT: pand %xmm0, %xmm1
-; CHECK-SSE2-NEXT: psrlq $32, %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,u,268435456,u]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT: psrlq $32, %xmm1
+; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: por %xmm1, %xmm0
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -2154,9 +2152,9 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
;
; CHECK-SSE41-LABEL: test_srem_even_allones_and_poweroftwo_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,0,1,0]
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,u,1,u]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,1,268435456,1]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,u,268435456,u]
; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: psrlq $32, %xmm0
@@ -2169,9 +2167,9 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
;
; CHECK-AVX1-LABEL: test_srem_even_allones_and_poweroftwo_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,0,1,0]
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,u,1,u]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2147483648,1,268435456,1]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2147483648,u,268435456,u]
; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/sshl_sat_vec.ll b/llvm/test/CodeGen/X86/sshl_sat_vec.ll
index 7fe5a472c9fdc..06ebd8220cdd3 100644
--- a/llvm/test/CodeGen/X86/sshl_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/sshl_sat_vec.ll
@@ -276,63 +276,63 @@ define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; X64-LABEL: vec_v8i16:
; X64: # %bb.0:
-; X64-NEXT: pxor %xmm3, %xmm3
; X64-NEXT: movdqa %xmm1, %xmm2
-; X64-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; X64-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; X64-NEXT: pslld $23, %xmm2
-; X64-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
-; X64-NEXT: paddd %xmm4, %xmm2
+; X64-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; X64-NEXT: paddd %xmm3, %xmm2
; X64-NEXT: cvttps2dq %xmm2, %xmm2
; X64-NEXT: pslld $16, %xmm2
; X64-NEXT: psrad $16, %xmm2
-; X64-NEXT: movdqa %xmm1, %xmm5
-; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; X64-NEXT: pslld $23, %xmm5
-; X64-NEXT: paddd %xmm4, %xmm5
-; X64-NEXT: cvttps2dq %xmm5, %xmm4
-; X64-NEXT: pslld $16, %xmm4
-; X64-NEXT: psrad $16, %xmm4
-; X64-NEXT: packssdw %xmm2, %xmm4
-; X64-NEXT: pmullw %xmm0, %xmm4
+; X64-NEXT: movdqa %xmm1, %xmm4
+; X64-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
+; X64-NEXT: pslld $23, %xmm4
+; X64-NEXT: paddd %xmm3, %xmm4
+; X64-NEXT: cvttps2dq %xmm4, %xmm3
+; X64-NEXT: pslld $16, %xmm3
+; X64-NEXT: psrad $16, %xmm3
+; X64-NEXT: packssdw %xmm2, %xmm3
+; X64-NEXT: pmullw %xmm0, %xmm3
; X64-NEXT: psllw $12, %xmm1
; X64-NEXT: movdqa %xmm1, %xmm2
; X64-NEXT: psraw $15, %xmm2
-; X64-NEXT: movdqa %xmm4, %xmm5
-; X64-NEXT: psraw $8, %xmm5
-; X64-NEXT: pand %xmm2, %xmm5
-; X64-NEXT: pandn %xmm4, %xmm2
-; X64-NEXT: por %xmm5, %xmm2
+; X64-NEXT: movdqa %xmm3, %xmm4
+; X64-NEXT: psraw $8, %xmm4
+; X64-NEXT: pand %xmm2, %xmm4
+; X64-NEXT: pandn %xmm3, %xmm2
+; X64-NEXT: por %xmm4, %xmm2
; X64-NEXT: paddw %xmm1, %xmm1
-; X64-NEXT: movdqa %xmm1, %xmm5
-; X64-NEXT: psraw $15, %xmm5
-; X64-NEXT: movdqa %xmm5, %xmm6
-; X64-NEXT: pandn %xmm2, %xmm6
+; X64-NEXT: movdqa %xmm1, %xmm4
+; X64-NEXT: psraw $15, %xmm4
+; X64-NEXT: movdqa %xmm4, %xmm5
+; X64-NEXT: pandn %xmm2, %xmm5
; X64-NEXT: psraw $4, %xmm2
-; X64-NEXT: pand %xmm5, %xmm2
-; X64-NEXT: por %xmm6, %xmm2
+; X64-NEXT: pand %xmm4, %xmm2
+; X64-NEXT: por %xmm5, %xmm2
; X64-NEXT: paddw %xmm1, %xmm1
-; X64-NEXT: movdqa %xmm1, %xmm5
-; X64-NEXT: psraw $15, %xmm5
-; X64-NEXT: movdqa %xmm5, %xmm6
-; X64-NEXT: pandn %xmm2, %xmm6
+; X64-NEXT: movdqa %xmm1, %xmm4
+; X64-NEXT: psraw $15, %xmm4
+; X64-NEXT: movdqa %xmm4, %xmm5
+; X64-NEXT: pandn %xmm2, %xmm5
; X64-NEXT: psraw $2, %xmm2
-; X64-NEXT: pand %xmm5, %xmm2
-; X64-NEXT: por %xmm6, %xmm2
+; X64-NEXT: pand %xmm4, %xmm2
+; X64-NEXT: por %xmm5, %xmm2
; X64-NEXT: paddw %xmm1, %xmm1
; X64-NEXT: psraw $15, %xmm1
-; X64-NEXT: movdqa %xmm1, %xmm5
-; X64-NEXT: pandn %xmm2, %xmm5
+; X64-NEXT: movdqa %xmm1, %xmm4
+; X64-NEXT: pandn %xmm2, %xmm4
; X64-NEXT: psraw $1, %xmm2
; X64-NEXT: pand %xmm1, %xmm2
-; X64-NEXT: por %xmm5, %xmm2
+; X64-NEXT: por %xmm4, %xmm2
; X64-NEXT: pcmpeqw %xmm0, %xmm2
-; X64-NEXT: pand %xmm2, %xmm4
-; X64-NEXT: pcmpgtw %xmm0, %xmm3
-; X64-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; X64-NEXT: pand %xmm2, %xmm3
+; X64-NEXT: pxor %xmm1, %xmm1
+; X64-NEXT: pcmpgtw %xmm0, %xmm1
+; X64-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-NEXT: por %xmm3, %xmm0
+; X64-NEXT: por %xmm1, %xmm0
; X64-NEXT: pandn %xmm0, %xmm2
-; X64-NEXT: por %xmm4, %xmm2
+; X64-NEXT: por %xmm3, %xmm2
; X64-NEXT: movdqa %xmm2, %xmm0
; X64-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
index 6d57af253142a..759055d284d12 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
@@ -136,15 +136,17 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
; SSE2-NEXT: movd %edx, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [683,1463,819,u]
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [683,u,819,u]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: movl $1463, %eax # imm = 0x5B7
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: pmuludq %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2047,2047,2047,2047]
-; SSE2-NEXT: xorps %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3]
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: psrld $1, %xmm0
diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
index 38d290e036963..3ddfb2aa83c75 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
@@ -480,7 +480,7 @@ define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,1,268435456,1]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,268435456,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; CHECK-SSE2-NEXT: psrlq $32, %xmm0
@@ -493,7 +493,7 @@ define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE41-LABEL: test_urem_odd_poweroftwo:
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,1,3435973837]
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,268435456,1]
+; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [1,268435456]
; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-SSE41-NEXT: psrlq $32, %xmm1
@@ -507,7 +507,7 @@ define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-AVX1-LABEL: test_urem_odd_poweroftwo:
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,1,3435973837]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1,1,268435456,1]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1,u,268435456,u]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -922,7 +922,7 @@ define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,1,2,1]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,2,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; CHECK-SSE2-NEXT: psrlq $32, %xmm0
@@ -935,7 +935,7 @@ define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE41-LABEL: test_urem_odd_INT_MIN:
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,1,3435973837]
-; CHECK-SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [1,1,2,1]
+; CHECK-SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [1,2]
; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-SSE41-NEXT: psrlq $32, %xmm1
@@ -949,7 +949,7 @@ define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-AVX1-LABEL: test_urem_odd_INT_MIN:
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,1,3435973837]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1,1,2,1]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1,u,2,u]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -1162,7 +1162,7 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294967295,4294967295,3435973837,3435973837]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,1,268435456,1]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,268435456,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; CHECK-SSE2-NEXT: psrlq $32, %xmm0
@@ -1175,7 +1175,7 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo:
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,4294967295,1,3435973837]
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,268435456,1]
+; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [1,268435456]
; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-SSE41-NEXT: psrlq $32, %xmm1
@@ -1189,7 +1189,7 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-AVX1-LABEL: test_urem_odd_allones_and_poweroftwo:
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,4294967295,1,3435973837]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1,1,268435456,1]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1,u,268435456,u]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -1832,7 +1832,7 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294967295,4294967295,0,0]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,1,268435456,1]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,268435456,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; CHECK-SSE2-NEXT: psrlq $32, %xmm0
@@ -1845,7 +1845,7 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo_and_one:
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,4294967295,1,0]
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,268435456,1]
+; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [1,268435456]
; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-SSE41-NEXT: psrlq $32, %xmm1
@@ -1859,7 +1859,7 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
; CHECK-AVX1-LABEL: test_urem_odd_allones_and_poweroftwo_and_one:
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,4294967295,1,0]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1,1,268435456,1]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1,u,268435456,u]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -1902,7 +1902,7 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294967295,4294967295,0,0]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,1,268435456,1]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,u,268435456,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; CHECK-SSE2-NEXT: psrlq $32, %xmm0
@@ -1915,7 +1915,7 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
; CHECK-SSE41-LABEL: test_urem_even_allones_and_poweroftwo_and_one:
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,4294967295,1,0]
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,1,268435456,1]
+; CHECK-SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = [2147483648,268435456]
; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-SSE41-NEXT: psrlq $32, %xmm1
@@ -1929,7 +1929,7 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
; CHECK-AVX1-LABEL: test_urem_even_allones_and_poweroftwo_and_one:
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,4294967295,1,0]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [2147483648,1,268435456,1]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [2147483648,u,268435456,u]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/ushl_sat_vec.ll b/llvm/test/CodeGen/X86/ushl_sat_vec.ll
index e26ccc1ee471e..b8e83da9cf361 100644
--- a/llvm/test/CodeGen/X86/ushl_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/ushl_sat_vec.ll
@@ -222,23 +222,22 @@ define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; X64-LABEL: vec_v8i16:
; X64: # %bb.0:
-; X64-NEXT: pxor %xmm2, %xmm2
-; X64-NEXT: movdqa %xmm1, %xmm3
-; X64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; X64-NEXT: pslld $23, %xmm3
-; X64-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
-; X64-NEXT: paddd %xmm4, %xmm3
-; X64-NEXT: cvttps2dq %xmm3, %xmm3
-; X64-NEXT: pslld $16, %xmm3
-; X64-NEXT: psrad $16, %xmm3
-; X64-NEXT: movdqa %xmm1, %xmm5
-; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; X64-NEXT: pslld $23, %xmm5
-; X64-NEXT: paddd %xmm4, %xmm5
-; X64-NEXT: cvttps2dq %xmm5, %xmm2
+; X64-NEXT: movdqa %xmm1, %xmm2
+; X64-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
+; X64-NEXT: pslld $23, %xmm2
+; X64-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; X64-NEXT: paddd %xmm3, %xmm2
+; X64-NEXT: cvttps2dq %xmm2, %xmm4
+; X64-NEXT: pslld $16, %xmm4
+; X64-NEXT: psrad $16, %xmm4
+; X64-NEXT: movdqa %xmm1, %xmm2
+; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
+; X64-NEXT: pslld $23, %xmm2
+; X64-NEXT: paddd %xmm3, %xmm2
+; X64-NEXT: cvttps2dq %xmm2, %xmm2
; X64-NEXT: pslld $16, %xmm2
; X64-NEXT: psrad $16, %xmm2
-; X64-NEXT: packssdw %xmm3, %xmm2
+; X64-NEXT: packssdw %xmm4, %xmm2
; X64-NEXT: pmullw %xmm0, %xmm2
; X64-NEXT: psllw $12, %xmm1
; X64-NEXT: movdqa %xmm1, %xmm3
diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll
index 6140932d4a11e..1a1a50689c87f 100644
--- a/llvm/test/CodeGen/X86/vec_smulo.ll
+++ b/llvm/test/CodeGen/X86/vec_smulo.ll
@@ -435,12 +435,12 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSE2-NEXT: movd {{.*#+}} xmm7 = mem[0],zero,zero,zero
; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm1[0]
-; SSE2-NEXT: movd %r9d, %xmm3
-; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE2-NEXT: movd %r9d, %xmm2
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: pxor %xmm5, %xmm5
@@ -467,26 +467,23 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSE2-NEXT: pcmpeqd %xmm5, %xmm5
; SSE2-NEXT: pxor %xmm5, %xmm6
; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm7
-; SSE2-NEXT: pand %xmm3, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE2-NEXT: pand %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm7
+; SSE2-NEXT: pand %xmm2, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT: pand %xmm1, %xmm4
; SSE2-NEXT: paddd %xmm7, %xmm4
-; SSE2-NEXT: pmuludq %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,0,0]
-; SSE2-NEXT: pmuludq %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSE2-NEXT: psubd %xmm4, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE2-NEXT: pmuludq %xmm7, %xmm0
+; SSE2-NEXT: pmuludq %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; SSE2-NEXT: pmuludq %xmm3, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE2-NEXT: psubd %xmm4, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: movq %xmm1, 16(%rcx)
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm5, %xmm1
; SSE2-NEXT: movq %xmm1, 16(%rdi)
; SSE2-NEXT: movdqa %xmm6, (%rdi)
@@ -509,12 +506,12 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSSE3-NEXT: movd {{.*#+}} xmm7 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm1[0]
-; SSSE3-NEXT: movd %r9d, %xmm3
-; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSSE3-NEXT: movd %r9d, %xmm2
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; SSSE3-NEXT: pxor %xmm4, %xmm4
; SSSE3-NEXT: pxor %xmm5, %xmm5
@@ -541,26 +538,23 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5
; SSSE3-NEXT: pxor %xmm5, %xmm6
; SSSE3-NEXT: pxor %xmm7, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm7
-; SSSE3-NEXT: pand %xmm3, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4
-; SSSE3-NEXT: pand %xmm2, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7
+; SSSE3-NEXT: pand %xmm2, %xmm7
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
+; SSSE3-NEXT: pand %xmm1, %xmm4
; SSSE3-NEXT: paddd %xmm7, %xmm4
-; SSSE3-NEXT: pmuludq %xmm3, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,0,0]
-; SSSE3-NEXT: pmuludq %xmm0, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSSE3-NEXT: psubd %xmm4, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSSE3-NEXT: pmuludq %xmm7, %xmm0
+; SSSE3-NEXT: pmuludq %xmm2, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; SSSE3-NEXT: pmuludq %xmm3, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSSE3-NEXT: psubd %xmm4, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT: movq %xmm1, 16(%rcx)
; SSSE3-NEXT: psrad $31, %xmm1
-; SSSE3-NEXT: pcmpeqd %xmm3, %xmm1
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1
; SSSE3-NEXT: pxor %xmm5, %xmm1
; SSSE3-NEXT: movq %xmm1, 16(%rdi)
; SSSE3-NEXT: movdqa %xmm6, (%rdi)
diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll
index 366df4137bbd4..1df40e773246a 100644
--- a/llvm/test/CodeGen/X86/vec_umulo.ll
+++ b/llvm/test/CodeGen/X86/vec_umulo.ll
@@ -366,9 +366,9 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0]
+; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE2-NEXT: movd %r8d, %xmm0
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
@@ -376,42 +376,37 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSE2-NEXT: movd %esi, %xmm3
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
-; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT: movd %r9d, %xmm0
-; SSE2-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
-; SSE2-NEXT: pmuludq %xmm2, %xmm0
+; SSE2-NEXT: movd %r9d, %xmm1
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm4, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm6, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
-; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm7, %xmm7
-; SSE2-NEXT: pxor %xmm7, %xmm2
+; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: pmuludq %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm4, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm5
+; SSE2-NEXT: pxor %xmm5, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,0,0,0]
-; SSE2-NEXT: pmuludq %xmm1, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm4
-; SSE2-NEXT: pxor %xmm7, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE2-NEXT: pmuludq %xmm8, %xmm1
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero
+; SSE2-NEXT: pmuludq %xmm2, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm7
+; SSE2-NEXT: pxor %xmm5, %xmm7
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
; SSE2-NEXT: movq %xmm0, 16(%rcx)
; SSE2-NEXT: movdqa %xmm3, (%rcx)
-; SSE2-NEXT: movq %xmm4, 16(%rdi)
-; SSE2-NEXT: movdqa %xmm2, (%rdi)
+; SSE2-NEXT: movq %xmm7, 16(%rdi)
+; SSE2-NEXT: movdqa %xmm1, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: umulo_v6i32:
@@ -421,9 +416,9 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0]
+; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSSE3-NEXT: movd %r8d, %xmm0
; SSSE3-NEXT: movd %ecx, %xmm1
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
@@ -431,42 +426,37 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSSE3-NEXT: movd %esi, %xmm3
; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
-; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSSE3-NEXT: movd %r9d, %xmm0
-; SSSE3-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
-; SSSE3-NEXT: pmuludq %xmm2, %xmm0
+; SSSE3-NEXT: movd %r9d, %xmm1
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pmuludq %xmm4, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pmuludq %xmm6, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,3,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
-; SSSE3-NEXT: pxor %xmm6, %xmm6
-; SSSE3-NEXT: pcmpeqd %xmm6, %xmm2
-; SSSE3-NEXT: pcmpeqd %xmm7, %xmm7
-; SSSE3-NEXT: pxor %xmm7, %xmm2
+; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT: pmuludq %xmm1, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; SSSE3-NEXT: pmuludq %xmm2, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT: pmuludq %xmm4, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSSE3-NEXT: pxor %xmm4, %xmm4
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm1
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5
+; SSSE3-NEXT: pxor %xmm5, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,0,0,0]
-; SSSE3-NEXT: pmuludq %xmm1, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4
-; SSSE3-NEXT: pxor %xmm7, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSSE3-NEXT: pmuludq %xmm8, %xmm1
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero
+; SSSE3-NEXT: pmuludq %xmm2, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,3,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm7
+; SSSE3-NEXT: pxor %xmm5, %xmm7
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
; SSSE3-NEXT: movq %xmm0, 16(%rcx)
; SSSE3-NEXT: movdqa %xmm3, (%rcx)
-; SSSE3-NEXT: movq %xmm4, 16(%rdi)
-; SSSE3-NEXT: movdqa %xmm2, (%rdi)
+; SSSE3-NEXT: movq %xmm7, 16(%rdi)
+; SSSE3-NEXT: movdqa %xmm1, (%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: umulo_v6i32:
diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll
index fe36fbf626d70..db7255079fb1f 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll
@@ -428,25 +428,24 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; SSE2-NEXT: pslld $23, %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
-; SSE2-NEXT: paddd %xmm6, %xmm5
-; SSE2-NEXT: cvttps2dq %xmm5, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm5, %xmm3
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $23, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
+; SSE2-NEXT: paddd %xmm5, %xmm4
+; SSE2-NEXT: cvttps2dq %xmm4, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm4, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm7, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm6, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE2-NEXT: psrad $16, %xmm3
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pslld $23, %xmm2
-; SSE2-NEXT: paddd %xmm6, %xmm2
+; SSE2-NEXT: paddd %xmm5, %xmm2
; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
@@ -464,19 +463,18 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; SSE41-NEXT: pslld $23, %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
-; SSE41-NEXT: paddd %xmm4, %xmm2
+; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
+; SSE41-NEXT: paddd %xmm5, %xmm2
; SSE41-NEXT: cvttps2dq %xmm2, %xmm2
; SSE41-NEXT: pmulld %xmm3, %xmm2
; SSE41-NEXT: psrld $16, %xmm2
; SSE41-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE41-NEXT: pslld $23, %xmm5
-; SSE41-NEXT: paddd %xmm4, %xmm5
-; SSE41-NEXT: cvttps2dq %xmm5, %xmm0
+; SSE41-NEXT: pslld $23, %xmm4
+; SSE41-NEXT: paddd %xmm5, %xmm4
+; SSE41-NEXT: cvttps2dq %xmm4, %xmm0
; SSE41-NEXT: pmulld %xmm1, %xmm0
; SSE41-NEXT: psrld $16, %xmm0
; SSE41-NEXT: packusdw %xmm2, %xmm0
@@ -486,8 +484,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
; AVX1: # %bb.0:
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
@@ -618,25 +615,24 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE2-NEXT: pxor %xmm4, %xmm4
-; X86-SSE2-NEXT: movdqa %xmm2, %xmm6
-; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
-; X86-SSE2-NEXT: pslld $23, %xmm6
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
-; X86-SSE2-NEXT: paddd %xmm5, %xmm6
-; X86-SSE2-NEXT: cvttps2dq %xmm6, %xmm6
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
-; X86-SSE2-NEXT: pmuludq %xmm6, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
+; X86-SSE2-NEXT: pslld $23, %xmm5
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
+; X86-SSE2-NEXT: paddd %xmm4, %xmm5
+; X86-SSE2-NEXT: cvttps2dq %xmm5, %xmm5
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT: pmuludq %xmm5, %xmm3
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; X86-SSE2-NEXT: pmuludq %xmm7, %xmm6
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
-; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; X86-SSE2-NEXT: pmuludq %xmm6, %xmm5
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
+; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
; X86-SSE2-NEXT: psrad $16, %xmm3
; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; X86-SSE2-NEXT: pslld $23, %xmm2
-; X86-SSE2-NEXT: paddd %xmm5, %xmm2
+; X86-SSE2-NEXT: paddd %xmm4, %xmm2
; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm2
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; X86-SSE2-NEXT: pmuludq %xmm2, %xmm1
@@ -656,47 +652,47 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
; SSE2-LABEL: var_funnnel_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
+; SSE2-NEXT: movdqa %xmm4, %xmm6
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pslld $23, %xmm6
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
-; SSE2-NEXT: paddd %xmm4, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; SSE2-NEXT: paddd %xmm3, %xmm6
; SSE2-NEXT: cvttps2dq %xmm6, %xmm6
; SSE2-NEXT: pslld $16, %xmm6
; SSE2-NEXT: psrad $16, %xmm6
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; SSE2-NEXT: pslld $23, %xmm5
-; SSE2-NEXT: paddd %xmm4, %xmm5
-; SSE2-NEXT: cvttps2dq %xmm5, %xmm7
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $23, %xmm4
+; SSE2-NEXT: paddd %xmm3, %xmm4
+; SSE2-NEXT: cvttps2dq %xmm4, %xmm7
; SSE2-NEXT: pslld $16, %xmm7
; SSE2-NEXT: psrad $16, %xmm7
; SSE2-NEXT: packssdw %xmm6, %xmm7
-; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
-; SSE2-NEXT: pmullw %xmm7, %xmm5
-; SSE2-NEXT: psrlw $8, %xmm5
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; SSE2-NEXT: movdqa %xmm2, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
-; SSE2-NEXT: pslld $23, %xmm6
-; SSE2-NEXT: paddd %xmm4, %xmm6
-; SSE2-NEXT: cvttps2dq %xmm6, %xmm6
-; SSE2-NEXT: pslld $16, %xmm6
-; SSE2-NEXT: psrad $16, %xmm6
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
+; SSE2-NEXT: pmullw %xmm7, %xmm4
+; SSE2-NEXT: psrlw $8, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $23, %xmm5
+; SSE2-NEXT: paddd %xmm3, %xmm5
+; SSE2-NEXT: cvttps2dq %xmm5, %xmm5
+; SSE2-NEXT: pslld $16, %xmm5
+; SSE2-NEXT: psrad $16, %xmm5
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pslld $23, %xmm2
-; SSE2-NEXT: paddd %xmm4, %xmm2
+; SSE2-NEXT: paddd %xmm3, %xmm2
; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
; SSE2-NEXT: pslld $16, %xmm2
; SSE2-NEXT: psrad $16, %xmm2
-; SSE2-NEXT: packssdw %xmm6, %xmm2
+; SSE2-NEXT: packssdw %xmm5, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: pmullw %xmm1, %xmm2
; SSE2-NEXT: psrlw $8, %xmm2
-; SSE2-NEXT: packuswb %xmm5, %xmm2
+; SSE2-NEXT: packuswb %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
@@ -707,32 +703,32 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; SSE41-NEXT: pslld $23, %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [1065353216,1065353216,1065353216,1065353216]
-; SSE41-NEXT: paddd %xmm7, %xmm2
+; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
+; SSE41-NEXT: paddd %xmm6, %xmm2
; SSE41-NEXT: cvttps2dq %xmm2, %xmm2
-; SSE41-NEXT: pslld $23, %xmm6
-; SSE41-NEXT: paddd %xmm7, %xmm6
-; SSE41-NEXT: cvttps2dq %xmm6, %xmm6
-; SSE41-NEXT: packusdw %xmm2, %xmm6
-; SSE41-NEXT: movdqa %xmm1, %xmm8
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15]
-; SSE41-NEXT: pmullw %xmm6, %xmm8
-; SSE41-NEXT: psrlw $8, %xmm8
+; SSE41-NEXT: pslld $23, %xmm3
+; SSE41-NEXT: paddd %xmm6, %xmm3
+; SSE41-NEXT: cvttps2dq %xmm3, %xmm3
+; SSE41-NEXT: packusdw %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm7
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
+; SSE41-NEXT: pmullw %xmm3, %xmm7
+; SSE41-NEXT: psrlw $8, %xmm7
; SSE41-NEXT: pslld $23, %xmm4
-; SSE41-NEXT: paddd %xmm7, %xmm4
+; SSE41-NEXT: paddd %xmm6, %xmm4
; SSE41-NEXT: cvttps2dq %xmm4, %xmm2
-; SSE41-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
; SSE41-NEXT: pslld $23, %xmm5
-; SSE41-NEXT: paddd %xmm7, %xmm5
+; SSE41-NEXT: paddd %xmm6, %xmm5
; SSE41-NEXT: cvttps2dq %xmm5, %xmm3
; SSE41-NEXT: packusdw %xmm3, %xmm2
; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE41-NEXT: pmullw %xmm1, %xmm2
; SSE41-NEXT: psrlw $8, %xmm2
-; SSE41-NEXT: packuswb %xmm8, %xmm2
+; SSE41-NEXT: packuswb %xmm7, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
@@ -740,34 +736,34 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
; AVX1: # %bb.0:
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; AVX1-NEXT: vpslld $23, %xmm5, %xmm5
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
-; AVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm5
-; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
-; AVX1-NEXT: vpaddd %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
+; AVX1-NEXT: vpaddd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
+; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
+; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
-; AVX1-NEXT: vpackusdw %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX1-NEXT: vpslld $23, %xmm5, %xmm5
-; AVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm5
-; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
-; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpackusdw %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_funnnel_v16i8:
@@ -919,47 +915,47 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
; X86-SSE2-LABEL: var_funnnel_v16i8:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE2-NEXT: pxor %xmm3, %xmm3
-; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
-; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
-; X86-SSE2-NEXT: movdqa %xmm5, %xmm6
-; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
+; X86-SSE2-NEXT: pxor %xmm5, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
+; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm6
+; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7]
; X86-SSE2-NEXT: pslld $23, %xmm6
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
-; X86-SSE2-NEXT: paddd %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; X86-SSE2-NEXT: paddd %xmm3, %xmm6
; X86-SSE2-NEXT: cvttps2dq %xmm6, %xmm6
; X86-SSE2-NEXT: pslld $16, %xmm6
; X86-SSE2-NEXT: psrad $16, %xmm6
-; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; X86-SSE2-NEXT: pslld $23, %xmm5
-; X86-SSE2-NEXT: paddd %xmm4, %xmm5
-; X86-SSE2-NEXT: cvttps2dq %xmm5, %xmm7
+; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
+; X86-SSE2-NEXT: pslld $23, %xmm4
+; X86-SSE2-NEXT: paddd %xmm3, %xmm4
+; X86-SSE2-NEXT: cvttps2dq %xmm4, %xmm7
; X86-SSE2-NEXT: pslld $16, %xmm7
; X86-SSE2-NEXT: psrad $16, %xmm7
; X86-SSE2-NEXT: packssdw %xmm6, %xmm7
-; X86-SSE2-NEXT: movdqa %xmm1, %xmm5
-; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
-; X86-SSE2-NEXT: pmullw %xmm7, %xmm5
-; X86-SSE2-NEXT: psrlw $8, %xmm5
-; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; X86-SSE2-NEXT: movdqa %xmm2, %xmm6
-; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
-; X86-SSE2-NEXT: pslld $23, %xmm6
-; X86-SSE2-NEXT: paddd %xmm4, %xmm6
-; X86-SSE2-NEXT: cvttps2dq %xmm6, %xmm6
-; X86-SSE2-NEXT: pslld $16, %xmm6
-; X86-SSE2-NEXT: psrad $16, %xmm6
-; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
+; X86-SSE2-NEXT: pmullw %xmm7, %xmm4
+; X86-SSE2-NEXT: psrlw $8, %xmm4
+; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
+; X86-SSE2-NEXT: pslld $23, %xmm5
+; X86-SSE2-NEXT: paddd %xmm3, %xmm5
+; X86-SSE2-NEXT: cvttps2dq %xmm5, %xmm5
+; X86-SSE2-NEXT: pslld $16, %xmm5
+; X86-SSE2-NEXT: psrad $16, %xmm5
+; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; X86-SSE2-NEXT: pslld $23, %xmm2
-; X86-SSE2-NEXT: paddd %xmm4, %xmm2
+; X86-SSE2-NEXT: paddd %xmm3, %xmm2
; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm2
; X86-SSE2-NEXT: pslld $16, %xmm2
; X86-SSE2-NEXT: psrad $16, %xmm2
-; X86-SSE2-NEXT: packssdw %xmm6, %xmm2
+; X86-SSE2-NEXT: packssdw %xmm5, %xmm2
; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X86-SSE2-NEXT: pmullw %xmm1, %xmm2
; X86-SSE2-NEXT: psrlw $8, %xmm2
-; X86-SSE2-NEXT: packuswb %xmm5, %xmm2
+; X86-SSE2-NEXT: packuswb %xmm4, %xmm2
; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
; X86-SSE2-NEXT: retl
%res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll
index 74e46177b1bf1..2fadf5f101626 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll
@@ -319,33 +319,32 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
-; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
-; AVX1-NEXT: vpslld $23, %xmm8, %xmm8
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm9 = [1065353216,1065353216,1065353216,1065353216]
-; AVX1-NEXT: vpaddd %xmm9, %xmm8, %xmm8
-; AVX1-NEXT: vcvttps2dq %xmm8, %xmm8
-; AVX1-NEXT: vpmulld %xmm8, %xmm5, %xmm5
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpslld $23, %xmm7, %xmm7
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm7
+; AVX1-NEXT: vcvttps2dq %xmm7, %xmm7
+; AVX1-NEXT: vpmulld %xmm7, %xmm5, %xmm5
; AVX1-NEXT: vpsrld $16, %xmm5, %xmm5
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
-; AVX1-NEXT: vpaddd %xmm4, %xmm9, %xmm4
+; AVX1-NEXT: vpaddd %xmm4, %xmm8, %xmm4
; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
; AVX1-NEXT: vpmulld %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsrld $16, %xmm3, %xmm3
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm5, %xmm5
-; AVX1-NEXT: vpaddd %xmm5, %xmm9, %xmm5
+; AVX1-NEXT: vpaddd %xmm5, %xmm8, %xmm5
; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5
; AVX1-NEXT: vpmulld %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpsrld $16, %xmm4, %xmm4
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
-; AVX1-NEXT: vpaddd %xmm1, %xmm9, %xmm1
+; AVX1-NEXT: vpaddd %xmm1, %xmm8, %xmm1
; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
@@ -480,17 +479,17 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
; AVX1-LABEL: var_funnnel_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
-; AVX1-NEXT: vpslld $23, %xmm4, %xmm7
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
-; AVX1-NEXT: vpaddd %xmm4, %xmm7, %xmm7
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpslld $23, %xmm3, %xmm7
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vpaddd %xmm3, %xmm7, %xmm7
; AVX1-NEXT: vcvttps2dq %xmm7, %xmm7
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
; AVX1-NEXT: vpslld $23, %xmm6, %xmm6
-; AVX1-NEXT: vpaddd %xmm4, %xmm6, %xmm6
+; AVX1-NEXT: vpaddd %xmm3, %xmm6, %xmm6
; AVX1-NEXT: vcvttps2dq %xmm6, %xmm6
; AVX1-NEXT: vpackusdw %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
@@ -498,48 +497,48 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15]
; AVX1-NEXT: vpmullw %xmm6, %xmm9, %xmm6
; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
; AVX1-NEXT: vpslld $23, %xmm9, %xmm9
-; AVX1-NEXT: vpaddd %xmm4, %xmm9, %xmm9
+; AVX1-NEXT: vpaddd %xmm3, %xmm9, %xmm9
; AVX1-NEXT: vcvttps2dq %xmm9, %xmm9
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
+; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm4
+; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
+; AVX1-NEXT: vpackusdw %xmm4, %xmm9, %xmm4
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
+; AVX1-NEXT: vpmullw %xmm4, %xmm7, %xmm4
+; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT: vpackuswb %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpslld $23, %xmm6, %xmm6
+; AVX1-NEXT: vpaddd %xmm3, %xmm6, %xmm6
+; AVX1-NEXT: vcvttps2dq %xmm6, %xmm6
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
; AVX1-NEXT: vpslld $23, %xmm5, %xmm5
-; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm5
+; AVX1-NEXT: vpaddd %xmm3, %xmm5, %xmm5
; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5
-; AVX1-NEXT: vpackusdw %xmm5, %xmm9, %xmm5
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
-; AVX1-NEXT: vpmullw %xmm5, %xmm7, %xmm5
+; AVX1-NEXT: vpackusdw %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX1-NEXT: vpmullw %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
-; AVX1-NEXT: vpackuswb %xmm6, %xmm5, %xmm5
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
-; AVX1-NEXT: vpslld $23, %xmm7, %xmm7
-; AVX1-NEXT: vpaddd %xmm4, %xmm7, %xmm7
-; AVX1-NEXT: vcvttps2dq %xmm7, %xmm7
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX1-NEXT: vpslld $23, %xmm6, %xmm6
-; AVX1-NEXT: vpaddd %xmm4, %xmm6, %xmm6
+; AVX1-NEXT: vpaddd %xmm3, %xmm6, %xmm6
; AVX1-NEXT: vcvttps2dq %xmm6, %xmm6
-; AVX1-NEXT: vpackusdw %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX1-NEXT: vpmullw %xmm6, %xmm7, %xmm6
-; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX1-NEXT: vpslld $23, %xmm7, %xmm7
-; AVX1-NEXT: vpaddd %xmm4, %xmm7, %xmm7
-; AVX1-NEXT: vcvttps2dq %xmm7, %xmm7
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
-; AVX1-NEXT: vpackusdw %xmm2, %xmm7, %xmm2
+; AVX1-NEXT: vpackusdw %xmm2, %xmm6, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm6, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; AVX1-NEXT: vpackuswb %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_funnnel_v32i8:
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
index c6698be9bc15c..b0c225dd1ee0e 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
@@ -306,22 +306,21 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind {
; SSE2-LABEL: var_funnnel_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-NEXT: pslld $23, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
-; SSE2-NEXT: paddd %xmm4, %xmm3
-; SSE2-NEXT: cvttps2dq %xmm3, %xmm3
-; SSE2-NEXT: pslld $16, %xmm3
-; SSE2-NEXT: psrad $16, %xmm3
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $23, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; SSE2-NEXT: paddd %xmm3, %xmm2
+; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
+; SSE2-NEXT: pslld $16, %xmm2
+; SSE2-NEXT: psrad $16, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pslld $23, %xmm1
-; SSE2-NEXT: paddd %xmm4, %xmm1
+; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
; SSE2-NEXT: pslld $16, %xmm1
; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: packssdw %xmm3, %xmm1
+; SSE2-NEXT: packssdw %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pmulhuw %xmm1, %xmm2
; SSE2-NEXT: pmullw %xmm1, %xmm0
@@ -331,16 +330,15 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind {
; SSE41-LABEL: var_funnnel_v8i16:
; SSE41: # %bb.0:
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE41-NEXT: pslld $23, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216]
-; SSE41-NEXT: paddd %xmm2, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; SSE41-NEXT: paddd %xmm3, %xmm1
; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE41-NEXT: pslld $23, %xmm3
-; SSE41-NEXT: paddd %xmm2, %xmm3
-; SSE41-NEXT: cvttps2dq %xmm3, %xmm2
+; SSE41-NEXT: pslld $23, %xmm2
+; SSE41-NEXT: paddd %xmm3, %xmm2
+; SSE41-NEXT: cvttps2dq %xmm2, %xmm2
; SSE41-NEXT: packusdw %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pmulhuw %xmm2, %xmm1
@@ -351,8 +349,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind {
; AVX1-LABEL: var_funnnel_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
@@ -456,18 +453,17 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind {
; X86-SSE2-LABEL: var_funnnel_v8i16:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT: pxor %xmm3, %xmm3
; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
-; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; X86-SSE2-NEXT: pslld $23, %xmm2
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
-; X86-SSE2-NEXT: paddd %xmm4, %xmm2
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; X86-SSE2-NEXT: paddd %xmm3, %xmm2
; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm2
; X86-SSE2-NEXT: pslld $16, %xmm2
; X86-SSE2-NEXT: psrad $16, %xmm2
-; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; X86-SSE2-NEXT: pslld $23, %xmm1
-; X86-SSE2-NEXT: paddd %xmm4, %xmm1
+; X86-SSE2-NEXT: paddd %xmm3, %xmm1
; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
; X86-SSE2-NEXT: pslld $16, %xmm1
; X86-SSE2-NEXT: psrad $16, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
index 6c282973dc00e..d24b8d380ef21 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
@@ -234,29 +234,28 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; AVX1-NEXT: vpslld $23, %xmm5, %xmm5
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
-; AVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm5
-; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
-; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vpmulhuw %xmm2, %xmm5, %xmm7
-; AVX1-NEXT: vpmullw %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vpor %xmm7, %xmm2, %xmm2
+; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpmulhuw %xmm2, %xmm4, %xmm6
+; AVX1-NEXT: vpmullw %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpor %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
-; AVX1-NEXT: vpaddd %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpaddd %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
-; AVX1-NEXT: vpaddd %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm3
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index 1f75aedb66492..a5efd829db000 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -460,24 +460,23 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
; SSE2-NEXT: pand %xmm4, %xmm3
; SSE2-NEXT: por %xmm1, %xmm3
; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; SSE2-NEXT: pslld $23, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
-; SSE2-NEXT: paddd %xmm5, %xmm4
-; SSE2-NEXT: cvttps2dq %xmm4, %xmm4
-; SSE2-NEXT: pslld $16, %xmm4
-; SSE2-NEXT: psrad $16, %xmm4
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-NEXT: pslld $23, %xmm2
-; SSE2-NEXT: paddd %xmm5, %xmm2
-; SSE2-NEXT: cvttps2dq %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $23, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
+; SSE2-NEXT: paddd %xmm4, %xmm1
+; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
; SSE2-NEXT: pslld $16, %xmm1
; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: packssdw %xmm4, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $23, %xmm2
+; SSE2-NEXT: paddd %xmm4, %xmm2
+; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
+; SSE2-NEXT: pslld $16, %xmm2
+; SSE2-NEXT: psrad $16, %xmm2
+; SSE2-NEXT: packssdw %xmm1, %xmm2
; SSE2-NEXT: paddw %xmm0, %xmm0
-; SSE2-NEXT: pmullw %xmm1, %xmm0
+; SSE2-NEXT: pmullw %xmm2, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: retq
;
@@ -511,16 +510,15 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
; SSE41-NEXT: pandn %xmm5, %xmm2
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; SSE41-NEXT: pslld $23, %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [1065353216,1065353216,1065353216,1065353216]
-; SSE41-NEXT: paddd %xmm0, %xmm2
+; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
+; SSE41-NEXT: paddd %xmm4, %xmm2
; SSE41-NEXT: cvttps2dq %xmm2, %xmm2
-; SSE41-NEXT: pslld $23, %xmm4
-; SSE41-NEXT: paddd %xmm0, %xmm4
-; SSE41-NEXT: cvttps2dq %xmm4, %xmm0
+; SSE41-NEXT: pslld $23, %xmm0
+; SSE41-NEXT: paddd %xmm4, %xmm0
+; SSE41-NEXT: cvttps2dq %xmm0, %xmm0
; SSE41-NEXT: packusdw %xmm2, %xmm0
; SSE41-NEXT: paddw %xmm3, %xmm3
; SSE41-NEXT: pmullw %xmm0, %xmm3
@@ -547,8 +545,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
; AVX1-NEXT: vpsllw $3, %xmm4, %xmm4
; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
@@ -706,18 +703,17 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
; X86-SSE2-NEXT: pand %xmm4, %xmm3
; X86-SSE2-NEXT: por %xmm1, %xmm3
; X86-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE2-NEXT: pxor %xmm4, %xmm4
; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
-; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; X86-SSE2-NEXT: pslld $23, %xmm1
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
-; X86-SSE2-NEXT: paddd %xmm5, %xmm1
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
+; X86-SSE2-NEXT: paddd %xmm4, %xmm1
; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
; X86-SSE2-NEXT: pslld $16, %xmm1
; X86-SSE2-NEXT: psrad $16, %xmm1
-; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; X86-SSE2-NEXT: pslld $23, %xmm2
-; X86-SSE2-NEXT: paddd %xmm5, %xmm2
+; X86-SSE2-NEXT: paddd %xmm4, %xmm2
; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm2
; X86-SSE2-NEXT: pslld $16, %xmm2
; X86-SSE2-NEXT: psrad $16, %xmm2
diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll
index 3c52efd51381d..217431be10d88 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll
@@ -338,37 +338,36 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
; AVX1-NEXT: vpsllw $3, %xmm5, %xmm5
; AVX1-NEXT: vpblendvb %xmm5, %xmm7, %xmm6, %xmm5
; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm6
-; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
-; AVX1-NEXT: vpslld $23, %xmm4, %xmm8
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpslld $23, %xmm4, %xmm7
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
-; AVX1-NEXT: vpaddd %xmm4, %xmm8, %xmm8
-; AVX1-NEXT: vcvttps2dq %xmm8, %xmm8
+; AVX1-NEXT: vpaddd %xmm4, %xmm7, %xmm7
+; AVX1-NEXT: vcvttps2dq %xmm7, %xmm7
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
; AVX1-NEXT: vpslld $23, %xmm6, %xmm6
; AVX1-NEXT: vpaddd %xmm4, %xmm6, %xmm6
; AVX1-NEXT: vcvttps2dq %xmm6, %xmm6
-; AVX1-NEXT: vpackusdw %xmm8, %xmm6, %xmm6
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm8
-; AVX1-NEXT: vpaddw %xmm8, %xmm8, %xmm8
-; AVX1-NEXT: vpmullw %xmm6, %xmm8, %xmm6
+; AVX1-NEXT: vpackusdw %xmm7, %xmm6, %xmm6
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
+; AVX1-NEXT: vpaddw %xmm7, %xmm7, %xmm7
+; AVX1-NEXT: vpmullw %xmm6, %xmm7, %xmm6
; AVX1-NEXT: vpor %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsllw $12, %xmm2, %xmm6
-; AVX1-NEXT: vpsllw $4, %xmm2, %xmm8
-; AVX1-NEXT: vpor %xmm6, %xmm8, %xmm6
-; AVX1-NEXT: vpaddw %xmm6, %xmm6, %xmm8
-; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm9
-; AVX1-NEXT: vpblendvb %xmm6, %xmm9, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm9
-; AVX1-NEXT: vpblendvb %xmm8, %xmm9, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm8
-; AVX1-NEXT: vpsllw $2, %xmm6, %xmm9
-; AVX1-NEXT: vpblendvb %xmm9, %xmm8, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm8
-; AVX1-NEXT: vpsllw $3, %xmm6, %xmm6
+; AVX1-NEXT: vpsllw $4, %xmm2, %xmm7
+; AVX1-NEXT: vpor %xmm6, %xmm7, %xmm6
+; AVX1-NEXT: vpaddw %xmm6, %xmm6, %xmm7
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm8
; AVX1-NEXT: vpblendvb %xmm6, %xmm8, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm8
+; AVX1-NEXT: vpblendvb %xmm7, %xmm8, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm7
+; AVX1-NEXT: vpsllw $2, %xmm6, %xmm8
+; AVX1-NEXT: vpblendvb %xmm8, %xmm7, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm7
+; AVX1-NEXT: vpsllw $3, %xmm6, %xmm6
+; AVX1-NEXT: vpblendvb %xmm6, %xmm7, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
index 4c271a5ee797c..560a590a1b091 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
@@ -318,21 +318,20 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind {
; SSE2-LABEL: var_funnnel_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: psubw %xmm1, %xmm3
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT: psubw %xmm1, %xmm2
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pslld $23, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
-; SSE2-NEXT: paddd %xmm4, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
; SSE2-NEXT: pslld $16, %xmm1
; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSE2-NEXT: pslld $23, %xmm3
-; SSE2-NEXT: paddd %xmm4, %xmm3
-; SSE2-NEXT: cvttps2dq %xmm3, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $23, %xmm2
+; SSE2-NEXT: paddd %xmm3, %xmm2
+; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
; SSE2-NEXT: pslld $16, %xmm2
; SSE2-NEXT: psrad $16, %xmm2
; SSE2-NEXT: packssdw %xmm1, %xmm2
@@ -345,19 +344,18 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind {
; SSE41-LABEL: var_funnnel_v8i16:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: psubw %xmm1, %xmm3
-; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; SSE41-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE41-NEXT: pslld $23, %xmm3
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216]
-; SSE41-NEXT: paddd %xmm2, %xmm3
-; SSE41-NEXT: cvttps2dq %xmm3, %xmm3
+; SSE41-NEXT: psubw %xmm1, %xmm2
+; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
+; SSE41-NEXT: pslld $23, %xmm2
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; SSE41-NEXT: paddd %xmm3, %xmm2
+; SSE41-NEXT: cvttps2dq %xmm2, %xmm2
; SSE41-NEXT: pslld $23, %xmm1
-; SSE41-NEXT: paddd %xmm2, %xmm1
+; SSE41-NEXT: paddd %xmm3, %xmm1
; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE41-NEXT: packusdw %xmm3, %xmm1
+; SSE41-NEXT: packusdw %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pmulhuw %xmm1, %xmm2
; SSE41-NEXT: pmullw %xmm1, %xmm0
@@ -369,7 +367,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind {
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
@@ -474,21 +472,20 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind {
;
; X86-SSE2-LABEL: var_funnnel_v8i16:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pxor %xmm3, %xmm3
; X86-SSE2-NEXT: pxor %xmm2, %xmm2
; X86-SSE2-NEXT: psubw %xmm1, %xmm2
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
-; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; X86-SSE2-NEXT: pslld $23, %xmm1
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
-; X86-SSE2-NEXT: paddd %xmm4, %xmm1
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; X86-SSE2-NEXT: paddd %xmm3, %xmm1
; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
; X86-SSE2-NEXT: pslld $16, %xmm1
; X86-SSE2-NEXT: psrad $16, %xmm1
-; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; X86-SSE2-NEXT: pslld $23, %xmm2
-; X86-SSE2-NEXT: paddd %xmm4, %xmm2
+; X86-SSE2-NEXT: paddd %xmm3, %xmm2
; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm2
; X86-SSE2-NEXT: pslld $16, %xmm2
; X86-SSE2-NEXT: psrad $16, %xmm2
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
index c0b0446433bd8..3172ecb8ae08f 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
@@ -249,7 +249,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind {
; AVX1-NEXT: vpsubw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm5, %xmm5
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm5
@@ -265,7 +265,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind {
; AVX1-NEXT: vpor %xmm7, %xmm2, %xmm2
; AVX1-NEXT: vpsubw %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
; AVX1-NEXT: vpaddd %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll
index 707536c683621..1b6fb8d499498 100644
--- a/llvm/test/CodeGen/X86/vector-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-mul.ll
@@ -1393,29 +1393,29 @@ define <2 x i64> @mul_v2i64_15_63(<2 x i64> %a0) nounwind {
define <2 x i64> @mul_v2i64_neg_15_63(<2 x i64> %a0) nounwind {
; X86-SSE2-LABEL: mul_v2i64_neg_15_63:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967281,4294967295,4294967233,4294967295]
+; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; X86-SSE2-NEXT: pmuludq %xmm0, %xmm1
; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
-; X86-SSE2-NEXT: psrlq $32, %xmm3
-; X86-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [4294967295,0,4294967295,0]
-; X86-SSE2-NEXT: paddq %xmm3, %xmm0
-; X86-SSE2-NEXT: psllq $32, %xmm0
+; X86-SSE2-NEXT: psrlq $32, %xmm2
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294967281,4294967295,4294967233,4294967295]
+; X86-SSE2-NEXT: pmuludq %xmm3, %xmm2
+; X86-SSE2-NEXT: paddq %xmm1, %xmm2
+; X86-SSE2-NEXT: psllq $32, %xmm2
+; X86-SSE2-NEXT: pmuludq %xmm3, %xmm0
; X86-SSE2-NEXT: paddq %xmm2, %xmm0
; X86-SSE2-NEXT: retl
;
; X86-SSE4-LABEL: mul_v2i64_neg_15_63:
; X86-SSE4: # %bb.0:
-; X86-SSE4-NEXT: pmovsxbq {{.*#+}} xmm1 = [18446744073709551601,18446744073709551553]
+; X86-SSE4-NEXT: pcmpeqd %xmm1, %xmm1
+; X86-SSE4-NEXT: pmuludq %xmm0, %xmm1
; X86-SSE4-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE4-NEXT: pmuludq %xmm1, %xmm2
-; X86-SSE4-NEXT: movdqa %xmm0, %xmm3
-; X86-SSE4-NEXT: psrlq $32, %xmm3
-; X86-SSE4-NEXT: pmuludq %xmm1, %xmm3
-; X86-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [4294967295,0,4294967295,0]
-; X86-SSE4-NEXT: paddq %xmm3, %xmm0
-; X86-SSE4-NEXT: psllq $32, %xmm0
+; X86-SSE4-NEXT: psrlq $32, %xmm2
+; X86-SSE4-NEXT: pmovsxbq {{.*#+}} xmm3 = [18446744073709551601,18446744073709551553]
+; X86-SSE4-NEXT: pmuludq %xmm3, %xmm2
+; X86-SSE4-NEXT: paddq %xmm1, %xmm2
+; X86-SSE4-NEXT: psllq $32, %xmm2
+; X86-SSE4-NEXT: pmuludq %xmm3, %xmm0
; X86-SSE4-NEXT: paddq %xmm2, %xmm0
; X86-SSE4-NEXT: retl
;
@@ -1482,29 +1482,29 @@ define <2 x i64> @mul_v2i64_neg_15_63(<2 x i64> %a0) nounwind {
define <2 x i64> @mul_v2i64_neg_17_65(<2 x i64> %a0) nounwind {
; X86-SSE2-LABEL: mul_v2i64_neg_17_65:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967279,4294967295,4294967231,4294967295]
+; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; X86-SSE2-NEXT: pmuludq %xmm0, %xmm1
; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
-; X86-SSE2-NEXT: psrlq $32, %xmm3
-; X86-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [4294967295,0,4294967295,0]
-; X86-SSE2-NEXT: paddq %xmm3, %xmm0
-; X86-SSE2-NEXT: psllq $32, %xmm0
+; X86-SSE2-NEXT: psrlq $32, %xmm2
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294967279,4294967295,4294967231,4294967295]
+; X86-SSE2-NEXT: pmuludq %xmm3, %xmm2
+; X86-SSE2-NEXT: paddq %xmm1, %xmm2
+; X86-SSE2-NEXT: psllq $32, %xmm2
+; X86-SSE2-NEXT: pmuludq %xmm3, %xmm0
; X86-SSE2-NEXT: paddq %xmm2, %xmm0
; X86-SSE2-NEXT: retl
;
; X86-SSE4-LABEL: mul_v2i64_neg_17_65:
; X86-SSE4: # %bb.0:
-; X86-SSE4-NEXT: pmovsxbq {{.*#+}} xmm1 = [18446744073709551599,18446744073709551551]
+; X86-SSE4-NEXT: pcmpeqd %xmm1, %xmm1
+; X86-SSE4-NEXT: pmuludq %xmm0, %xmm1
; X86-SSE4-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE4-NEXT: pmuludq %xmm1, %xmm2
-; X86-SSE4-NEXT: movdqa %xmm0, %xmm3
-; X86-SSE4-NEXT: psrlq $32, %xmm3
-; X86-SSE4-NEXT: pmuludq %xmm1, %xmm3
-; X86-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [4294967295,0,4294967295,0]
-; X86-SSE4-NEXT: paddq %xmm3, %xmm0
-; X86-SSE4-NEXT: psllq $32, %xmm0
+; X86-SSE4-NEXT: psrlq $32, %xmm2
+; X86-SSE4-NEXT: pmovsxbq {{.*#+}} xmm3 = [18446744073709551599,18446744073709551551]
+; X86-SSE4-NEXT: pmuludq %xmm3, %xmm2
+; X86-SSE4-NEXT: paddq %xmm1, %xmm2
+; X86-SSE4-NEXT: psllq $32, %xmm2
+; X86-SSE4-NEXT: pmuludq %xmm3, %xmm0
; X86-SSE4-NEXT: paddq %xmm2, %xmm0
; X86-SSE4-NEXT: retl
;
@@ -1600,7 +1600,7 @@ define <2 x i64> @mul_v2i64_neg_0_1(<2 x i64> %a0) nounwind {
; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
; X86-SSE2-NEXT: psrlq $32, %xmm3
; X86-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,0,0,0,0,0,0,255,255,255,255,0,0,0,0]
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,0,0,u,u,u,u,255,255,255,255,u,u,u,u]
; X86-SSE2-NEXT: paddq %xmm3, %xmm0
; X86-SSE2-NEXT: psllq $32, %xmm0
; X86-SSE2-NEXT: paddq %xmm2, %xmm0
@@ -1614,7 +1614,7 @@ define <2 x i64> @mul_v2i64_neg_0_1(<2 x i64> %a0) nounwind {
; X86-SSE4-NEXT: movdqa %xmm0, %xmm3
; X86-SSE4-NEXT: psrlq $32, %xmm3
; X86-SSE4-NEXT: pmuludq %xmm1, %xmm3
-; X86-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,0,0,0,0,0,0,255,255,255,255,0,0,0,0]
+; X86-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,0,0,u,u,u,u,255,255,255,255,u,u,u,u]
; X86-SSE4-NEXT: paddq %xmm3, %xmm0
; X86-SSE4-NEXT: psllq $32, %xmm0
; X86-SSE4-NEXT: paddq %xmm2, %xmm0
@@ -1689,7 +1689,7 @@ define <2 x i64> @mul_v2i64_15_neg_63(<2 x i64> %a0) nounwind {
; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
; X86-SSE2-NEXT: psrlq $32, %xmm3
; X86-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,0,0,0,0,0,0,255,255,255,255,0,0,0,0]
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,0,0,u,u,u,u,255,255,255,255,u,u,u,u]
; X86-SSE2-NEXT: paddq %xmm3, %xmm0
; X86-SSE2-NEXT: psllq $32, %xmm0
; X86-SSE2-NEXT: paddq %xmm2, %xmm0
@@ -1703,7 +1703,7 @@ define <2 x i64> @mul_v2i64_15_neg_63(<2 x i64> %a0) nounwind {
; X86-SSE4-NEXT: movdqa %xmm0, %xmm3
; X86-SSE4-NEXT: psrlq $32, %xmm3
; X86-SSE4-NEXT: pmuludq %xmm1, %xmm3
-; X86-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,0,0,0,0,0,0,255,255,255,255,0,0,0,0]
+; X86-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,0,0,u,u,u,u,255,255,255,255,u,u,u,u]
; X86-SSE4-NEXT: paddq %xmm3, %xmm0
; X86-SSE4-NEXT: psllq $32, %xmm0
; X86-SSE4-NEXT: paddq %xmm2, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll
index dd9594b11d96b..764b815f539ff 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-128.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll
@@ -236,22 +236,21 @@ define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-LABEL: var_rotate_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-NEXT: pslld $23, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
-; SSE2-NEXT: paddd %xmm4, %xmm3
-; SSE2-NEXT: cvttps2dq %xmm3, %xmm3
-; SSE2-NEXT: pslld $16, %xmm3
-; SSE2-NEXT: psrad $16, %xmm3
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $23, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; SSE2-NEXT: paddd %xmm3, %xmm2
+; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
+; SSE2-NEXT: pslld $16, %xmm2
+; SSE2-NEXT: psrad $16, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pslld $23, %xmm1
-; SSE2-NEXT: paddd %xmm4, %xmm1
+; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
; SSE2-NEXT: pslld $16, %xmm1
; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: packssdw %xmm3, %xmm1
+; SSE2-NEXT: packssdw %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pmulhuw %xmm1, %xmm2
; SSE2-NEXT: pmullw %xmm1, %xmm0
@@ -261,16 +260,15 @@ define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE41-LABEL: var_rotate_v8i16:
; SSE41: # %bb.0:
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE41-NEXT: pslld $23, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216]
-; SSE41-NEXT: paddd %xmm2, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; SSE41-NEXT: paddd %xmm3, %xmm1
; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE41-NEXT: pslld $23, %xmm3
-; SSE41-NEXT: paddd %xmm2, %xmm3
-; SSE41-NEXT: cvttps2dq %xmm3, %xmm2
+; SSE41-NEXT: pslld $23, %xmm2
+; SSE41-NEXT: paddd %xmm3, %xmm2
+; SSE41-NEXT: cvttps2dq %xmm2, %xmm2
; SSE41-NEXT: packusdw %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pmulhuw %xmm2, %xmm1
@@ -281,8 +279,7 @@ define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; AVX1-LABEL: var_rotate_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
@@ -386,18 +383,17 @@ define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; X86-SSE2-LABEL: var_rotate_v8i16:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT: pxor %xmm3, %xmm3
; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
-; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; X86-SSE2-NEXT: pslld $23, %xmm2
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
-; X86-SSE2-NEXT: paddd %xmm4, %xmm2
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; X86-SSE2-NEXT: paddd %xmm3, %xmm2
; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm2
; X86-SSE2-NEXT: pslld $16, %xmm2
; X86-SSE2-NEXT: psrad $16, %xmm2
-; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; X86-SSE2-NEXT: pslld $23, %xmm1
-; X86-SSE2-NEXT: paddd %xmm4, %xmm1
+; X86-SSE2-NEXT: paddd %xmm3, %xmm1
; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
; X86-SSE2-NEXT: pslld $16, %xmm1
; X86-SSE2-NEXT: psrad $16, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll
index a183b0c940805..c112538f26131 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-256.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll
@@ -175,29 +175,28 @@ define <16 x i16> @var_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; AVX1-NEXT: vpslld $23, %xmm5, %xmm5
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
-; AVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm5
-; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
-; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vpmulhuw %xmm2, %xmm5, %xmm7
-; AVX1-NEXT: vpmullw %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vpor %xmm7, %xmm2, %xmm2
+; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpmulhuw %xmm2, %xmm4, %xmm6
+; AVX1-NEXT: vpmullw %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpor %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
-; AVX1-NEXT: vpaddd %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpaddd %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
-; AVX1-NEXT: vpaddd %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm3
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
index c1da6c094967a..9cb1a982519c7 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -157,45 +157,42 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-LABEL: var_shift_v8i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-NEXT: pslld $23, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
-; SSE2-NEXT: paddd %xmm4, %xmm3
-; SSE2-NEXT: cvttps2dq %xmm3, %xmm3
-; SSE2-NEXT: pslld $16, %xmm3
-; SSE2-NEXT: psrad $16, %xmm3
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $23, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; SSE2-NEXT: paddd %xmm3, %xmm2
+; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
+; SSE2-NEXT: pslld $16, %xmm2
+; SSE2-NEXT: psrad $16, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pslld $23, %xmm1
-; SSE2-NEXT: paddd %xmm4, %xmm1
+; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
; SSE2-NEXT: pslld $16, %xmm1
; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: packssdw %xmm3, %xmm1
+; SSE2-NEXT: packssdw %xmm2, %xmm1
; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v8i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE41-NEXT: pslld $23, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216]
-; SSE41-NEXT: paddd %xmm2, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; SSE41-NEXT: paddd %xmm3, %xmm1
; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE41-NEXT: pslld $23, %xmm3
-; SSE41-NEXT: paddd %xmm2, %xmm3
-; SSE41-NEXT: cvttps2dq %xmm3, %xmm2
+; SSE41-NEXT: pslld $23, %xmm2
+; SSE41-NEXT: paddd %xmm3, %xmm2
+; SSE41-NEXT: cvttps2dq %xmm2, %xmm2
; SSE41-NEXT: packusdw %xmm1, %xmm2
; SSE41-NEXT: pmullw %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
@@ -259,18 +256,17 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
;
; X86-SSE-LABEL: var_shift_v8i16:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: pxor %xmm3, %xmm3
; X86-SSE-NEXT: movdqa %xmm1, %xmm2
-; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: pslld $23, %xmm2
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
-; X86-SSE-NEXT: paddd %xmm4, %xmm2
+; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; X86-SSE-NEXT: paddd %xmm3, %xmm2
; X86-SSE-NEXT: cvttps2dq %xmm2, %xmm2
; X86-SSE-NEXT: pslld $16, %xmm2
; X86-SSE-NEXT: psrad $16, %xmm2
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; X86-SSE-NEXT: pslld $23, %xmm1
-; X86-SSE-NEXT: paddd %xmm4, %xmm1
+; X86-SSE-NEXT: paddd %xmm3, %xmm1
; X86-SSE-NEXT: cvttps2dq %xmm1, %xmm1
; X86-SSE-NEXT: pslld $16, %xmm1
; X86-SSE-NEXT: psrad $16, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
index 986efd2943fc1..f9ccd1e8ca156 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
@@ -158,26 +158,25 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: var_shift_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
-; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
-; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpmullw %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
-; AVX1-NEXT: vpaddd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
-; AVX1-NEXT: vpaddd %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -248,20 +247,19 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; X86-AVX1-LABEL: var_shift_v16i16:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; X86-AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; X86-AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; X86-AVX1-NEXT: vpslld $23, %xmm2, %xmm5
+; X86-AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4,4,5,5,6,6,7,7]
+; X86-AVX1-NEXT: vpslld $23, %xmm2, %xmm4
; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216]
-; X86-AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm5
-; X86-AVX1-NEXT: vcvttps2dq %xmm5, %xmm5
+; X86-AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm4
+; X86-AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
; X86-AVX1-NEXT: vpslld $23, %xmm3, %xmm3
; X86-AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm3
; X86-AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
-; X86-AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
-; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; X86-AVX1-NEXT: vpmullw %xmm3, %xmm5, %xmm3
-; X86-AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; X86-AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X86-AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3
+; X86-AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm1[4,4,5,5,6,6,7,7]
; X86-AVX1-NEXT: vpslld $23, %xmm4, %xmm4
; X86-AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm4
; X86-AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
index 3996d7f09f01c..d245bdca6ee29 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
@@ -93,45 +93,42 @@ define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
; SSE2-LABEL: var_shift_v4i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-NEXT: pslld $23, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
-; SSE2-NEXT: paddd %xmm4, %xmm3
-; SSE2-NEXT: cvttps2dq %xmm3, %xmm3
-; SSE2-NEXT: pslld $16, %xmm3
-; SSE2-NEXT: psrad $16, %xmm3
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $23, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; SSE2-NEXT: paddd %xmm3, %xmm2
+; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
+; SSE2-NEXT: pslld $16, %xmm2
+; SSE2-NEXT: psrad $16, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pslld $23, %xmm1
-; SSE2-NEXT: paddd %xmm4, %xmm1
+; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
; SSE2-NEXT: pslld $16, %xmm1
; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: packssdw %xmm3, %xmm1
+; SSE2-NEXT: packssdw %xmm2, %xmm1
; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v4i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE41-NEXT: pslld $23, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216]
-; SSE41-NEXT: paddd %xmm2, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; SSE41-NEXT: paddd %xmm3, %xmm1
; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE41-NEXT: pslld $23, %xmm3
-; SSE41-NEXT: paddd %xmm2, %xmm3
-; SSE41-NEXT: cvttps2dq %xmm3, %xmm2
+; SSE41-NEXT: pslld $23, %xmm2
+; SSE41-NEXT: paddd %xmm3, %xmm2
+; SSE41-NEXT: cvttps2dq %xmm2, %xmm2
; SSE41-NEXT: packusdw %xmm1, %xmm2
; SSE41-NEXT: pmullw %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v4i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
@@ -195,18 +192,17 @@ define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
;
; X86-SSE-LABEL: var_shift_v4i16:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: pxor %xmm3, %xmm3
; X86-SSE-NEXT: movdqa %xmm1, %xmm2
-; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: pslld $23, %xmm2
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
-; X86-SSE-NEXT: paddd %xmm4, %xmm2
+; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; X86-SSE-NEXT: paddd %xmm3, %xmm2
; X86-SSE-NEXT: cvttps2dq %xmm2, %xmm2
; X86-SSE-NEXT: pslld $16, %xmm2
; X86-SSE-NEXT: psrad $16, %xmm2
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; X86-SSE-NEXT: pslld $23, %xmm1
-; X86-SSE-NEXT: paddd %xmm4, %xmm1
+; X86-SSE-NEXT: paddd %xmm3, %xmm1
; X86-SSE-NEXT: cvttps2dq %xmm1, %xmm1
; X86-SSE-NEXT: pslld $16, %xmm1
; X86-SSE-NEXT: psrad $16, %xmm1
@@ -220,45 +216,42 @@ define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
; SSE2-LABEL: var_shift_v2i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-NEXT: pslld $23, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
-; SSE2-NEXT: paddd %xmm4, %xmm3
-; SSE2-NEXT: cvttps2dq %xmm3, %xmm3
-; SSE2-NEXT: pslld $16, %xmm3
-; SSE2-NEXT: psrad $16, %xmm3
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $23, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; SSE2-NEXT: paddd %xmm3, %xmm2
+; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
+; SSE2-NEXT: pslld $16, %xmm2
+; SSE2-NEXT: psrad $16, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pslld $23, %xmm1
-; SSE2-NEXT: paddd %xmm4, %xmm1
+; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
; SSE2-NEXT: pslld $16, %xmm1
; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: packssdw %xmm3, %xmm1
+; SSE2-NEXT: packssdw %xmm2, %xmm1
; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v2i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE41-NEXT: pslld $23, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216]
-; SSE41-NEXT: paddd %xmm2, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; SSE41-NEXT: paddd %xmm3, %xmm1
; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE41-NEXT: pslld $23, %xmm3
-; SSE41-NEXT: paddd %xmm2, %xmm3
-; SSE41-NEXT: cvttps2dq %xmm3, %xmm2
+; SSE41-NEXT: pslld $23, %xmm2
+; SSE41-NEXT: paddd %xmm3, %xmm2
+; SSE41-NEXT: cvttps2dq %xmm2, %xmm2
; SSE41-NEXT: packusdw %xmm1, %xmm2
; SSE41-NEXT: pmullw %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v2i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
@@ -322,18 +315,17 @@ define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
;
; X86-SSE-LABEL: var_shift_v2i16:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: pxor %xmm3, %xmm3
; X86-SSE-NEXT: movdqa %xmm1, %xmm2
-; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: pslld $23, %xmm2
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
-; X86-SSE-NEXT: paddd %xmm4, %xmm2
+; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; X86-SSE-NEXT: paddd %xmm3, %xmm2
; X86-SSE-NEXT: cvttps2dq %xmm2, %xmm2
; X86-SSE-NEXT: pslld $16, %xmm2
; X86-SSE-NEXT: psrad $16, %xmm2
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; X86-SSE-NEXT: pslld $23, %xmm1
-; X86-SSE-NEXT: paddd %xmm4, %xmm1
+; X86-SSE-NEXT: paddd %xmm3, %xmm1
; X86-SSE-NEXT: cvttps2dq %xmm1, %xmm1
; X86-SSE-NEXT: pslld $16, %xmm1
; X86-SSE-NEXT: psrad $16, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll
index fb46f6205ab4e..3b93734c24deb 100644
--- a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll
+++ b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll
@@ -3176,10 +3176,11 @@ define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_ashr_32(<2 x i64> %
;
; X86-AVX2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_32:
; X86-AVX2: # %bb.0:
-; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX2-NEXT: vpsrad $31, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X86-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967294,4294967294,4294967294,4294967294]
+; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_32:
>From ead965945523e89a8cfe2b9ec3c6a6003e327bab Mon Sep 17 00:00:00 2001
From: Bjorn Pettersson <bjorn.a.pettersson at ericsson.com>
Date: Tue, 21 Oct 2025 15:33:48 +0200
Subject: [PATCH 2/4] [SelectionDAG] Add DoNotPoisonEltMask to
SimplifyDemandedVectorEltsForTargetNode
Add DoNotPoisonEltMask to SimplifyDemandedVectorEltsForTargetNode
and try to handle it for a number of X86 opcodes. In some situations
we just fall back and assume that the DoNotPoisonEltMask elements
are demanded.
The goal is to reduce the number of regressions after the fix for #138513.
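The conservative fallback amounts to folding the "do not poison" lanes into
the demanded mask before simplifying, so that no lane which must stay
non-poison can be rewritten. A minimal standalone sketch of that idea, using
plain uint64_t masks and a hypothetical helper name rather than the actual
TargetLowering/APInt code:

    #include <cstdint>
    #include <cstdio>

    // Hypothetical helper: a hook that does not track the "do not poison"
    // lanes separately can conservatively treat them as demanded, so no
    // simplification may turn them into poison.
    static uint64_t effectiveDemanded(uint64_t DemandedElts,
                                      uint64_t DoNotPoisonEltMask) {
      return DemandedElts | DoNotPoisonEltMask;
    }

    int main() {
      uint64_t Demanded    = 0b0011; // lanes 0-1 feed the final result
      uint64_t DoNotPoison = 0b0100; // lane 2 is unused but read via a bitcast
      std::printf("effective demanded mask: 0x%llx\n",
                  (unsigned long long)effectiveDemanded(Demanded, DoNotPoison));
      return 0;
    }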
---
llvm/include/llvm/CodeGen/TargetLowering.h | 5 +-
.../CodeGen/SelectionDAG/TargetLowering.cpp | 7 +-
llvm/lib/Target/X86/X86ISelLowering.cpp | 176 ++--
llvm/lib/Target/X86/X86ISelLowering.h | 20 +-
.../X86/avx512-intrinsics-fast-isel.ll | 4 +-
.../test/CodeGen/X86/bitcast-and-setcc-128.ll | 28 +-
llvm/test/CodeGen/X86/bitcast-setcc-128.ll | 12 +-
llvm/test/CodeGen/X86/bitcast-vector-bool.ll | 4 -
.../CodeGen/X86/buildvec-widen-dotproduct.ll | 36 +-
llvm/test/CodeGen/X86/combine-multiplies.ll | 2 +-
llvm/test/CodeGen/X86/combine-pmuldq.ll | 23 +-
llvm/test/CodeGen/X86/combine-rotates.ll | 2 +-
llvm/test/CodeGen/X86/combine-sdiv.ll | 39 +-
llvm/test/CodeGen/X86/combine-shl.ll | 18 +-
llvm/test/CodeGen/X86/combine-sra.ll | 51 +-
.../test/CodeGen/X86/combine-storetomstore.ll | 4 +-
llvm/test/CodeGen/X86/combine-udiv.ll | 5 +-
llvm/test/CodeGen/X86/dagcombine-shifts.ll | 2 +-
.../CodeGen/X86/f16c-intrinsics-fast-isel.ll | 4 -
.../X86/fold-int-pow2-with-fmul-or-fdiv.ll | 65 +-
llvm/test/CodeGen/X86/funnel-shift.ll | 4 +-
...ist-and-by-const-from-shl-in-eqcmp-zero.ll | 4 +-
llvm/test/CodeGen/X86/known-pow2.ll | 16 +-
llvm/test/CodeGen/X86/madd.ll | 2 +-
llvm/test/CodeGen/X86/masked_store.ll | 2 +-
llvm/test/CodeGen/X86/movmsk-cmp.ll | 10 -
llvm/test/CodeGen/X86/mulvi32.ll | 15 +-
...of-two-or-zero-when-comparing-with-zero.ll | 25 +-
llvm/test/CodeGen/X86/pmul.ll | 140 ++-
llvm/test/CodeGen/X86/pr35918.ll | 4 +-
llvm/test/CodeGen/X86/pr41619.ll | 2 -
llvm/test/CodeGen/X86/pr45563-2.ll | 2 +-
llvm/test/CodeGen/X86/pr45833.ll | 2 +-
llvm/test/CodeGen/X86/pr77459.ll | 2 +-
llvm/test/CodeGen/X86/psubus.ll | 237 +++--
llvm/test/CodeGen/X86/sadd_sat_vec.ll | 108 +--
llvm/test/CodeGen/X86/sat-add.ll | 16 +-
llvm/test/CodeGen/X86/sdiv-exact.ll | 30 +-
llvm/test/CodeGen/X86/shrink_vmul.ll | 2 -
.../CodeGen/X86/srem-seteq-vec-nonsplat.ll | 56 +-
llvm/test/CodeGen/X86/ssub_sat_vec.ll | 112 +--
llvm/test/CodeGen/X86/test-shrink-bug.ll | 2 +-
llvm/test/CodeGen/X86/udiv-exact.ll | 30 +-
llvm/test/CodeGen/X86/undo-mul-and.ll | 4 +-
.../CodeGen/X86/urem-seteq-vec-nonsplat.ll | 342 +++----
llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll | 2 +-
llvm/test/CodeGen/X86/vec_minmax_sint.ll | 84 +-
llvm/test/CodeGen/X86/vec_minmax_uint.ll | 84 +-
.../test/CodeGen/X86/vector-compare-all_of.ll | 36 +-
.../test/CodeGen/X86/vector-compare-any_of.ll | 36 +-
llvm/test/CodeGen/X86/vector-fshl-128.ll | 46 +-
llvm/test/CodeGen/X86/vector-fshl-rot-128.ll | 49 +-
llvm/test/CodeGen/X86/vector-fshl-rot-256.ll | 4 +-
.../CodeGen/X86/vector-fshl-rot-sub128.ll | 8 +-
llvm/test/CodeGen/X86/vector-fshl-sub128.ll | 4 +-
llvm/test/CodeGen/X86/vector-fshr-128.ll | 6 +-
llvm/test/CodeGen/X86/vector-fshr-rot-128.ll | 14 +-
llvm/test/CodeGen/X86/vector-fshr-rot-256.ll | 4 +-
.../CodeGen/X86/vector-fshr-rot-sub128.ll | 8 +-
.../vector-interleaved-store-i8-stride-8.ll | 436 +++++----
llvm/test/CodeGen/X86/vector-mul.ll | 16 +-
llvm/test/CodeGen/X86/vector-pcmp.ll | 3 +-
.../CodeGen/X86/vector-reduce-fmaximum.ll | 163 ++--
llvm/test/CodeGen/X86/vector-reduce-mul.ll | 170 ++--
llvm/test/CodeGen/X86/vector-reduce-smax.ll | 105 +--
llvm/test/CodeGen/X86/vector-reduce-smin.ll | 105 +--
llvm/test/CodeGen/X86/vector-reduce-umax.ll | 105 +--
llvm/test/CodeGen/X86/vector-reduce-umin.ll | 105 +--
llvm/test/CodeGen/X86/vector-rotate-128.ll | 49 +-
llvm/test/CodeGen/X86/vector-rotate-256.ll | 4 +-
llvm/test/CodeGen/X86/vector-shift-shl-128.ll | 4 +-
.../X86/vector-shuffle-combining-avx.ll | 12 +-
.../X86/vector-shuffle-combining-ssse3.ll | 11 +-
.../CodeGen/X86/vector-shuffle-combining.ll | 9 +-
llvm/test/CodeGen/X86/vector-trunc-math.ll | 8 +-
llvm/test/CodeGen/X86/vector-trunc-packus.ll | 604 ++++++------
llvm/test/CodeGen/X86/vector-trunc-ssat.ll | 870 ++++++++----------
llvm/test/CodeGen/X86/vector-trunc-usat.ll | 726 +++++++--------
llvm/test/CodeGen/X86/vselect.ll | 26 +-
79 files changed, 2538 insertions(+), 3054 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index c8ffdb3592e6b..e8e61623137d3 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -4356,8 +4356,9 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase {
/// (used to simplify the caller). The KnownUndef/Zero elements may only be
/// accurate for those bits in the DemandedMask.
virtual bool SimplifyDemandedVectorEltsForTargetNode(
- SDValue Op, const APInt &DemandedElts, APInt &KnownUndef,
- APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth = 0) const;
+ SDValue Op, const APInt &DemandedElts, const APInt &DoNotPoisonEltMask,
+ APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO,
+ unsigned Depth = 0) const;
/// Attempt to simplify any target nodes based on the demanded bits/elts,
/// returning true on success. Otherwise, analyze the
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 6369bf023e1c0..cd361b4086090 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -3912,7 +3912,7 @@ bool TargetLowering::SimplifyDemandedVectorElts(
default: {
if (Op.getOpcode() >= ISD::BUILTIN_OP_END) {
if (SimplifyDemandedVectorEltsForTargetNode(
- Op, DemandedElts | DoNotPoisonEltMask, KnownUndef, KnownZero, TLO,
+ Op, DemandedElts, DoNotPoisonEltMask, KnownUndef, KnownZero, TLO,
Depth))
return true;
} else {
@@ -4000,8 +4000,9 @@ unsigned TargetLowering::computeNumSignBitsForTargetInstr(
}
bool TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
- SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
- TargetLoweringOpt &TLO, unsigned Depth) const {
+ SDValue Op, const APInt &DemandedElts, const APInt &DoNotPoisonEltMask,
+ APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO,
+ unsigned Depth) const {
assert((Op.getOpcode() >= ISD::BUILTIN_OP_END ||
Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 05a854a0bf3fa..f3db2953e2b86 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -43284,8 +43284,9 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
// Simplify variable target shuffle masks based on the demanded elements.
// TODO: Handle DemandedBits in mask indices as well?
bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
- SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
- TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
+ SDValue Op, const APInt &DemandedElts, const APInt &DoNotPoisonEltMask,
+ unsigned MaskIndex, TargetLowering::TargetLoweringOpt &TLO,
+ unsigned Depth) const {
// If we're demanding all elements don't bother trying to simplify the mask.
unsigned NumElts = DemandedElts.getBitWidth();
if (DemandedElts.isAllOnes())
@@ -43297,8 +43298,8 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
// Attempt to generically simplify the variable shuffle mask.
APInt MaskUndef, MaskZero;
- if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
- Depth + 1))
+ if (SimplifyDemandedVectorElts(Mask, DemandedElts, DoNotPoisonEltMask,
+ MaskUndef, MaskZero, TLO, Depth + 1))
return true;
// Attempt to extract+simplify a (constant pool load) shuffle mask.
@@ -43324,7 +43325,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
return false;
unsigned Scale = NumCstElts / NumElts;
- // Simplify mask if we have an undemanded element that is not undef.
+  // Simplify mask if we have an undemanded element that is not undef/poison.
bool Simplified = false;
SmallVector<Constant *, 32> ConstVecOps;
for (unsigned i = 0; i != NumCstElts; ++i) {
@@ -43351,8 +43352,9 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
}
bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
- SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
- TargetLoweringOpt &TLO, unsigned Depth) const {
+ SDValue Op, const APInt &DemandedElts, const APInt &DoNotPoisonEltMask,
+ APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO,
+ unsigned Depth) const {
int NumElts = DemandedElts.getBitWidth();
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
@@ -43365,11 +43367,11 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
APInt RHSUndef, RHSZero;
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
- if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
- Depth + 1))
+ if (SimplifyDemandedVectorElts(LHS, DemandedElts, DoNotPoisonEltMask,
+ LHSUndef, LHSZero, TLO, Depth + 1))
return true;
- if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
- Depth + 1))
+ if (SimplifyDemandedVectorElts(RHS, DemandedElts, DoNotPoisonEltMask,
+ RHSUndef, RHSZero, TLO, Depth + 1))
return true;
// Multiply by zero.
KnownZero = LHSZero | RHSZero;
@@ -43382,24 +43384,26 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
+ APInt DoNotPoisonSrcElts =
+ APIntOps::ScaleBitMask(DoNotPoisonEltMask, 2 * NumElts);
- if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
- Depth + 1))
+ if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, DoNotPoisonSrcElts,
+ LHSUndef, LHSZero, TLO, Depth + 1))
return true;
- if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
- Depth + 1))
+ if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, DoNotPoisonSrcElts,
+ RHSUndef, RHSZero, TLO, Depth + 1))
return true;
// TODO: Multiply by zero.
// If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
- if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
- Depth + 1))
+ if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, DoNotPoisonSrcElts,
+ LHSUndef, LHSZero, TLO, Depth + 1))
return true;
APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
- if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
- Depth + 1))
+ if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, DoNotPoisonSrcElts,
+ RHSUndef, RHSZero, TLO, Depth + 1))
return true;
break;
}
@@ -43414,7 +43418,8 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
// Aggressively peek through ops to get at the demanded elts.
if (!DemandedElts.isAllOnes()) {
unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
- APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
+ APInt DemandedSrcElts =
+ APIntOps::ScaleBitMask(DemandedElts | DoNotPoisonEltMask, NumSrcElts);
SDValue NewLHS = SimplifyMultipleUseDemandedVectorElts(
LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
SDValue NewRHS = SimplifyMultipleUseDemandedVectorElts(
@@ -43448,7 +43453,8 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
APInt AmtUndef, AmtZero;
unsigned NumAmtElts = AmtVT.getVectorNumElements();
APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
- if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
+ APInt DoNotPoisonAmtElts = APInt::getZero(NumAmtElts);
+ if (SimplifyDemandedVectorElts(Amt, AmtElts, DoNotPoisonAmtElts, AmtUndef, AmtZero, TLO,
Depth + 1, AssumeSingleUse))
return true;
[[fallthrough]];
@@ -43458,8 +43464,8 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
case X86ISD::VSRAI: {
SDValue Src = Op.getOperand(0);
APInt SrcUndef;
- if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
- Depth + 1))
+ if (SimplifyDemandedVectorElts(Src, DemandedElts, DoNotPoisonEltMask,
+ SrcUndef, KnownZero, TLO, Depth + 1))
return true;
// Fold shift(0,x) -> 0
@@ -43470,7 +43476,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
// Aggressively peek through ops to get at the demanded elts.
if (!DemandedElts.isAllOnes())
if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
- Src, DemandedElts, TLO.DAG, Depth + 1))
+ Src, DemandedElts | DoNotPoisonEltMask, TLO.DAG, Depth + 1))
return TLO.CombineTo(
Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
break;
@@ -43484,8 +43490,8 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
APInt RHSUndef, RHSZero;
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
- if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
- Depth + 1))
+ if (SimplifyDemandedVectorElts(LHS, DemandedElts, DoNotPoisonEltMask,
+ LHSUndef, LHSZero, TLO, Depth + 1))
return true;
// Fold shift(0,x) -> 0
@@ -43493,8 +43499,8 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
return TLO.CombineTo(
Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
- if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
- Depth + 1))
+ if (SimplifyDemandedVectorElts(RHS, DemandedElts, DoNotPoisonEltMask,
+ RHSUndef, RHSZero, TLO, Depth + 1))
return true;
KnownZero = LHSZero;
@@ -43528,11 +43534,11 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
APInt RHSUndef, RHSZero;
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
- if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
- Depth + 1))
+ if (SimplifyDemandedVectorElts(LHS, DemandedElts, DoNotPoisonEltMask,
+ LHSUndef, LHSZero, TLO, Depth + 1))
return true;
- if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
- Depth + 1))
+ if (SimplifyDemandedVectorElts(RHS, DemandedElts, DoNotPoisonEltMask,
+ RHSUndef, RHSZero, TLO, Depth + 1))
return true;
break;
}
@@ -43566,8 +43572,9 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
}
APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
- if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
- Depth + 1))
+ APInt DoNotPoisonSrcElts = DoNotPoisonEltMask.lshr(ShiftAmt);
+ if (SimplifyDemandedVectorElts(Src, DemandedSrc, DoNotPoisonSrcElts,
+ KnownUndef, KnownZero, TLO, Depth + 1))
return true;
KnownUndef <<= ShiftAmt;
@@ -43605,8 +43612,9 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
}
APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
- if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
- Depth + 1))
+ APInt DoNotPoisonSrcElts = DoNotPoisonEltMask.shl(ShiftAmt);
+ if (SimplifyDemandedVectorElts(Src, DemandedSrc, DoNotPoisonSrcElts,
+ KnownUndef, KnownZero, TLO, Depth + 1))
return true;
KnownUndef.lshrInPlace(ShiftAmt);
@@ -43654,17 +43662,17 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
APInt LHSUndef, LHSZero;
APInt RHSUndef, RHSZero;
- if (SimplifyDemandedVectorElts(LHS, EltsLHS, LHSUndef, LHSZero, TLO,
+ if (SimplifyDemandedVectorElts(LHS, EltsLHS, DoNotPoisonEltMask, LHSUndef, LHSZero, TLO,
Depth + 1))
return true;
- if (SimplifyDemandedVectorElts(RHS, EltsRHS, RHSUndef, RHSZero, TLO,
+ if (SimplifyDemandedVectorElts(RHS, EltsRHS, DoNotPoisonEltMask, RHSUndef, RHSZero, TLO,
Depth + 1))
return true;
if (!DemandedElts.isAllOnes()) {
- SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS,
+ SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS | DoNotPoisonEltMask,
TLO.DAG, Depth + 1);
- SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS,
+ SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS | DoNotPoisonEltMask,
TLO.DAG, Depth + 1);
if (NewLHS || NewRHS) {
NewLHS = NewLHS ? NewLHS : LHS;
@@ -43683,8 +43691,10 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
EVT SrcVT = Src.getValueType();
APInt SrcUndef, SrcZero;
APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
- if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
- Depth + 1))
+ APInt DoNotPoisonSrcElts =
+ DoNotPoisonEltMask.zextOrTrunc(SrcVT.getVectorNumElements());
+ if (SimplifyDemandedVectorElts(Src, SrcElts, DoNotPoisonSrcElts, SrcUndef,
+ SrcZero, TLO, Depth + 1))
return true;
break;
}
@@ -43695,14 +43705,16 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
APInt DemandedLHS, DemandedRHS;
getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
+ APInt DoNotPoisonLHS, DoNotPoisonRHS;
+ getPackDemandedElts(VT, DoNotPoisonEltMask, DoNotPoisonLHS, DoNotPoisonRHS);
APInt LHSUndef, LHSZero;
- if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
- Depth + 1))
+ if (SimplifyDemandedVectorElts(N0, DemandedLHS, DoNotPoisonLHS, LHSUndef,
+ LHSZero, TLO, Depth + 1))
return true;
APInt RHSUndef, RHSZero;
- if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
- Depth + 1))
+ if (SimplifyDemandedVectorElts(N1, DemandedRHS, DoNotPoisonRHS, RHSUndef,
+ RHSZero, TLO, Depth + 1))
return true;
// TODO - pass on known zero/undef.
@@ -43710,10 +43722,10 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
// Aggressively peek through ops to get at the demanded elts.
// TODO - we should do this for all target/faux shuffles ops.
if (!DemandedElts.isAllOnes()) {
- SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
- TLO.DAG, Depth + 1);
- SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
- TLO.DAG, Depth + 1);
+ SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(
+ N0, DemandedLHS | DoNotPoisonLHS, TLO.DAG, Depth + 1);
+ SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(
+ N1, DemandedRHS | DoNotPoisonRHS, TLO.DAG, Depth + 1);
if (NewN0 || NewN1) {
NewN0 = NewN0 ? NewN0 : N0;
NewN1 = NewN1 ? NewN1 : N1;
@@ -43732,14 +43744,17 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
APInt DemandedLHS, DemandedRHS;
getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
+ APInt DoNotPoisonLHS, DoNotPoisonRHS;
+ getHorizDemandedElts(VT, DoNotPoisonEltMask, DoNotPoisonLHS,
+ DoNotPoisonRHS);
APInt LHSUndef, LHSZero;
- if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
- Depth + 1))
+ if (SimplifyDemandedVectorElts(N0, DemandedLHS, DoNotPoisonLHS, LHSUndef,
+ LHSZero, TLO, Depth + 1))
return true;
APInt RHSUndef, RHSZero;
- if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
- Depth + 1))
+ if (SimplifyDemandedVectorElts(N1, DemandedRHS, DoNotPoisonRHS, RHSUndef,
+ RHSZero, TLO, Depth + 1))
return true;
// TODO - pass on known zero/undef.
@@ -43747,10 +43762,10 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
// Aggressively peek through ops to get at the demanded elts.
// TODO: Handle repeated operands.
if (N0 != N1 && !DemandedElts.isAllOnes()) {
- SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
- TLO.DAG, Depth + 1);
- SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
- TLO.DAG, Depth + 1);
+ SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(
+ N0, DemandedLHS | DoNotPoisonLHS, TLO.DAG, Depth + 1);
+ SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(
+ N1, DemandedRHS | DoNotPoisonRHS, TLO.DAG, Depth + 1);
if (NewN0 || NewN1) {
NewN0 = NewN0 ? NewN0 : N0;
NewN1 = NewN1 ? NewN1 : N1;
@@ -43766,9 +43781,11 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
+ APInt DoNotPoisonSrcElts =
+ DoNotPoisonEltMask.zextOrTrunc(SrcVT.getVectorNumElements());
APInt SrcUndef, SrcZero;
- if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
- Depth + 1))
+ if (SimplifyDemandedVectorElts(Src, DemandedSrc, DoNotPoisonSrcElts,
+ SrcUndef, SrcZero, TLO, Depth + 1))
return true;
KnownZero = SrcZero.zextOrTrunc(NumElts);
KnownUndef = SrcUndef.zextOrTrunc(NumElts);
@@ -43779,25 +43796,28 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
DecodeBLENDMask(NumElts, Op.getConstantOperandVal(2), BlendMask);
if (SDValue R = combineBlendOfPermutes(
VT.getSimpleVT(), Op.getOperand(0), Op.getOperand(1), BlendMask,
- DemandedElts, TLO.DAG, Subtarget, SDLoc(Op)))
+ DemandedElts | DoNotPoisonEltMask, TLO.DAG, Subtarget, SDLoc(Op)))
return TLO.CombineTo(Op, R);
break;
}
case X86ISD::BLENDV: {
APInt SelUndef, SelZero;
- if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
- SelZero, TLO, Depth + 1))
+ if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts,
+ DoNotPoisonEltMask, SelUndef, SelZero, TLO,
+ Depth + 1))
return true;
// TODO: Use SelZero to adjust LHS/RHS DemandedElts.
APInt LHSUndef, LHSZero;
- if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
- LHSZero, TLO, Depth + 1))
+ if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts,
+ DoNotPoisonEltMask, LHSUndef, LHSZero, TLO,
+ Depth + 1))
return true;
APInt RHSUndef, RHSZero;
- if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
- RHSZero, TLO, Depth + 1))
+ if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts,
+ DoNotPoisonEltMask, RHSUndef, RHSZero, TLO,
+ Depth + 1))
return true;
KnownZero = LHSZero & RHSZero;
@@ -43807,7 +43827,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
case X86ISD::VZEXT_MOVL: {
// If upper demanded elements are already zero then we have nothing to do.
SDValue Src = Op.getOperand(0);
- APInt DemandedUpperElts = DemandedElts;
+ APInt DemandedUpperElts = DemandedElts | DoNotPoisonEltMask;
DemandedUpperElts.clearLowBits(1);
if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1))
return TLO.CombineTo(Op, Src);
@@ -43843,7 +43863,8 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
break;
APInt SrcUndef, SrcZero;
APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
- if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
+ APInt DoNotPoisonSrcElts = APInt::getZero(SrcVT.getVectorNumElements());
+ if (SimplifyDemandedVectorElts(Src, SrcElts, DoNotPoisonSrcElts, SrcUndef, SrcZero, TLO,
Depth + 1))
return true;
// Aggressively peek through src to get at the demanded elt.
@@ -43854,21 +43875,21 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
break;
}
case X86ISD::VPERMV:
- if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
- Depth))
+ if (SimplifyDemandedVectorEltsForTargetShuffle(
+ Op, DemandedElts, DoNotPoisonEltMask, 0, TLO, Depth))
return true;
break;
case X86ISD::PSHUFB:
case X86ISD::VPERMV3:
case X86ISD::VPERMILPV:
- if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
- Depth))
+ if (SimplifyDemandedVectorEltsForTargetShuffle(
+ Op, DemandedElts, DoNotPoisonEltMask, 1, TLO, Depth))
return true;
break;
case X86ISD::VPPERM:
case X86ISD::VPERMIL2:
- if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
- Depth))
+ if (SimplifyDemandedVectorEltsForTargetShuffle(
+ Op, DemandedElts, DoNotPoisonEltMask, 2, TLO, Depth))
return true;
break;
}
@@ -44203,6 +44224,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
return TLO.CombineTo(
Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
}
+ // FIXME: Do we need to consider DoNotPoisonEltMask?
for (int Src = 0; Src != NumSrcs; ++Src)
if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
@@ -44216,7 +44238,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
int Lo = Src * NumElts;
APInt SrcElts = APInt::getZero(NumElts);
for (int i = 0; i != NumElts; ++i)
- if (DemandedElts[i]) {
+ if (DemandedElts[i] || DoNotPoisonEltMask[i]) {
int M = OpMask[i] - Lo;
if (0 <= M && M < NumElts)
SrcElts.setBit(M);
@@ -44236,6 +44258,8 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
// to match. This prevents combineX86ShuffleChain from returning a
// combined shuffle that's the same as the original root, causing an
// infinite loop.
+ // FIXME: Here we assume that combineX86ShufflesRecursively doesn't make
+ // undemanded vector elements more poisonous. No idea if that is true.
if (!DemandedElts.isAllOnes()) {
assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index b7151f65942b4..20e0b123c0539 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1307,18 +1307,14 @@ namespace llvm {
const SelectionDAG &DAG,
unsigned Depth) const override;
- bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op,
- const APInt &DemandedElts,
- APInt &KnownUndef,
- APInt &KnownZero,
- TargetLoweringOpt &TLO,
- unsigned Depth) const override;
-
- bool SimplifyDemandedVectorEltsForTargetShuffle(SDValue Op,
- const APInt &DemandedElts,
- unsigned MaskIndex,
- TargetLoweringOpt &TLO,
- unsigned Depth) const;
+ bool SimplifyDemandedVectorEltsForTargetNode(
+ SDValue Op, const APInt &DemandedElts, const APInt &DoNotPoisonElts,
+ APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO,
+ unsigned Depth) const override;
+
+ bool SimplifyDemandedVectorEltsForTargetShuffle(
+ SDValue Op, const APInt &DemandedElts, const APInt &DoNotPoisonElts,
+ unsigned MaskIndex, TargetLoweringOpt &TLO, unsigned Depth) const;
bool SimplifyDemandedBitsForTargetNode(SDValue Op,
const APInt &DemandedBits,
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
index eea4261ea30fe..30bf1a261f4b7 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -6625,7 +6625,7 @@ define i64 @test_mm512_reduce_mul_epi64(<8 x i64> %__W) {
; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X64-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; X64-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
; X64-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
@@ -6833,7 +6833,7 @@ define i64 @test_mm512_mask_reduce_mul_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X64-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; X64-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
; X64-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
diff --git a/llvm/test/CodeGen/X86/bitcast-and-setcc-128.ll b/llvm/test/CodeGen/X86/bitcast-and-setcc-128.ll
index de030f1b78d3d..921cf88518562 100644
--- a/llvm/test/CodeGen/X86/bitcast-and-setcc-128.ll
+++ b/llvm/test/CodeGen/X86/bitcast-and-setcc-128.ll
@@ -195,7 +195,7 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
; SSSE3-NEXT: pcmpgtb %xmm1, %xmm0
; SSSE3-NEXT: pcmpgtb %xmm3, %xmm2
; SSSE3-NEXT: pand %xmm0, %xmm2
-; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,0,u,u,u,u,u,u,u,1]
; SSSE3-NEXT: movmskpd %xmm2, %eax
; SSSE3-NEXT: # kill: def $al killed $al killed $eax
; SSSE3-NEXT: retq
@@ -342,25 +342,21 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) {
; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm1
; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm0
; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm5
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm5
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-SSSE3-NEXT: pand %xmm6, %xmm0
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm5
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1
; SSE2-SSSE3-NEXT: por %xmm0, %xmm1
; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm3
; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm2
; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm0
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm0
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
-; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm2
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
-; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm0
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE2-SSSE3-NEXT: pand %xmm0, %xmm3
+; SSE2-SSSE3-NEXT: por %xmm2, %xmm3
+; SSE2-SSSE3-NEXT: pand %xmm1, %xmm3
+; SSE2-SSSE3-NEXT: movmskpd %xmm3, %eax
; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax
; SSE2-SSSE3-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/bitcast-setcc-128.ll b/llvm/test/CodeGen/X86/bitcast-setcc-128.ll
index d050a7f66104c..f21c3f7043e69 100644
--- a/llvm/test/CodeGen/X86/bitcast-setcc-128.ll
+++ b/llvm/test/CodeGen/X86/bitcast-setcc-128.ll
@@ -157,7 +157,7 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) {
; SSSE3-LABEL: v2i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pcmpgtb %xmm1, %xmm0
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,0,u,u,u,u,u,u,u,1]
; SSSE3-NEXT: movmskpd %xmm0, %eax
; SSSE3-NEXT: # kill: def $al killed $al killed $eax
; SSSE3-NEXT: retq
@@ -272,12 +272,10 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm1
; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm0
; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
-; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1
; SSE2-SSSE3-NEXT: por %xmm0, %xmm1
; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax
; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax
diff --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
index 74ff87911d81d..9042f9e70ac1c 100644
--- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
+++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
@@ -634,11 +634,7 @@ define i1 @trunc_v32i8_cmp(<32 x i8> %a0) nounwind {
define i4 @bitcast_v8i64_to_v2i4(<8 x i64> %a0) nounwind {
; SSE-LABEL: bitcast_v8i64_to_v2i4:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE-NEXT: packssdw %xmm3, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: packssdw %xmm2, %xmm0
; SSE-NEXT: packsswb %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/buildvec-widen-dotproduct.ll b/llvm/test/CodeGen/X86/buildvec-widen-dotproduct.ll
index 56957645dd9fc..deb30a597ca39 100644
--- a/llvm/test/CodeGen/X86/buildvec-widen-dotproduct.ll
+++ b/llvm/test/CodeGen/X86/buildvec-widen-dotproduct.ll
@@ -264,18 +264,16 @@ define i64 @dot_ext_v2i8_v2i64(ptr %a, i64 %a_stride, ptr %b) nounwind {
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: psrad $24, %xmm1
; SSE2-NEXT: pmuludq %xmm0, %xmm1
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE2-NEXT: pmuludq %xmm0, %xmm3
-; SSE2-NEXT: psllq $32, %xmm3
-; SSE2-NEXT: paddq %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
-; SSE2-NEXT: paddq %xmm3, %xmm0
+; SSE2-NEXT: pmuludq %xmm0, %xmm2
+; SSE2-NEXT: psllq $32, %xmm2
+; SSE2-NEXT: paddq %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE2-NEXT: paddq %xmm2, %xmm0
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: retq
;
@@ -460,18 +458,16 @@ define i64 @dot_ext_v2i32_v2i64(ptr %a, i64 %a_stride, ptr %b) nounwind {
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE2-NEXT: pmuludq %xmm1, %xmm3
-; SSE2-NEXT: psllq $32, %xmm3
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: paddq %xmm3, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; SSE2-NEXT: paddq %xmm0, %xmm1
-; SSE2-NEXT: movq %xmm1, %rax
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT: pmuludq %xmm1, %xmm2
+; SSE2-NEXT: psllq $32, %xmm2
+; SSE2-NEXT: pmuludq %xmm0, %xmm1
+; SSE2-NEXT: paddq %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT: paddq %xmm1, %xmm0
+; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: retq
;
; SSE4-LABEL: dot_ext_v2i32_v2i64:
diff --git a/llvm/test/CodeGen/X86/combine-multiplies.ll b/llvm/test/CodeGen/X86/combine-multiplies.ll
index 202c94d7aacab..4bdf20da30636 100644
--- a/llvm/test/CodeGen/X86/combine-multiplies.ll
+++ b/llvm/test/CodeGen/X86/combine-multiplies.ll
@@ -144,7 +144,7 @@ define void @testCombineMultiplies_non_splat(<4 x i32> %v1) nounwind {
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [22,33,44,55]
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 # [33,33,55,55]
+; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 # [33,u,55,u]
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [242,726,1452,2420]
diff --git a/llvm/test/CodeGen/X86/combine-pmuldq.ll b/llvm/test/CodeGen/X86/combine-pmuldq.ll
index da9e4a32f48a3..52675429627aa 100644
--- a/llvm/test/CodeGen/X86/combine-pmuldq.ll
+++ b/llvm/test/CodeGen/X86/combine-pmuldq.ll
@@ -103,18 +103,15 @@ define <4 x i64> @combine_shuffle_zero_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1)
define <8 x i64> @combine_zext_pmuludq_256(<8 x i32> %a) {
; SSE-LABEL: combine_zext_pmuludq_256:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm0, %xmm4
-; SSE-NEXT: pxor %xmm3, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,1,3,3]
; SSE-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
-; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,3,3]
; SSE-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; SSE-NEXT: pmovsxdq {{.*#+}} xmm3 = [715827883,715827883]
-; SSE-NEXT: pmuludq %xmm3, %xmm0
-; SSE-NEXT: pmuludq %xmm3, %xmm4
-; SSE-NEXT: pmuludq %xmm3, %xmm2
-; SSE-NEXT: pmuludq %xmm1, %xmm3
-; SSE-NEXT: movdqa %xmm4, %xmm1
+; SSE-NEXT: pmovsxdq {{.*#+}} xmm4 = [715827883,715827883]
+; SSE-NEXT: pmuludq %xmm4, %xmm0
+; SSE-NEXT: pmuludq %xmm4, %xmm1
+; SSE-NEXT: pmuludq %xmm4, %xmm2
+; SSE-NEXT: pmuludq %xmm4, %xmm3
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_zext_pmuludq_256:
@@ -253,7 +250,7 @@ define i32 @PR43159(ptr %a0) {
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [344322273,344322273,1916962805,1916962805]
+; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [344322273,u,1916962805,u]
; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [1645975491,344322273,2164392969,1916962805]
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
@@ -276,7 +273,7 @@ define i32 @PR43159(ptr %a0) {
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [344322273,344322273,1916962805,1916962805]
+; AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [344322273,u,1916962805,u]
; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [1645975491,344322273,2164392969,1916962805]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
@@ -299,7 +296,7 @@ define i32 @PR43159(ptr %a0) {
; AVX512DQVL: # %bb.0: # %entry
; AVX512DQVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512DQVL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [344322273,344322273,1916962805,1916962805]
+; AVX512DQVL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [344322273,u,1916962805,u]
; AVX512DQVL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX512DQVL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [1645975491,344322273,2164392969,1916962805]
; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
diff --git a/llvm/test/CodeGen/X86/combine-rotates.ll b/llvm/test/CodeGen/X86/combine-rotates.ll
index 04370a05b523e..e7152ecedc57b 100644
--- a/llvm/test/CodeGen/X86/combine-rotates.ll
+++ b/llvm/test/CodeGen/X86/combine-rotates.ll
@@ -12,7 +12,7 @@ define <4 x i32> @combine_vec_rot_rot(<4 x i32> %x) {
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [524288,131072,32768,8192]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [131072,131072,8192,8192]
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [131072,u,8192,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll
index 634c63059924a..5186f726e3194 100644
--- a/llvm/test/CodeGen/X86/combine-sdiv.ll
+++ b/llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -1538,7 +1538,7 @@ define <16 x i32> @combine_vec_sdiv_by_pow2b_v16i32(<16 x i32> %x) {
define <2 x i64> @combine_vec_sdiv_by_pow2b_v2i64(<2 x i64> %x) {
; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: psrlq $62, %xmm1
; SSE2-NEXT: paddq %xmm0, %xmm1
@@ -1552,7 +1552,7 @@ define <2 x i64> @combine_vec_sdiv_by_pow2b_v2i64(<2 x i64> %x) {
;
; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrad $31, %xmm1
; SSE41-NEXT: psrlq $62, %xmm1
; SSE41-NEXT: paddq %xmm0, %xmm1
@@ -1622,7 +1622,7 @@ define <2 x i64> @combine_vec_sdiv_by_pow2b_v2i64(<2 x i64> %x) {
define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) {
; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: psrlq $62, %xmm2
; SSE2-NEXT: paddq %xmm0, %xmm2
@@ -1650,7 +1650,7 @@ define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) {
;
; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrad $31, %xmm2
; SSE41-NEXT: psrlq $62, %xmm2
; SSE41-NEXT: paddq %xmm0, %xmm2
@@ -1755,7 +1755,7 @@ define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) {
define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) {
; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: psrlq $62, %xmm4
; SSE2-NEXT: paddq %xmm0, %xmm4
@@ -1765,7 +1765,7 @@ define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) {
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: psrlq $62, %xmm4
; SSE2-NEXT: paddq %xmm2, %xmm4
@@ -1806,7 +1806,7 @@ define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) {
;
; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: psrad $31, %xmm4
; SSE41-NEXT: psrlq $62, %xmm4
; SSE41-NEXT: paddq %xmm0, %xmm4
@@ -1815,7 +1815,7 @@ define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) {
; SSE41-NEXT: psrlq $2, %xmm4
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE41-NEXT: movdqa %xmm2, %xmm4
; SSE41-NEXT: psrad $31, %xmm4
; SSE41-NEXT: psrlq $62, %xmm4
; SSE41-NEXT: paddq %xmm2, %xmm4
@@ -2185,13 +2185,12 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pcmpgtb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: pxor %xmm3, %xmm3
+; SSE41-NEXT: pcmpgtb %xmm1, %xmm3
; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [256,2,2,2,2,128,2,128]
; SSE41-NEXT: psrlw $8, %xmm3
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE41-NEXT: pmovsxbw %xmm1, %xmm0
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT: paddw %xmm2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2],xmm0[3,4,5],xmm2[6],xmm0[7]
@@ -2222,15 +2221,15 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [256,2,2,2,2,128,2,128]
-; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5],xmm2[6],xmm1[7]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [256,2,2,2,2,128,2,128]
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxbw %xmm0, %xmm2
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4,5],xmm3[6],xmm2[7]
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
diff --git a/llvm/test/CodeGen/X86/combine-shl.ll b/llvm/test/CodeGen/X86/combine-shl.ll
index 4417666112452..9548967fd0592 100644
--- a/llvm/test/CodeGen/X86/combine-shl.ll
+++ b/llvm/test/CodeGen/X86/combine-shl.ll
@@ -88,7 +88,7 @@ define <4 x i32> @combine_vec_shl_known_zero1(<4 x i32> %x) {
; SSE2-NEXT: pmuludq %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32768,32768,8192,8192]
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32768,u,8192,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: movdqa %xmm1, %xmm0
@@ -200,7 +200,7 @@ define <4 x i32> @combine_vec_shl_shl1(<4 x i32> %x) {
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,64,256,1024]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [64,64,1024,1024]
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [64,u,1024,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
@@ -306,7 +306,7 @@ define <8 x i32> @combine_vec_shl_ext_shl2(<8 x i16> %x) {
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [131072,524288,2097152,8388608]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [524288,524288,8388608,8388608]
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [524288,u,8388608,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
@@ -314,7 +314,7 @@ define <8 x i32> @combine_vec_shl_ext_shl2(<8 x i16> %x) {
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [33554432,134217728,536870912,2147483648]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [134217728,134217728,2147483648,2147483648]
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [134217728,u,2147483648,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: movdqa %xmm2, %xmm0
@@ -675,7 +675,7 @@ define <4 x i32> @combine_vec_shl_add1(<4 x i32> %x) {
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,4,8,16]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4,4,16,16]
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4,u,16,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -728,7 +728,7 @@ define <4 x i32> @combine_vec_shl_or1(<4 x i32> %x) {
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,4,8,16]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4,4,16,16]
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4,u,16,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -789,7 +789,7 @@ define <4 x i32> @combine_vec_shl_mul1(<4 x i32> %x) {
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [10,24,56,128]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [24,24,128,128]
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [24,u,128,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
@@ -815,7 +815,7 @@ define <4 x i32> @combine_vec_add_shl_nonsplat(<4 x i32> %a0) {
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4,8,16,32]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [8,8,32,32]
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [8,u,32,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -852,7 +852,7 @@ define <4 x i32> @combine_vec_add_shl_and_nonsplat(<4 x i32> %a0) {
; SSE2-NEXT: pmuludq %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [8,8,32,32]
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [8,u,32,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
diff --git a/llvm/test/CodeGen/X86/combine-sra.ll b/llvm/test/CodeGen/X86/combine-sra.ll
index fa5c5ecded124..3b5b226dabd86 100644
--- a/llvm/test/CodeGen/X86/combine-sra.ll
+++ b/llvm/test/CodeGen/X86/combine-sra.ll
@@ -724,47 +724,46 @@ define <4 x i64> @combine_vec4i64_ashr_clamped(<4 x i64> %x, <4 x i64> %y) {
; SSE41-LABEL: combine_vec4i64_ashr_clamped:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm6, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
-; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [2147483711,2147483711,2147483711,2147483711]
-; SSE41-NEXT: movdqa %xmm7, %xmm8
-; SSE41-NEXT: pcmpgtd %xmm5, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm0
-; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT: movdqa %xmm3, %xmm6
+; SSE41-NEXT: pxor %xmm7, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm7, %xmm6
+; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483711,2147483711,2147483711,2147483711]
+; SSE41-NEXT: movdqa %xmm5, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm8, %xmm0
+; SSE41-NEXT: pand %xmm6, %xmm0
; SSE41-NEXT: movapd {{.*#+}} xmm8 = [63,63]
-; SSE41-NEXT: movapd %xmm8, %xmm5
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5
+; SSE41-NEXT: movapd %xmm8, %xmm6
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6
; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm6, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm0
-; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: pxor %xmm7, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
+; SSE41-NEXT: pand %xmm7, %xmm5
+; SSE41-NEXT: movdqa %xmm5, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm8
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrlq %xmm8, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm8[2,3,2,3]
-; SSE41-NEXT: movdqa %xmm0, %xmm6
-; SSE41-NEXT: psrlq %xmm3, %xmm6
-; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm2[0,1,2,3],xmm6[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: psrlq %xmm3, %xmm5
+; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm2[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT: movdqa %xmm4, %xmm2
; SSE41-NEXT: psrlq %xmm8, %xmm2
; SSE41-NEXT: psrlq %xmm3, %xmm4
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1,2,3],xmm4[4,5,6,7]
-; SSE41-NEXT: pxor %xmm6, %xmm4
-; SSE41-NEXT: psubq %xmm6, %xmm4
+; SSE41-NEXT: pxor %xmm5, %xmm4
+; SSE41-NEXT: psubq %xmm5, %xmm4
; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrlq %xmm5, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3]
+; SSE41-NEXT: psrlq %xmm6, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,3,2,3]
; SSE41-NEXT: psrlq %xmm3, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlq %xmm5, %xmm2
+; SSE41-NEXT: psrlq %xmm6, %xmm2
; SSE41-NEXT: psrlq %xmm3, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: pxor %xmm0, %xmm1
diff --git a/llvm/test/CodeGen/X86/combine-storetomstore.ll b/llvm/test/CodeGen/X86/combine-storetomstore.ll
index 45a1172b2323e..c18c89dfdf684 100644
--- a/llvm/test/CodeGen/X86/combine-storetomstore.ll
+++ b/llvm/test/CodeGen/X86/combine-storetomstore.ll
@@ -39,7 +39,7 @@ define void @test_masked_store_success_v4i8(<4 x i8> %x, ptr %ptr, <4 x i1> %mas
define void @test_masked_store_success_v4i16(<4 x i16> %x, ptr %ptr, <4 x i1> %mask) {
; AVX-LABEL: test_masked_store_success_v4i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,u,4,u,8,u,12,u,8,u,12,u,12,u,14,u]
; AVX-NEXT: vpsllw $15, %xmm1, %xmm1
; AVX-NEXT: vpsraw $15, %xmm1, %xmm1
; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
@@ -49,7 +49,7 @@ define void @test_masked_store_success_v4i16(<4 x i16> %x, ptr %ptr, <4 x i1> %m
;
; AVX2-LABEL: test_masked_store_success_v4i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,u,4,u,8,u,12,u,8,u,12,u,12,u,14,u]
; AVX2-NEXT: vpsllw $15, %xmm1, %xmm1
; AVX2-NEXT: vpsraw $15, %xmm1, %xmm1
; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll
index b25afb7fc4d3f..233735da6aae9 100644
--- a/llvm/test/CodeGen/X86/combine-udiv.ll
+++ b/llvm/test/CodeGen/X86/combine-udiv.ll
@@ -631,10 +631,7 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [171,0,0,0]
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: psrlw $7, %xmm0
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: psrlw $15, %xmm0
; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/dagcombine-shifts.ll b/llvm/test/CodeGen/X86/dagcombine-shifts.ll
index e9a1e8ed728a4..19b9452e7117e 100644
--- a/llvm/test/CodeGen/X86/dagcombine-shifts.ll
+++ b/llvm/test/CodeGen/X86/dagcombine-shifts.ll
@@ -439,7 +439,7 @@ define <4 x i32> @shift_zext_shl2_vec(<4 x i8> %x) nounwind {
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [512,256,128,64]
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [256,256,64,64]
+; X64-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [256,u,64,u]
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll
index c482ba0268f1c..1886e2911ede8 100644
--- a/llvm/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll
@@ -40,8 +40,6 @@ define i16 @test_cvtss_sh(float %a0) nounwind {
; X86-LABEL: test_cvtss_sh:
; X86: # %bb.0:
; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X86-NEXT: vcvtps2ph $0, %xmm0, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
@@ -49,8 +47,6 @@ define i16 @test_cvtss_sh(float %a0) nounwind {
;
; X64-LABEL: test_cvtss_sh:
; X64: # %bb.0:
-; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X64-NEXT: vcvtps2ph $0, %xmm0, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
index f3d08c9d7f023..81529aff39ff1 100644
--- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -141,58 +141,62 @@ declare <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half>, <8 x i16>)
define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
; CHECK-SSE-LABEL: fmul_pow2_8xhalf:
; CHECK-SSE: # %bb.0:
-; CHECK-SSE-NEXT: subq $120, %rsp
-; CHECK-SSE-NEXT: .cfi_def_cfa_offset 128
+; CHECK-SSE-NEXT: subq $104, %rsp
+; CHECK-SSE-NEXT: .cfi_def_cfa_offset 112
; CHECK-SSE-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; CHECK-SSE-NEXT: pslld $23, %xmm1
; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216]
; CHECK-SSE-NEXT: paddd %xmm2, %xmm1
; CHECK-SSE-NEXT: cvttps2dq %xmm1, %xmm1
+; CHECK-SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-SSE-NEXT: pslld $16, %xmm1
; CHECK-SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; CHECK-SSE-NEXT: pslld $23, %xmm0
; CHECK-SSE-NEXT: paddd %xmm2, %xmm0
; CHECK-SSE-NEXT: cvttps2dq %xmm0, %xmm0
+; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-SSE-NEXT: pslld $16, %xmm0
; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-SSE-NEXT: psrld $16, %xmm0
-; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-SSE-NEXT: psrlq $48, %xmm0
; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE-NEXT: cvtdq2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
-; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-SSE-NEXT: psrlq $48, %xmm0
+; CHECK-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
+; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; CHECK-SSE-NEXT: cvtdq2ps %xmm1, %xmm0
+; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
+; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-SSE-NEXT: psrld $16, %xmm0
-; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-SSE-NEXT: psrlq $48, %xmm0
; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE-NEXT: cvtdq2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-SSE-NEXT: psrlq $48, %xmm0
+; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-SSE-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
; CHECK-SSE-NEXT: callq __extendhfsf2 at PLT
@@ -204,9 +208,9 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
; CHECK-SSE-NEXT: callq __extendhfsf2 at PLT
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
-; CHECK-SSE-NEXT: punpcklwd (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
+; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-SSE-NEXT: callq __extendhfsf2 at PLT
@@ -220,23 +224,23 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
-; CHECK-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; CHECK-SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
+; CHECK-SSE-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-SSE-NEXT: callq __extendhfsf2 at PLT
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
-; CHECK-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-SSE-NEXT: callq __extendhfsf2 at PLT
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
-; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
+; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-SSE-NEXT: callq __extendhfsf2 at PLT
@@ -250,12 +254,11 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; CHECK-SSE-NEXT: punpcklqdq (%rsp), %xmm1 # 16-byte Folded Reload
-; CHECK-SSE-NEXT: # xmm1 = xmm1[0],mem[0]
-; CHECK-SSE-NEXT: movdqa %xmm1, %xmm0
-; CHECK-SSE-NEXT: addq $120, %rsp
+; CHECK-SSE-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0]
+; CHECK-SSE-NEXT: addq $104, %rsp
; CHECK-SSE-NEXT: .cfi_def_cfa_offset 8
; CHECK-SSE-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/funnel-shift.ll b/llvm/test/CodeGen/X86/funnel-shift.ll
index 42ac072ad3fb8..252cb3333f1d1 100644
--- a/llvm/test/CodeGen/X86/funnel-shift.ll
+++ b/llvm/test/CodeGen/X86/funnel-shift.ll
@@ -576,7 +576,7 @@ define <4 x i32> @fshl_v4i32_undef1_cst(<4 x i32> %a0) nounwind {
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [512,1024,2048,4096]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [1024,1024,4096,4096]
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [1024,u,4096,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE2-NEXT: retl
@@ -748,7 +748,7 @@ define <4 x i32> @fshr_v4i32_undef1_cst(<4 x i32> %a0) nounwind {
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [8388608,4194304,2097152,1048576]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [4194304,4194304,1048576,1048576]
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [4194304,u,1048576,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE2-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
index e7089bb8f5a06..b45d01e7b3653 100644
--- a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
+++ b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
@@ -501,7 +501,7 @@ define <4 x i1> @vec_4xi32_nonsplat_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [0,1,16776960,2147483648]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 # [1,1,2147483648,2147483648]
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 # [1,u,2147483648,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X86-SSE2-NEXT: pand %xmm1, %xmm0
@@ -526,7 +526,7 @@ define <4 x i1> @vec_4xi32_nonsplat_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,1,16776960,2147483648]
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1,1,2147483648,2147483648]
+; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1,u,2147483648,u]
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X64-SSE2-NEXT: pand %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/known-pow2.ll b/llvm/test/CodeGen/X86/known-pow2.ll
index e192746c10692..019bca7e53b4c 100644
--- a/llvm/test/CodeGen/X86/known-pow2.ll
+++ b/llvm/test/CodeGen/X86/known-pow2.ll
@@ -28,16 +28,16 @@ define <4 x i32> @pow2_non_splat_vec_fail0(<4 x i32> %x) {
; CHECK-NEXT: pmuludq %xmm0, %xmm1
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1073741824,1073741824,67108864,67108864]
+; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1073741824,u,67108864,u]
; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3]
-; CHECK-NEXT: movdqa %xmm1, %xmm4
-; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-NEXT: psrld $1, %xmm1
-; CHECK-NEXT: movss {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3]
-; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [9,4,16,64]
-; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
+; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; CHECK-NEXT: movdqa %xmm1, %xmm3
+; CHECK-NEXT: psrld $1, %xmm3
+; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
+; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [9,4,16,64]
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [4,4,64,64]
+; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [4,u,64,u]
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; CHECK-NEXT: psubd %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll
index 7f4b5d64a341c..2a2a4a5ca18d3 100644
--- a/llvm/test/CodeGen/X86/madd.ll
+++ b/llvm/test/CodeGen/X86/madd.ll
@@ -2057,7 +2057,7 @@ define <4 x i32> @pmaddwd_negative2(<8 x i16> %A) {
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT: psrad $16, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294934528,4294934528,0,0]
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294934528,0,0,0]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,0,7,0,42,0,32,0]
; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [32768,4294934528,0,0]
diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll
index f4f7f44038153..6722d03d9b856 100644
--- a/llvm/test/CodeGen/X86/masked_store.ll
+++ b/llvm/test/CodeGen/X86/masked_store.ll
@@ -6178,7 +6178,7 @@ define void @undefshuffle(<8 x i1> %i0, ptr %src, ptr %dst) nounwind {
; AVX2-LABEL: undefshuffle:
; AVX2: ## %bb.0:
; AVX2-NEXT: ## kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,2,3,u,u,4,5,u,u,6,7,u,u],zero,zero,ymm0[u,u],zero,zero,ymm0[u,u],zero,zero,ymm0[u,u],zero,zero,ymm0[u,u]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,u,u,u,2,u,u,u,4,u,u,u,6,u,u,u],zero,ymm0[u,u,u],zero,ymm0[u,u,u],zero,ymm0[u,u,u],zero,ymm0[u,u,u]
; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpmaskmovd %ymm1, %ymm0, (%rsi)
diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll
index 7f50cac5e4290..0b9029d47b24d 100644
--- a/llvm/test/CodeGen/X86/movmsk-cmp.ll
+++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll
@@ -824,11 +824,7 @@ define i1 @allzeros_v4i64_sign(<4 x i64> %arg) {
define i1 @allones_v8i64_sign(<8 x i64> %arg) {
; SSE-LABEL: allones_v8i64_sign:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE-NEXT: packssdw %xmm3, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: packssdw %xmm2, %xmm0
; SSE-NEXT: packsswb %xmm0, %xmm0
@@ -2000,7 +1996,6 @@ define i1 @allones_v2i64_and1(<2 x i64> %arg) {
; SSE-LABEL: allones_v2i64_and1:
; SSE: # %bb.0:
; SSE-NEXT: psllq $63, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-NEXT: movmskpd %xmm0, %eax
; SSE-NEXT: cmpl $3, %eax
; SSE-NEXT: sete %al
@@ -3214,7 +3209,6 @@ define i1 @allones_v2i64_and4(<2 x i64> %arg) {
; SSE-LABEL: allones_v2i64_and4:
; SSE: # %bb.0:
; SSE-NEXT: psllq $61, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-NEXT: movmskpd %xmm0, %eax
; SSE-NEXT: cmpl $3, %eax
; SSE-NEXT: sete %al
@@ -3399,14 +3393,10 @@ define i1 @allones_v8i64_and4(<8 x i64> %arg) {
; SSE-LABEL: allones_v8i64_and4:
; SSE: # %bb.0:
; SSE-NEXT: psllq $61, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE-NEXT: psllq $61, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE-NEXT: packssdw %xmm3, %xmm2
; SSE-NEXT: psllq $61, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE-NEXT: psllq $61, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: packssdw %xmm2, %xmm0
; SSE-NEXT: packsswb %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/mulvi32.ll b/llvm/test/CodeGen/X86/mulvi32.ll
index e5b0b11204e85..bbda4d68bb685 100644
--- a/llvm/test/CodeGen/X86/mulvi32.ll
+++ b/llvm/test/CodeGen/X86/mulvi32.ll
@@ -145,14 +145,13 @@ define <4 x i64> @_mul4xi32toi64a(<4 x i32>, <4 x i32>) {
;
; SSE42-LABEL: _mul4xi32toi64a:
; SSE42: # %bb.0:
-; SSE42-NEXT: pxor %xmm3, %xmm3
-; SSE42-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
-; SSE42-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE42-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
-; SSE42-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSE42-NEXT: pmuludq %xmm0, %xmm1
-; SSE42-NEXT: pmuludq %xmm4, %xmm2
-; SSE42-NEXT: movdqa %xmm2, %xmm0
+; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,1,3,3]
+; SSE42-NEXT: pmuludq %xmm3, %xmm2
+; SSE42-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; SSE42-NEXT: pmuludq %xmm1, %xmm0
+; SSE42-NEXT: movdqa %xmm2, %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: _mul4xi32toi64a:
diff --git a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
index 53dbeccbb0e26..9729fd7a9d755 100644
--- a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
+++ b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
@@ -127,21 +127,18 @@ define <4 x i1> @p5_vector_urem_by_const__nonsplat(<4 x i32> %x, <4 x i32> %y) {
; SSE2-LABEL: p5_vector_urem_by_const__nonsplat:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,954437177]
-; SSE2-NEXT: pmuludq %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3435973837,u,954437177,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,954437177,954437177]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,2147483648,u]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT: psrlq $32, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2863311531,3435973837,2863311531,954437177]
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,2147483648,u]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: psrlq $32, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll
index b43e3679bc4f5..00731fe3e9556 100644
--- a/llvm/test/CodeGen/X86/pmul.ll
+++ b/llvm/test/CodeGen/X86/pmul.ll
@@ -1007,15 +1007,13 @@ define <4 x i32> @mul_v4i64_zero_upper(<4 x i32> %val1, <4 x i32> %val2) {
;
; SSE41-LABEL: mul_v4i64_zero_upper:
; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
-; SSE41-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
-; SSE41-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSE41-NEXT: pmuludq %xmm0, %xmm1
-; SSE41-NEXT: pmuludq %xmm4, %xmm2
-; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3]
-; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
+; SSE41-NEXT: pmuludq %xmm2, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
+; SSE41-NEXT: pmuludq %xmm3, %xmm1
+; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: mul_v4i64_zero_upper:
@@ -1164,23 +1162,20 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) {
;
; SSE41-LABEL: mul_v8i64_zero_upper:
; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pxor %xmm6, %xmm6
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero
-; SSE41-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3]
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm8 = xmm1[0],zero,xmm1[1],zero
-; SSE41-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero
-; SSE41-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3]
-; SSE41-NEXT: pmuludq %xmm0, %xmm2
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero
-; SSE41-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3]
-; SSE41-NEXT: pmuludq %xmm1, %xmm3
-; SSE41-NEXT: pmuludq %xmm7, %xmm4
-; SSE41-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm2[1,3]
-; SSE41-NEXT: pmuludq %xmm8, %xmm5
-; SSE41-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm3[1,3]
-; SSE41-NEXT: movaps %xmm4, %xmm0
-; SSE41-NEXT: movaps %xmm5, %xmm1
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,1,3,3]
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero
+; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,1,3,3]
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero
+; SSE41-NEXT: pmuludq %xmm4, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3]
+; SSE41-NEXT: pmuludq %xmm5, %xmm1
+; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero
+; SSE41-NEXT: pmuludq %xmm6, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,1,3,3]
+; SSE41-NEXT: pmuludq %xmm7, %xmm2
+; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v8i64_zero_upper:
@@ -1219,25 +1214,25 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) {
; SSE2-LABEL: mul_v8i64_sext:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7]
-; SSE2-NEXT: psrad $16, %xmm8
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; SSE2-NEXT: psrad $16, %xmm6
+; SSE2-NEXT: pxor %xmm12, %xmm12
; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm6
-; SSE2-NEXT: movdqa %xmm8, %xmm5
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm7
+; SSE2-NEXT: movdqa %xmm6, %xmm5
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm7[2],xmm5[3],xmm7[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: pxor %xmm12, %xmm12
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm12
-; SSE2-NEXT: movdqa %xmm0, %xmm11
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1]
+; SSE2-NEXT: pxor %xmm11, %xmm11
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm11
+; SSE2-NEXT: movdqa %xmm0, %xmm9
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm11[2],xmm9[3],xmm11[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
-; SSE2-NEXT: pxor %xmm9, %xmm9
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm9
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1]
+; SSE2-NEXT: pxor %xmm8, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm8
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
; SSE2-NEXT: pxor %xmm10, %xmm10
; SSE2-NEXT: pcmpgtd %xmm2, %xmm10
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1]
@@ -1245,43 +1240,40 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) {
; SSE2-NEXT: pxor %xmm13, %xmm13
; SSE2-NEXT: pcmpgtd %xmm1, %xmm13
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1]
-; SSE2-NEXT: pxor %xmm14, %xmm14
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm14
-; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1]
-; SSE2-NEXT: movdqa %xmm12, %xmm15
-; SSE2-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm7[0],xmm15[1],xmm7[1]
-; SSE2-NEXT: pmuludq %xmm4, %xmm15
-; SSE2-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1]
-; SSE2-NEXT: pmuludq %xmm0, %xmm14
-; SSE2-NEXT: paddq %xmm15, %xmm14
-; SSE2-NEXT: psllq $32, %xmm14
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm12
+; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm11[0,1,1,3]
+; SSE2-NEXT: pmuludq %xmm4, %xmm14
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,1,3]
+; SSE2-NEXT: pmuludq %xmm0, %xmm12
+; SSE2-NEXT: paddq %xmm14, %xmm12
+; SSE2-NEXT: psllq $32, %xmm12
; SSE2-NEXT: pmuludq %xmm4, %xmm0
-; SSE2-NEXT: paddq %xmm14, %xmm0
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm7[2],xmm12[3],xmm7[3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm12
-; SSE2-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1]
-; SSE2-NEXT: pmuludq %xmm11, %xmm13
-; SSE2-NEXT: paddq %xmm12, %xmm13
-; SSE2-NEXT: psllq $32, %xmm13
-; SSE2-NEXT: pmuludq %xmm11, %xmm1
-; SSE2-NEXT: paddq %xmm13, %xmm1
-; SSE2-NEXT: movdqa %xmm6, %xmm4
-; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
+; SSE2-NEXT: paddq %xmm12, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm11[2,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm13[0,1,1,3]
+; SSE2-NEXT: pmuludq %xmm9, %xmm11
+; SSE2-NEXT: paddq %xmm4, %xmm11
+; SSE2-NEXT: psllq $32, %xmm11
+; SSE2-NEXT: pmuludq %xmm9, %xmm1
+; SSE2-NEXT: paddq %xmm11, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,1,1,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm4
-; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1]
-; SSE2-NEXT: pmuludq %xmm8, %xmm10
-; SSE2-NEXT: paddq %xmm4, %xmm10
-; SSE2-NEXT: psllq $32, %xmm10
-; SSE2-NEXT: pmuludq %xmm8, %xmm2
-; SSE2-NEXT: paddq %xmm10, %xmm2
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; SSE2-NEXT: pmuludq %xmm3, %xmm6
-; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1]
-; SSE2-NEXT: pmuludq %xmm5, %xmm9
-; SSE2-NEXT: paddq %xmm6, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[0,1,1,3]
+; SSE2-NEXT: pmuludq %xmm6, %xmm9
+; SSE2-NEXT: paddq %xmm4, %xmm9
; SSE2-NEXT: psllq $32, %xmm9
+; SSE2-NEXT: pmuludq %xmm6, %xmm2
+; SSE2-NEXT: paddq %xmm9, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,1,3,3]
+; SSE2-NEXT: pmuludq %xmm3, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm8[0,1,1,3]
+; SSE2-NEXT: pmuludq %xmm5, %xmm6
+; SSE2-NEXT: paddq %xmm4, %xmm6
+; SSE2-NEXT: psllq $32, %xmm6
; SSE2-NEXT: pmuludq %xmm5, %xmm3
-; SSE2-NEXT: paddq %xmm9, %xmm3
+; SSE2-NEXT: paddq %xmm6, %xmm3
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v8i64_sext:
diff --git a/llvm/test/CodeGen/X86/pr35918.ll b/llvm/test/CodeGen/X86/pr35918.ll
index f57fab3084a9e..7e63b0abfae69 100644
--- a/llvm/test/CodeGen/X86/pr35918.ll
+++ b/llvm/test/CodeGen/X86/pr35918.ll
@@ -13,7 +13,7 @@ define void @fetch_r16g16_snorm_unorm8(ptr, ptr, i32, i32, ptr) nounwind {
; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; X86-NEXT: vpsrlw $7, %xmm0, %xmm0
-; X86-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
+; X86-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u]
; X86-NEXT: vmovd %xmm0, %ecx
; X86-NEXT: orl $-16777216, %ecx # imm = 0xFF000000
; X86-NEXT: movl %ecx, (%eax)
@@ -25,7 +25,7 @@ define void @fetch_r16g16_snorm_unorm8(ptr, ptr, i32, i32, ptr) nounwind {
; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; X64-NEXT: vpsrlw $7, %xmm0, %xmm0
-; X64-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
+; X64-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u]
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: orl $-16777216, %eax # imm = 0xFF000000
; X64-NEXT: movl %eax, (%rdi)
diff --git a/llvm/test/CodeGen/X86/pr41619.ll b/llvm/test/CodeGen/X86/pr41619.ll
index 5d11f1c960a8c..ad778b4970cbf 100644
--- a/llvm/test/CodeGen/X86/pr41619.ll
+++ b/llvm/test/CodeGen/X86/pr41619.ll
@@ -5,8 +5,6 @@
define void @foo(double %arg) {
; CHECK-LABEL: foo:
; CHECK: ## %bb.0: ## %bb
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; CHECK-NEXT: vmovq %xmm0, %rax
; CHECK-NEXT: movl %eax, (%rax)
; CHECK-NEXT: movq $0, (%rax)
diff --git a/llvm/test/CodeGen/X86/pr45563-2.ll b/llvm/test/CodeGen/X86/pr45563-2.ll
index 00430c835837f..72877e1b1d67d 100644
--- a/llvm/test/CodeGen/X86/pr45563-2.ll
+++ b/llvm/test/CodeGen/X86/pr45563-2.ll
@@ -39,7 +39,7 @@ define <9 x float> @mload_split9(<9 x i1> %mask, ptr %addr, <9 x float> %dst) {
; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; CHECK-NEXT: vmaskmovps (%rcx), %ymm1, %ymm4
; CHECK-NEXT: vblendvps %ymm1, %ymm4, %ymm0, %ymm0
-; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[8,u,u,u],zero,xmm2[u,u,u],zero,xmm2[u,u,u],zero,xmm2[u,u,u]
; CHECK-NEXT: vpslld $31, %xmm1, %xmm1
; CHECK-NEXT: vmaskmovps 32(%rcx), %ymm1, %ymm2
; CHECK-NEXT: vmovaps %ymm0, (%rdi)
diff --git a/llvm/test/CodeGen/X86/pr45833.ll b/llvm/test/CodeGen/X86/pr45833.ll
index aa6a38a841f43..04c342b6673ed 100644
--- a/llvm/test/CodeGen/X86/pr45833.ll
+++ b/llvm/test/CodeGen/X86/pr45833.ll
@@ -29,7 +29,7 @@ define void @mstore_split9(<9 x float> %value, ptr %addr, <9 x i1> %mask) {
; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm3, %xmm3
; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm3, %xmm3
; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm3
-; CHECK-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[8,u,u,u],zero,xmm3[u,u,u],zero,xmm3[u,u,u],zero,xmm3[u,u,u]
; CHECK-NEXT: vpslld $31, %xmm4, %xmm4
; CHECK-NEXT: vmaskmovps %ymm1, %ymm4, 32(%rdi)
; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
diff --git a/llvm/test/CodeGen/X86/pr77459.ll b/llvm/test/CodeGen/X86/pr77459.ll
index 602f82b254a1e..9c072e6f5e3fc 100644
--- a/llvm/test/CodeGen/X86/pr77459.ll
+++ b/llvm/test/CodeGen/X86/pr77459.ll
@@ -80,7 +80,7 @@ define i8 @reverse_cmp_v8i1(<8 x i16> %a0, <8 x i16> %a1) {
; SSE42-LABEL: reverse_cmp_v8i1:
; SSE42: # %bb.0:
; SSE42-NEXT: pcmpeqw %xmm1, %xmm0
-; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
+; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,15,u,13,u,11,u,9,u,7,u,5,u,3,u,1]
; SSE42-NEXT: packsswb %xmm0, %xmm0
; SSE42-NEXT: pmovmskb %xmm0, %eax
; SSE42-NEXT: # kill: def $al killed $al killed $eax
diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll
index b93862be0a1b4..47a6473a556fd 100644
--- a/llvm/test/CodeGen/X86/psubus.ll
+++ b/llvm/test/CodeGen/X86/psubus.ll
@@ -1670,51 +1670,48 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind {
; SSE41-LABEL: psubus_8i64_max:
; SSE41: # %bb.0: # %vector.ph
; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: pxor %xmm6, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
-; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [2147549183,2147549183,2147549183,2147549183]
-; SSE41-NEXT: movdqa %xmm8, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm7, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm0
-; SSE41-NEXT: pand %xmm9, %xmm0
-; SSE41-NEXT: movapd {{.*#+}} xmm7 = [65535,65535]
-; SSE41-NEXT: movapd %xmm7, %xmm9
+; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT: movdqa %xmm4, %xmm8
+; SSE41-NEXT: pxor %xmm7, %xmm8
+; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm7, %xmm8
+; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
+; SSE41-NEXT: movdqa %xmm6, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: movapd {{.*#+}} xmm8 = [65535,65535]
+; SSE41-NEXT: movapd %xmm8, %xmm9
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm9
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm6, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
-; SSE41-NEXT: movdqa %xmm8, %xmm10
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm0
-; SSE41-NEXT: pand %xmm10, %xmm0
-; SSE41-NEXT: movapd %xmm7, %xmm4
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: pxor %xmm7, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm7, %xmm4
+; SSE41-NEXT: movdqa %xmm6, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm10, %xmm0
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: movapd %xmm8, %xmm4
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm4
; SSE41-NEXT: packusdw %xmm9, %xmm4
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm6, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
-; SSE41-NEXT: movdqa %xmm8, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm0
-; SSE41-NEXT: pand %xmm9, %xmm0
-; SSE41-NEXT: movapd %xmm7, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: pxor %xmm7, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm3[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm7, %xmm3
+; SSE41-NEXT: movdqa %xmm6, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: movapd %xmm8, %xmm3
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm6, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm0
-; SSE41-NEXT: pand %xmm8, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7
-; SSE41-NEXT: packusdw %xmm3, %xmm7
-; SSE41-NEXT: packusdw %xmm4, %xmm7
-; SSE41-NEXT: psubusw %xmm7, %xmm5
+; SSE41-NEXT: pxor %xmm7, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
+; SSE41-NEXT: pand %xmm7, %xmm6
+; SSE41-NEXT: movdqa %xmm6, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm8
+; SSE41-NEXT: packusdw %xmm3, %xmm8
+; SSE41-NEXT: packusdw %xmm4, %xmm8
+; SSE41-NEXT: psubusw %xmm8, %xmm5
; SSE41-NEXT: movdqa %xmm5, %xmm0
; SSE41-NEXT: retq
;
@@ -2769,55 +2766,52 @@ define <8 x i32> @test33(<8 x i32> %a0, <8 x i64> %a1) {
;
; SSE41-LABEL: test33:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm6
-; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm7, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
-; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [2147483647,2147483647,2147483647,2147483647]
-; SSE41-NEXT: movdqa %xmm9, %xmm10
-; SSE41-NEXT: pcmpgtd %xmm8, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm7, %xmm0
-; SSE41-NEXT: pand %xmm10, %xmm0
-; SSE41-NEXT: movapd {{.*#+}} xmm8 = [4294967295,4294967295]
-; SSE41-NEXT: movapd %xmm8, %xmm10
+; SSE41-NEXT: movdqa %xmm0, %xmm7
+; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT: movdqa %xmm3, %xmm9
+; SSE41-NEXT: pxor %xmm8, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm8, %xmm9
+; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483647,2147483647,2147483647,2147483647]
+; SSE41-NEXT: movdqa %xmm6, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm10, %xmm0
+; SSE41-NEXT: pand %xmm9, %xmm0
+; SSE41-NEXT: movapd {{.*#+}} xmm9 = [4294967295,4294967295]
+; SSE41-NEXT: movapd %xmm9, %xmm10
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm10
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm7, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
-; SSE41-NEXT: movdqa %xmm9, %xmm11
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm11
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm7, %xmm0
-; SSE41-NEXT: pand %xmm11, %xmm0
-; SSE41-NEXT: movapd %xmm8, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: pxor %xmm8, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm3[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm8, %xmm3
+; SSE41-NEXT: movdqa %xmm6, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm11, %xmm0
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: movapd %xmm9, %xmm3
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm10[0,2]
-; SSE41-NEXT: pmaxud %xmm3, %xmm6
-; SSE41-NEXT: psubd %xmm3, %xmm6
-; SSE41-NEXT: movdqa %xmm5, %xmm0
-; SSE41-NEXT: pxor %xmm7, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT: movdqa %xmm9, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm7, %xmm0
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: movapd %xmm8, %xmm2
+; SSE41-NEXT: pmaxud %xmm3, %xmm7
+; SSE41-NEXT: psubd %xmm3, %xmm7
+; SSE41-NEXT: movdqa %xmm5, %xmm2
+; SSE41-NEXT: pxor %xmm8, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm8, %xmm2
+; SSE41-NEXT: movdqa %xmm6, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm0
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: movapd %xmm9, %xmm2
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2
; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: pxor %xmm7, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm7, %xmm0
-; SSE41-NEXT: pand %xmm9, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm8
-; SSE41-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm2[0,2]
-; SSE41-NEXT: pmaxud %xmm8, %xmm1
-; SSE41-NEXT: psubd %xmm8, %xmm1
+; SSE41-NEXT: pxor %xmm8, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
+; SSE41-NEXT: pand %xmm8, %xmm6
; SSE41-NEXT: movdqa %xmm6, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm9
+; SSE41-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm2[0,2]
+; SSE41-NEXT: pmaxud %xmm9, %xmm1
+; SSE41-NEXT: psubd %xmm9, %xmm1
+; SSE41-NEXT: movdqa %xmm7, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test33:
@@ -2958,53 +2952,50 @@ define <8 x i32> @test34(<8 x i32> %a0, <8 x i64> %a1) {
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [1,1,1,1]
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: pand %xmm0, %xmm6
-; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm7, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
-; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [2147483647,2147483647,2147483647,2147483647]
-; SSE41-NEXT: movdqa %xmm9, %xmm10
-; SSE41-NEXT: pcmpgtd %xmm8, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm7, %xmm0
-; SSE41-NEXT: pand %xmm10, %xmm0
-; SSE41-NEXT: movapd {{.*#+}} xmm8 = [4294967295,4294967295]
-; SSE41-NEXT: movapd %xmm8, %xmm10
+; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT: movdqa %xmm3, %xmm9
+; SSE41-NEXT: pxor %xmm8, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm8, %xmm9
+; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [2147483647,2147483647,2147483647,2147483647]
+; SSE41-NEXT: movdqa %xmm7, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm10, %xmm0
+; SSE41-NEXT: pand %xmm9, %xmm0
+; SSE41-NEXT: movapd {{.*#+}} xmm9 = [4294967295,4294967295]
+; SSE41-NEXT: movapd %xmm9, %xmm10
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm10
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm7, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
-; SSE41-NEXT: movdqa %xmm9, %xmm11
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm11
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm7, %xmm0
-; SSE41-NEXT: pand %xmm11, %xmm0
-; SSE41-NEXT: movapd %xmm8, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: pxor %xmm8, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm3[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm8, %xmm3
+; SSE41-NEXT: movdqa %xmm7, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm11, %xmm0
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: movapd %xmm9, %xmm3
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm10[0,2]
; SSE41-NEXT: pmaxud %xmm3, %xmm6
; SSE41-NEXT: psubd %xmm3, %xmm6
-; SSE41-NEXT: movdqa %xmm5, %xmm0
-; SSE41-NEXT: pxor %xmm7, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE41-NEXT: movdqa %xmm9, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm7, %xmm0
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: movapd %xmm8, %xmm2
+; SSE41-NEXT: movdqa %xmm5, %xmm2
+; SSE41-NEXT: pxor %xmm8, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm8, %xmm2
+; SSE41-NEXT: movdqa %xmm7, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm0
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: movapd %xmm9, %xmm2
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2
; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: pxor %xmm7, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm7, %xmm0
-; SSE41-NEXT: pand %xmm9, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm8
-; SSE41-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm2[0,2]
-; SSE41-NEXT: pmaxud %xmm8, %xmm1
-; SSE41-NEXT: psubd %xmm8, %xmm1
+; SSE41-NEXT: pxor %xmm8, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
+; SSE41-NEXT: pand %xmm8, %xmm7
+; SSE41-NEXT: movdqa %xmm7, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm9
+; SSE41-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm2[0,2]
+; SSE41-NEXT: pmaxud %xmm9, %xmm1
+; SSE41-NEXT: psubd %xmm9, %xmm1
; SSE41-NEXT: movdqa %xmm6, %xmm0
; SSE41-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll
index a4c94ec2fa883..3fe63c922f419 100644
--- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll
@@ -1180,17 +1180,15 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; SSE41-NEXT: paddq %xmm1, %xmm2
; SSE41-NEXT: pxor %xmm2, %xmm0
; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm0, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pxor %xmm3, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE41-NEXT: pxor %xmm0, %xmm1
; SSE41-NEXT: movapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
@@ -1361,20 +1359,18 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
; SSE41-NEXT: movdqa %xmm4, %xmm6
; SSE41-NEXT: pxor %xmm5, %xmm6
; SSE41-NEXT: movdqa %xmm0, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT: pand %xmm8, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm7
+; SSE41-NEXT: pcmpgtd %xmm6, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; SSE41-NEXT: pand %xmm7, %xmm6
; SSE41-NEXT: por %xmm0, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT: pxor %xmm6, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: movapd {{.*#+}} xmm7 = [9223372036854775807,9223372036854775807]
-; SSE41-NEXT: movapd {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808]
-; SSE41-NEXT: movapd %xmm6, %xmm8
-; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8
+; SSE41-NEXT: movapd {{.*#+}} xmm6 = [9223372036854775807,9223372036854775807]
+; SSE41-NEXT: movapd {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808]
+; SSE41-NEXT: movapd %xmm7, %xmm8
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm8
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4
; SSE41-NEXT: movdqa %xmm1, %xmm0
@@ -1382,19 +1378,17 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
; SSE41-NEXT: paddq %xmm3, %xmm1
; SSE41-NEXT: pxor %xmm1, %xmm5
; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm5, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
-; SSE41-NEXT: pand %xmm8, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm2
+; SSE41-NEXT: pcmpgtd %xmm5, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
+; SSE41-NEXT: pand %xmm2, %xmm5
; SSE41-NEXT: por %xmm0, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
; SSE41-NEXT: pxor %xmm5, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm6
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7
; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1
+; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1
; SSE41-NEXT: movapd %xmm4, %xmm0
; SSE41-NEXT: retq
;
@@ -1665,19 +1659,17 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSE41-NEXT: movdqa %xmm8, %xmm10
; SSE41-NEXT: pxor %xmm9, %xmm10
; SSE41-NEXT: movdqa %xmm0, %xmm11
-; SSE41-NEXT: pcmpgtd %xmm10, %xmm11
-; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE41-NEXT: pand %xmm12, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
+; SSE41-NEXT: pcmpeqd %xmm10, %xmm11
+; SSE41-NEXT: pcmpgtd %xmm10, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
+; SSE41-NEXT: pand %xmm11, %xmm10
; SSE41-NEXT: por %xmm0, %xmm10
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE41-NEXT: pxor %xmm10, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3]
; SSE41-NEXT: movapd {{.*#+}} xmm11 = [9223372036854775807,9223372036854775807]
; SSE41-NEXT: movapd {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808]
; SSE41-NEXT: movapd %xmm10, %xmm12
+; SSE41-NEXT: movdqa %xmm8, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm12
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm8
@@ -1687,17 +1679,15 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: pxor %xmm9, %xmm4
; SSE41-NEXT: movdqa %xmm0, %xmm12
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm12
-; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: pand %xmm13, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3]
-; SSE41-NEXT: por %xmm0, %xmm12
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm12
+; SSE41-NEXT: pcmpgtd %xmm4, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2]
+; SSE41-NEXT: pand %xmm12, %xmm13
+; SSE41-NEXT: por %xmm0, %xmm13
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSE41-NEXT: pxor %xmm12, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: pxor %xmm13, %xmm4
; SSE41-NEXT: movapd %xmm10, %xmm5
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm5
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1
@@ -1707,17 +1697,15 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSE41-NEXT: movdqa %xmm2, %xmm4
; SSE41-NEXT: pxor %xmm9, %xmm4
; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm5[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: pand %xmm12, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT: por %xmm0, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm4, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2]
+; SSE41-NEXT: pand %xmm5, %xmm12
+; SSE41-NEXT: por %xmm0, %xmm12
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
-; SSE41-NEXT: pxor %xmm5, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE41-NEXT: pxor %xmm12, %xmm4
; SSE41-NEXT: movapd %xmm10, %xmm5
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm5
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2
@@ -1726,16 +1714,14 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSE41-NEXT: paddq %xmm7, %xmm3
; SSE41-NEXT: pxor %xmm3, %xmm9
; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm9, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; SSE41-NEXT: pcmpeqd %xmm9, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm5
; SSE41-NEXT: por %xmm0, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
; SSE41-NEXT: pxor %xmm5, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm10
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm3
diff --git a/llvm/test/CodeGen/X86/sat-add.ll b/llvm/test/CodeGen/X86/sat-add.ll
index 69e6ff7770ebe..3b4e9e8b1b2f0 100644
--- a/llvm/test/CodeGen/X86/sat-add.ll
+++ b/llvm/test/CodeGen/X86/sat-add.ll
@@ -633,11 +633,10 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_min(<2 x i64> %x) {
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372034707292117,9223372034707292117]
; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
-; SSE41-NEXT: pand %xmm4, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: paddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
@@ -1167,10 +1166,9 @@ define <2 x i64> @unsigned_sat_variable_v2i64_using_min(<2 x i64> %x, <2 x i64>
; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE41-NEXT: pand %xmm6, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
; SSE41-NEXT: paddq %xmm1, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
diff --git a/llvm/test/CodeGen/X86/sdiv-exact.ll b/llvm/test/CodeGen/X86/sdiv-exact.ll
index 2684f464be3a4..fe4ac52535d3b 100644
--- a/llvm/test/CodeGen/X86/sdiv-exact.ll
+++ b/llvm/test/CodeGen/X86/sdiv-exact.ll
@@ -83,12 +83,11 @@ define <4 x i32> @test5(<4 x i32> %x) {
; X86-NEXT: movdqa %xmm0, %xmm1
; X86-NEXT: psrad $3, %xmm1
; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
-; X86-NEXT: movdqa {{.*#+}} xmm2 = [2863311531,2863311531,3264175145,3264175145]
-; X86-NEXT: movaps %xmm1, %xmm0
-; X86-NEXT: pmuludq %xmm2, %xmm0
+; X86-NEXT: movdqa {{.*#+}} xmm0 = [2863311531,2863311531,3264175145,3264175145]
+; X86-NEXT: pmuludq %xmm1, %xmm0
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X86-NEXT: pmuludq %xmm2, %xmm1
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT: retl
@@ -109,12 +108,11 @@ define <4 x i32> @test6(<4 x i32> %x) {
; X86-NEXT: psrad $3, %xmm1
; X86-NEXT: psrad $1, %xmm0
; X86-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X86-NEXT: movdqa {{.*#+}} xmm2 = [2863311531,2863311531,3303820997,3303820997]
-; X86-NEXT: movapd %xmm0, %xmm1
-; X86-NEXT: pmuludq %xmm2, %xmm1
+; X86-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,3303820997,3303820997]
+; X86-NEXT: pmuludq %xmm0, %xmm1
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X86-NEXT: pmuludq %xmm2, %xmm0
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X86-NEXT: movdqa %xmm1, %xmm0
@@ -132,12 +130,11 @@ define <4 x i32> @test6(<4 x i32> %x) {
define <4 x i32> @test7(<4 x i32> %x) {
; X86-LABEL: test7:
; X86: # %bb.0:
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [3264175145,3264175145,1749801491,1749801491]
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X86-NEXT: pmuludq %xmm1, %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-NEXT: pmuludq %xmm1, %xmm2
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT: retl
;
@@ -155,12 +152,11 @@ define <4 x i32> @test8(<4 x i32> %x) {
; X86-NEXT: movdqa %xmm0, %xmm1
; X86-NEXT: psrad $3, %xmm1
; X86-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; X86-NEXT: movdqa {{.*#+}} xmm2 = [1,1,2863311531,2863311531]
-; X86-NEXT: movapd %xmm1, %xmm0
-; X86-NEXT: pmuludq %xmm2, %xmm0
+; X86-NEXT: movdqa {{.*#+}} xmm0 = [1,1,2863311531,2863311531]
+; X86-NEXT: pmuludq %xmm1, %xmm0
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X86-NEXT: pmuludq %xmm2, %xmm1
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll
index a912301985132..f8649398f17c7 100644
--- a/llvm/test/CodeGen/X86/shrink_vmul.ll
+++ b/llvm/test/CodeGen/X86/shrink_vmul.ll
@@ -1925,7 +1925,6 @@ define void @mul_2xi16_varconst4(ptr nocapture readonly %a, i64 %index) {
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl c, %edx
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-SSE-NEXT: psrad $16, %xmm0
; X86-SSE-NEXT: movl $32768, %ecx # imm = 0x8000
; X86-SSE-NEXT: movd %ecx, %xmm1
@@ -1949,7 +1948,6 @@ define void @mul_2xi16_varconst4(ptr nocapture readonly %a, i64 %index) {
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq c(%rip), %rax
; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-SSE-NEXT: psrad $16, %xmm0
; X64-SSE-NEXT: movl $32768, %ecx # imm = 0x8000
; X64-SSE-NEXT: movd %ecx, %xmm1
diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
index 3f54ea3c167d1..bb7245c31b326 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
@@ -12,7 +12,7 @@ define <4 x i32> @test_srem_odd_even(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,3264175145,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,3067833783,3264175145,3264175145]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,u,3264175145,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -369,7 +369,7 @@ define <4 x i32> @test_srem_odd_even_allones_eq(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,3067833783,3264175145,3264175145]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,u,3264175145,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -450,7 +450,7 @@ define <4 x i32> @test_srem_odd_even_allones_ne(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,3067833783,3264175145,3264175145]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,u,3264175145,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -703,7 +703,7 @@ define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,1,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,3067833783,3264175145,3264175145]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,u,3264175145,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -711,7 +711,7 @@ define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [2147483648,2147483648,1073741824,1073741824]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [2147483648,u,1073741824,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
@@ -728,7 +728,7 @@ define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,1,3264175145]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,1073741824,1073741824]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,u,1073741824,u]
; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2147483648,268435456,1073741824]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
@@ -746,7 +746,7 @@ define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,1,3264175145]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2147483648,2147483648,1073741824,1073741824]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2147483648,u,1073741824,u]
; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2147483648,268435456,1073741824]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
@@ -931,7 +931,7 @@ define <4 x i32> @test_srem_odd_even_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,3067833783,3264175145,3264175145]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,u,3264175145,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1219,7 +1219,7 @@ define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [3067833783,3067833783,3264175145,3264175145]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [3067833783,u,3264175145,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
@@ -1227,7 +1227,7 @@ define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,1073741824,1073741824]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,u,1073741824,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
@@ -1253,7 +1253,7 @@ define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [2147483648,2147483648,1073741824,1073741824]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [2147483648,u,1073741824,u]
; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1,2147483648,2,1073741824]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
@@ -1275,7 +1275,7 @@ define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 # [3435973837,3067833783,1,3264175145]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [2147483648,2147483648,1073741824,1073741824]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [2147483648,u,1073741824,u]
; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [1,2147483648,2,1073741824]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
@@ -1335,7 +1335,7 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,0,1,3435973837]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,0,3435973837,3435973837]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,u,3435973837,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1412,7 +1412,7 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,0,1,3067833783]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,0,3067833783,3067833783]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,u,3067833783,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1420,7 +1420,7 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [1,1,2147483648,2147483648]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [1,u,2147483648,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
@@ -1437,7 +1437,7 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,0,1,3067833783]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,1,2147483648,2147483648]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,2147483648,u]
; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,1,268435456,2147483648]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
@@ -1455,7 +1455,7 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,0,1,3067833783]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,1,2147483648,2147483648]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,u,2147483648,u]
; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2147483648,1,268435456,2147483648]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
@@ -1501,7 +1501,7 @@ define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,0,1,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,0,3264175145,3264175145]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,u,3264175145,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1509,7 +1509,7 @@ define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [1,1,1073741824,1073741824]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [1,u,1073741824,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
@@ -1526,7 +1526,7 @@ define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,0,1,3264175145]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,1,1073741824,1073741824]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,1073741824,u]
; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,1,268435456,1073741824]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
@@ -1544,7 +1544,7 @@ define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,0,1,3264175145]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,1,1073741824,1073741824]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,u,1073741824,u]
; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,1,268435456,1073741824]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
@@ -1729,7 +1729,7 @@ define <4 x i32> @test_srem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,0,0,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,0,3264175145,3264175145]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,u,3264175145,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1814,7 +1814,7 @@ define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,1,0,3435973837]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,1,3435973837,3435973837]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,3435973837,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1897,7 +1897,7 @@ define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,1,0,3067833783]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,1,3067833783,3067833783]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,3067833783,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1905,7 +1905,7 @@ define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [268435456,268435456,2147483648,2147483648]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [268435456,u,2147483648,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
@@ -1922,7 +1922,7 @@ define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,1,0,3067833783]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [268435456,268435456,2147483648,2147483648]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [268435456,u,2147483648,u]
; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,268435456,1,2147483648]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
@@ -1940,7 +1940,7 @@ define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,1,0,3067833783]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [268435456,268435456,2147483648,2147483648]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [268435456,u,2147483648,u]
; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2147483648,268435456,1,2147483648]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
@@ -1986,7 +1986,7 @@ define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,1,0,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,1,3264175145,3264175145]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,3264175145,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
index eb2a1f196774c..838e574eea69e 100644
--- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
@@ -1264,12 +1264,10 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; SSE41-NEXT: movdqa %xmm2, %xmm4
; SSE41-NEXT: pxor %xmm0, %xmm4
; SSE41-NEXT: movdqa %xmm3, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE41-NEXT: pand %xmm6, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm4, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE41-NEXT: pand %xmm5, %xmm4
; SSE41-NEXT: por %xmm3, %xmm4
; SSE41-NEXT: pxor %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm3
@@ -1281,8 +1279,8 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE41-NEXT: por %xmm0, %xmm1
; SSE41-NEXT: pxor %xmm4, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE41-NEXT: movapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
@@ -1485,12 +1483,10 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
; SSE41-NEXT: movdqa %xmm4, %xmm6
; SSE41-NEXT: pxor %xmm5, %xmm6
; SSE41-NEXT: movdqa %xmm0, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT: pand %xmm8, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm7
+; SSE41-NEXT: pcmpgtd %xmm6, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; SSE41-NEXT: pand %xmm7, %xmm6
; SSE41-NEXT: por %xmm0, %xmm6
; SSE41-NEXT: pxor %xmm5, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
@@ -1502,10 +1498,10 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT: por %xmm8, %xmm2
; SSE41-NEXT: pxor %xmm6, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
; SSE41-NEXT: movapd {{.*#+}} xmm7 = [9223372036854775807,9223372036854775807]
; SSE41-NEXT: movapd {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808]
; SSE41-NEXT: movapd %xmm6, %xmm8
+; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4
@@ -1515,13 +1511,11 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: pxor %xmm5, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm8
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT: pand %xmm9, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
-; SSE41-NEXT: por %xmm0, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm8
+; SSE41-NEXT: pcmpgtd %xmm2, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2]
+; SSE41-NEXT: pand %xmm8, %xmm9
+; SSE41-NEXT: por %xmm0, %xmm9
; SSE41-NEXT: pxor %xmm5, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pcmpgtd %xmm5, %xmm0
@@ -1531,8 +1525,8 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
; SSE41-NEXT: pand %xmm2, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT: por %xmm3, %xmm2
-; SSE41-NEXT: pxor %xmm8, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: pxor %xmm9, %xmm2
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm6
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1
@@ -1862,12 +1856,10 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSE41-NEXT: movdqa %xmm8, %xmm10
; SSE41-NEXT: pxor %xmm9, %xmm10
; SSE41-NEXT: movdqa %xmm0, %xmm11
-; SSE41-NEXT: pcmpgtd %xmm10, %xmm11
-; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE41-NEXT: pand %xmm12, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
+; SSE41-NEXT: pcmpeqd %xmm10, %xmm11
+; SSE41-NEXT: pcmpgtd %xmm10, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
+; SSE41-NEXT: pand %xmm11, %xmm10
; SSE41-NEXT: por %xmm0, %xmm10
; SSE41-NEXT: pxor %xmm9, %xmm4
; SSE41-NEXT: movdqa %xmm4, %xmm0
@@ -1879,10 +1871,10 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE41-NEXT: por %xmm12, %xmm4
; SSE41-NEXT: pxor %xmm10, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3]
; SSE41-NEXT: movapd {{.*#+}} xmm11 = [9223372036854775807,9223372036854775807]
; SSE41-NEXT: movapd {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808]
; SSE41-NEXT: movapd %xmm10, %xmm12
+; SSE41-NEXT: movdqa %xmm8, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm12
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm8
@@ -1892,13 +1884,11 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: pxor %xmm9, %xmm4
; SSE41-NEXT: movdqa %xmm0, %xmm12
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm12
-; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: pand %xmm13, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3]
-; SSE41-NEXT: por %xmm0, %xmm12
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm12
+; SSE41-NEXT: pcmpgtd %xmm4, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2]
+; SSE41-NEXT: pand %xmm12, %xmm13
+; SSE41-NEXT: por %xmm0, %xmm13
; SSE41-NEXT: pxor %xmm9, %xmm5
; SSE41-NEXT: movdqa %xmm5, %xmm0
; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
@@ -1908,9 +1898,9 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSE41-NEXT: pand %xmm4, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE41-NEXT: por %xmm5, %xmm4
-; SSE41-NEXT: pxor %xmm12, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: pxor %xmm13, %xmm4
; SSE41-NEXT: movapd %xmm10, %xmm5
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm5
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1
@@ -1920,25 +1910,23 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSE41-NEXT: movdqa %xmm2, %xmm4
; SSE41-NEXT: pxor %xmm9, %xmm4
; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm5[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: pand %xmm12, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT: por %xmm0, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm4, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2]
+; SSE41-NEXT: pand %xmm5, %xmm12
+; SSE41-NEXT: por %xmm0, %xmm12
; SSE41-NEXT: pxor %xmm9, %xmm6
; SSE41-NEXT: movdqa %xmm6, %xmm0
; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm9, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE41-NEXT: pand %xmm4, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSE41-NEXT: pand %xmm4, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE41-NEXT: por %xmm6, %xmm4
-; SSE41-NEXT: pxor %xmm5, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE41-NEXT: por %xmm5, %xmm4
+; SSE41-NEXT: pxor %xmm12, %xmm4
; SSE41-NEXT: movapd %xmm10, %xmm5
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm5
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2
@@ -1948,24 +1936,22 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSE41-NEXT: movdqa %xmm3, %xmm4
; SSE41-NEXT: pxor %xmm9, %xmm4
; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT: por %xmm0, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm4, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; SSE41-NEXT: pand %xmm5, %xmm6
+; SSE41-NEXT: por %xmm0, %xmm6
; SSE41-NEXT: pxor %xmm9, %xmm7
; SSE41-NEXT: movdqa %xmm7, %xmm0
; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm9, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
-; SSE41-NEXT: pand %xmm4, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE41-NEXT: pand %xmm4, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE41-NEXT: por %xmm6, %xmm4
-; SSE41-NEXT: pxor %xmm5, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; SSE41-NEXT: por %xmm5, %xmm4
+; SSE41-NEXT: pxor %xmm6, %xmm4
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm10
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm3
diff --git a/llvm/test/CodeGen/X86/test-shrink-bug.ll b/llvm/test/CodeGen/X86/test-shrink-bug.ll
index 1d5da42a8c09b..ab28a3b4a2b63 100644
--- a/llvm/test/CodeGen/X86/test-shrink-bug.ll
+++ b/llvm/test/CodeGen/X86/test-shrink-bug.ll
@@ -64,7 +64,7 @@ define dso_local void @fail(i16 %a, <2 x i8> %b) {
;
; CHECK-X64-LABEL: fail:
; CHECK-X64: # %bb.0:
-; CHECK-X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-X64-NEXT: pslld $8, %xmm0
; CHECK-X64-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-X64-NEXT: pextrw $1, %xmm0, %eax
; CHECK-X64-NEXT: xorb $1, %al
diff --git a/llvm/test/CodeGen/X86/udiv-exact.ll b/llvm/test/CodeGen/X86/udiv-exact.ll
index 208ff4dd32f85..bf560fcc130ae 100644
--- a/llvm/test/CodeGen/X86/udiv-exact.ll
+++ b/llvm/test/CodeGen/X86/udiv-exact.ll
@@ -83,12 +83,11 @@ define <4 x i32> @test5(<4 x i32> %x) {
; X86-NEXT: movdqa %xmm0, %xmm1
; X86-NEXT: psrld $3, %xmm1
; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
-; X86-NEXT: movdqa {{.*#+}} xmm2 = [2863311531,2863311531,3264175145,3264175145]
-; X86-NEXT: movaps %xmm1, %xmm0
-; X86-NEXT: pmuludq %xmm2, %xmm0
+; X86-NEXT: movdqa {{.*#+}} xmm0 = [2863311531,2863311531,3264175145,3264175145]
+; X86-NEXT: pmuludq %xmm1, %xmm0
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X86-NEXT: pmuludq %xmm2, %xmm1
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT: retl
@@ -109,12 +108,11 @@ define <4 x i32> @test6(<4 x i32> %x) {
; X86-NEXT: psrld $3, %xmm1
; X86-NEXT: psrld $1, %xmm0
; X86-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X86-NEXT: movdqa {{.*#+}} xmm2 = [2863311531,2863311531,3303820997,3303820997]
-; X86-NEXT: movapd %xmm0, %xmm1
-; X86-NEXT: pmuludq %xmm2, %xmm1
+; X86-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,3303820997,3303820997]
+; X86-NEXT: pmuludq %xmm0, %xmm1
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X86-NEXT: pmuludq %xmm2, %xmm0
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X86-NEXT: movdqa %xmm1, %xmm0
@@ -132,12 +130,11 @@ define <4 x i32> @test6(<4 x i32> %x) {
define <4 x i32> @test7(<4 x i32> %x) {
; X86-LABEL: test7:
; X86: # %bb.0:
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [3264175145,3264175145,1749801491,1749801491]
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X86-NEXT: pmuludq %xmm1, %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-NEXT: pmuludq %xmm1, %xmm2
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT: retl
;
@@ -155,12 +152,11 @@ define <4 x i32> @test8(<4 x i32> %x) {
; X86-NEXT: movdqa %xmm0, %xmm1
; X86-NEXT: psrld $3, %xmm1
; X86-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; X86-NEXT: movdqa {{.*#+}} xmm2 = [1,1,2863311531,2863311531]
-; X86-NEXT: movapd %xmm1, %xmm0
-; X86-NEXT: pmuludq %xmm2, %xmm0
+; X86-NEXT: movdqa {{.*#+}} xmm0 = [1,1,2863311531,2863311531]
+; X86-NEXT: pmuludq %xmm1, %xmm0
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X86-NEXT: pmuludq %xmm2, %xmm1
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/undo-mul-and.ll b/llvm/test/CodeGen/X86/undo-mul-and.ll
index 7308b1d30e951..6566153b8514b 100644
--- a/llvm/test/CodeGen/X86/undo-mul-and.ll
+++ b/llvm/test/CodeGen/X86/undo-mul-and.ll
@@ -65,7 +65,7 @@ define <4 x i32> @mul_and_to_neg_shl_and_vec_fail_no_splat(<4 x i32> %x) {
; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [56,56,56,64]
; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [56,56,64,64]
+; CHECK-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [56,u,64,u]
; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -94,7 +94,7 @@ define <4 x i32> @mul_and_to_neg_shl_and_vec_todo_no_splat1(<4 x i32> %x) {
; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [56,56,56,48]
; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [56,56,48,48]
+; CHECK-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [56,u,48,u]
; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
index 3ddfb2aa83c75..2228c09bba906 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
@@ -11,17 +11,17 @@ define <4 x i32> @test_urem_odd_even(<4 x i32> %X) nounwind {
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,3264175145,3264175145]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,3067833783,3264175145,3264175145]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,2147483648,1073741824,1073741824]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: por %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,u,3264175145,u]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,1073741824,1073741824]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: por %xmm0, %xmm1
+; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_odd_even:
@@ -160,22 +160,18 @@ define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind {
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,3067833783,3067833783,3067833783]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,3067833783,4294967295,3067833783]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,2147483648,1,2147483648]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; CHECK-SSE2-NEXT: por %xmm1, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm3, %xmm0
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm4, %xmm0
+; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
@@ -242,22 +238,18 @@ define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind {
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,3067833783,3067833783,3067833783]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,3067833783,4294967295,3067833783]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,2147483648,1,2147483648]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; CHECK-SSE2-NEXT: por %xmm1, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm3, %xmm0
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm4, %xmm0
+; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: psrld $31, %xmm0
; CHECK-SSE2-NEXT: retq
@@ -326,17 +318,17 @@ define <4 x i32> @test_urem_odd_even_allones_eq(<4 x i32> %X) nounwind {
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,4294967295,3264175145]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,3067833783,3264175145,3264175145]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,2147483648,1073741824,1073741824]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: por %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,u,3264175145,u]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,1073741824,1073741824]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: por %xmm0, %xmm1
+; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_odd_even_allones_eq:
@@ -399,17 +391,17 @@ define <4 x i32> @test_urem_odd_even_allones_ne(<4 x i32> %X) nounwind {
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,4294967295,3264175145]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,3067833783,3264175145,3264175145]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,2147483648,1073741824,1073741824]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: por %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,u,3264175145,u]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,1073741824,1073741824]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: por %xmm0, %xmm1
+; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: psrld $31, %xmm1
+; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_odd_even_allones_ne:
@@ -475,11 +467,9 @@ define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,1,3435973837]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,1,3435973837]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,268435456,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
@@ -547,22 +537,18 @@ define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,3067833783,3067833783,3067833783]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,3067833783,1,3067833783]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,2147483648,268435456,2147483648]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; CHECK-SSE2-NEXT: por %xmm1, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm3, %xmm0
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm4, %xmm0
+; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
@@ -631,14 +617,10 @@ define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,1,3264175145]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,3067833783,3264175145,3264175145]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2147483648,268435456,1073741824]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,1073741824,1073741824]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,u,3264175145,u]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,u,1073741824,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -654,7 +636,7 @@ define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,1,3264175145]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,1073741824,1073741824]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,u,1073741824,u]
; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2147483648,268435456,1073741824]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
@@ -671,7 +653,7 @@ define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,1,3264175145]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2147483648,2147483648,1073741824,1073741824]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2147483648,u,1073741824,u]
; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2147483648,268435456,1073741824]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
@@ -841,17 +823,17 @@ define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind {
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,0,3264175145]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,3067833783,3264175145,3264175145]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,2147483648,1073741824,1073741824]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: por %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,u,3264175145,u]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,1073741824,1073741824]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: por %xmm0, %xmm1
+; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_odd_even_one:
@@ -917,11 +899,9 @@ define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_INT_MIN:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,1,3435973837]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,1,3435973837]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,2,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
@@ -989,22 +969,18 @@ define <4 x i32> @test_urem_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,3067833783,3067833783,3067833783]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,3067833783,1,3067833783]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,2147483648,2,2147483648]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; CHECK-SSE2-NEXT: por %xmm1, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm3, %xmm0
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm4, %xmm0
+; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
@@ -1073,14 +1049,10 @@ define <4 x i32> @test_urem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,1,3264175145]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,3067833783,3264175145,3264175145]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2147483648,2,1073741824]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,1073741824,1073741824]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,u,3264175145,u]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,u,1073741824,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -1096,7 +1068,7 @@ define <4 x i32> @test_urem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,1,3264175145]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,1073741824,1073741824]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,u,1073741824,u]
; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2147483648,2,1073741824]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
@@ -1113,7 +1085,7 @@ define <4 x i32> @test_urem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,1,3264175145]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2147483648,2147483648,1073741824,1073741824]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2147483648,u,1073741824,u]
; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2147483648,2,1073741824]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
@@ -1157,11 +1129,9 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_allones_and_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,4294967295,1,3435973837]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294967295,4294967295,3435973837,3435973837]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294967295,u,3435973837,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,4294967295,1,3435973837]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,268435456,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
@@ -1229,14 +1199,10 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,4294967295,1,3067833783]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294967295,4294967295,3067833783,3067833783]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,1,268435456,2147483648]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,1,2147483648,2147483648]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294967295,u,3067833783,u]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,2147483648,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -1252,7 +1218,7 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,4294967295,1,3067833783]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,1,2147483648,2147483648]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,2147483648,u]
; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,1,268435456,2147483648]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
@@ -1269,7 +1235,7 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,4294967295,1,3067833783]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,1,2147483648,2147483648]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,u,2147483648,u]
; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2147483648,1,268435456,2147483648]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
@@ -1312,14 +1278,10 @@ define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,4294967295,1,3264175145]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294967295,4294967295,3264175145,3264175145]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,1,268435456,1073741824]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,1,1073741824,1073741824]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294967295,u,3264175145,u]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,1073741824,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -1335,7 +1297,7 @@ define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,4294967295,1,3264175145]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,1,1073741824,1073741824]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,1073741824,u]
; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,1,268435456,1073741824]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
@@ -1352,7 +1314,7 @@ define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,4294967295,1,3264175145]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,1,1073741824,1073741824]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,u,1073741824,u]
; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,1,268435456,1073741824]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
@@ -1398,7 +1360,7 @@ define <4 x i32> @test_urem_odd_allones_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,4294967295,0,3435973837]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294967295,4294967295,3435973837,3435973837]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294967295,u,3435973837,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1434,14 +1396,10 @@ define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,4294967295,0,3067833783]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294967295,4294967295,3067833783,3067833783]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,1,1,2147483648]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,1,2147483648,2147483648]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294967295,u,3067833783,u]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,2147483648,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -1457,7 +1415,7 @@ define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,4294967295,0,3067833783]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,1,2147483648,2147483648]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,2147483648,u]
; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,1,1,2147483648]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
@@ -1474,7 +1432,7 @@ define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind {
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,4294967295,0,3067833783]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,1,2147483648,2147483648]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,u,2147483648,u]
; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2147483648,1,1,2147483648]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
@@ -1517,17 +1475,17 @@ define <4 x i32> @test_urem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,4294967295,0,3264175145]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294967295,4294967295,3264175145,3264175145]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,1,1073741824,1073741824]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: por %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294967295,u,3264175145,u]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,1,1073741824,1073741824]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: por %xmm0, %xmm1
+; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_odd_even_allones_and_one:
@@ -1594,17 +1552,17 @@ define <4 x i32> @test_urem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,1,0,3435973837]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,1,3435973837,3435973837]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,268435456,1,1]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: por %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,3435973837,u]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [268435456,268435456,1,1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: por %xmm0, %xmm1
+; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_odd_poweroftwo_and_one:
@@ -1669,14 +1627,10 @@ define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,1,0,3067833783]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,1,3067833783,3067833783]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,268435456,1,2147483648]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [268435456,268435456,2147483648,2147483648]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,3067833783,u]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [268435456,u,2147483648,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -1692,7 +1646,7 @@ define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,1,0,3067833783]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [268435456,268435456,2147483648,2147483648]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [268435456,u,2147483648,u]
; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,268435456,1,2147483648]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
@@ -1709,7 +1663,7 @@ define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,1,0,3067833783]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [268435456,268435456,2147483648,2147483648]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [268435456,u,2147483648,u]
; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2147483648,268435456,1,2147483648]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
@@ -1752,17 +1706,17 @@ define <4 x i32> @test_urem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,1,0,3264175145]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,1,3264175145,3264175145]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,268435456,1073741824,1073741824]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: por %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,3264175145,u]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [268435456,268435456,1073741824,1073741824]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: por %xmm0, %xmm1
+; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_odd_even_poweroftwo_and_one:
@@ -1827,11 +1781,9 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
; CHECK-SSE2-LABEL: test_urem_odd_allones_and_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,4294967295,1,0]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294967295,4294967295,0,0]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294967295,0,0,0]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,4294967295,1,0]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,268435456,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
@@ -1897,11 +1849,9 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
; CHECK-SSE2-LABEL: test_urem_even_allones_and_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,4294967295,1,0]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294967295,4294967295,0,0]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294967295,0,0,0]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,4294967295,1,0]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,u,268435456,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll
index cff95b5365f39..b490c3cfefb76 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll
@@ -141,7 +141,7 @@ define <4 x i32> @test_urem_odd_neg25(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3264175145,1030792151,1030792151,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1030792151,1030792151,3264175145,3264175145]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1030792151,u,3264175145,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
diff --git a/llvm/test/CodeGen/X86/vec_minmax_sint.ll b/llvm/test/CodeGen/X86/vec_minmax_sint.ll
index c8d988cb011ae..60037d3afcfe1 100644
--- a/llvm/test/CodeGen/X86/vec_minmax_sint.ll
+++ b/llvm/test/CodeGen/X86/vec_minmax_sint.ll
@@ -42,10 +42,9 @@ define <2 x i64> @max_gt_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
@@ -130,10 +129,9 @@ define <4 x i64> @max_gt_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE41-NEXT: pand %xmm8, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm7, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
@@ -142,10 +140,9 @@ define <4 x i64> @max_gt_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT: pand %xmm6, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: movapd %xmm3, %xmm1
@@ -429,10 +426,9 @@ define <2 x i64> @max_ge_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
@@ -517,10 +513,9 @@ define <4 x i64> @max_ge_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE41-NEXT: pand %xmm8, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm7, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
@@ -529,10 +524,9 @@ define <4 x i64> @max_ge_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT: pand %xmm6, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: movapd %xmm3, %xmm1
@@ -816,10 +810,9 @@ define <2 x i64> @min_lt_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
@@ -904,10 +897,9 @@ define <4 x i64> @min_lt_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE41-NEXT: pand %xmm8, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm7, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
@@ -916,10 +908,9 @@ define <4 x i64> @min_lt_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT: pand %xmm6, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: movapd %xmm3, %xmm1
@@ -1204,10 +1195,9 @@ define <2 x i64> @min_le_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
@@ -1292,10 +1282,9 @@ define <4 x i64> @min_le_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE41-NEXT: pand %xmm8, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm7, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
@@ -1304,10 +1293,9 @@ define <4 x i64> @min_le_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT: pand %xmm6, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: movapd %xmm3, %xmm1
diff --git a/llvm/test/CodeGen/X86/vec_minmax_uint.ll b/llvm/test/CodeGen/X86/vec_minmax_uint.ll
index c9bfff4b7dfd7..839ea95671a78 100644
--- a/llvm/test/CodeGen/X86/vec_minmax_uint.ll
+++ b/llvm/test/CodeGen/X86/vec_minmax_uint.ll
@@ -42,10 +42,9 @@ define <2 x i64> @max_gt_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
@@ -141,10 +140,9 @@ define <4 x i64> @max_gt_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE41-NEXT: pand %xmm8, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm7, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
@@ -153,10 +151,9 @@ define <4 x i64> @max_gt_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT: pand %xmm6, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: movapd %xmm3, %xmm1
@@ -456,10 +453,9 @@ define <2 x i64> @max_ge_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
@@ -555,10 +551,9 @@ define <4 x i64> @max_ge_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE41-NEXT: pand %xmm8, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm7, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
@@ -567,10 +562,9 @@ define <4 x i64> @max_ge_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT: pand %xmm6, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: movapd %xmm3, %xmm1
@@ -870,10 +864,9 @@ define <2 x i64> @min_lt_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
@@ -968,10 +961,9 @@ define <4 x i64> @min_lt_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE41-NEXT: pand %xmm8, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm7, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
@@ -980,10 +972,9 @@ define <4 x i64> @min_lt_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT: pand %xmm6, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: movapd %xmm3, %xmm1
@@ -1287,10 +1278,9 @@ define <2 x i64> @min_le_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
@@ -1385,10 +1375,9 @@ define <4 x i64> @min_le_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE41-NEXT: pand %xmm8, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm7, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
@@ -1397,10 +1386,9 @@ define <4 x i64> @min_le_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT: pand %xmm6, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: movapd %xmm3, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-compare-all_of.ll b/llvm/test/CodeGen/X86/vector-compare-all_of.ll
index 8995989ef4474..bf027a7346deb 100644
--- a/llvm/test/CodeGen/X86/vector-compare-all_of.ll
+++ b/llvm/test/CodeGen/X86/vector-compare-all_of.ll
@@ -272,12 +272,10 @@ define i64 @test_v2i64_sext(<2 x i64> %a0, <2 x i64> %a1) {
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: movmskpd %xmm1, %ecx
; SSE2-NEXT: xorl %eax, %eax
@@ -319,25 +317,21 @@ define i64 @test_v4i64_sext(<4 x i64> %a0, <4 x i64> %a1) {
; SSE2-NEXT: pxor %xmm4, %xmm3
; SSE2-NEXT: pxor %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
+; SSE2-NEXT: pand %xmm5, %xmm3
; SSE2-NEXT: por %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: movmskpd %xmm1, %ecx
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: pand %xmm3, %xmm2
+; SSE2-NEXT: movmskpd %xmm2, %ecx
; SSE2-NEXT: xorl %eax, %eax
; SSE2-NEXT: cmpl $3, %ecx
; SSE2-NEXT: sete %al
diff --git a/llvm/test/CodeGen/X86/vector-compare-any_of.ll b/llvm/test/CodeGen/X86/vector-compare-any_of.ll
index 24fee35c73404..2df39d69dbb75 100644
--- a/llvm/test/CodeGen/X86/vector-compare-any_of.ll
+++ b/llvm/test/CodeGen/X86/vector-compare-any_of.ll
@@ -223,12 +223,10 @@ define i64 @test_v2i64_sext(<2 x i64> %a0, <2 x i64> %a1) {
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: movmskpd %xmm1, %ecx
; SSE2-NEXT: xorl %eax, %eax
@@ -268,25 +266,21 @@ define i64 @test_v4i64_sext(<4 x i64> %a0, <4 x i64> %a1) {
; SSE2-NEXT: pxor %xmm4, %xmm3
; SSE2-NEXT: pxor %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
+; SSE2-NEXT: pand %xmm5, %xmm3
; SSE2-NEXT: por %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: movmskpd %xmm1, %ecx
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movmskpd %xmm2, %ecx
; SSE2-NEXT: xorl %eax, %eax
; SSE2-NEXT: negl %ecx
; SSE2-NEXT: sbbq %rax, %rax
diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll
index db7255079fb1f..a0c27601a845d 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll
@@ -1318,34 +1318,19 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
}
define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind {
-; SSE2-LABEL: splatvar_funnnel_v16i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: psllw %xmm2, %xmm3
-; SSE2-NEXT: psrlw $8, %xmm3
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: psllw %xmm2, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: packuswb %xmm3, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: splatvar_funnnel_v16i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
-; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE41-NEXT: psllw %xmm2, %xmm3
-; SSE41-NEXT: psrlw $8, %xmm3
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE41-NEXT: psllw %xmm2, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: packuswb %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: splatvar_funnnel_v16i8:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa %xmm1, %xmm3
+; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE-NEXT: psllw %xmm2, %xmm3
+; SSE-NEXT: psrlw $8, %xmm3
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE-NEXT: psllw %xmm2, %xmm1
+; SSE-NEXT: psrlw $8, %xmm1
+; SSE-NEXT: packuswb %xmm3, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: splatvar_funnnel_v16i8:
; AVX: # %bb.0:
@@ -1443,7 +1428,6 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
;
; X86-SSE2-LABEL: splatvar_funnnel_v16i8:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
@@ -1839,7 +1823,7 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,64,128]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [32,32,128,128]
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [32,u,128,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: por %xmm1, %xmm0
@@ -1953,7 +1937,7 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [16,32,64,128]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 # [32,32,128,128]
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 # [32,u,128,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X86-SSE2-NEXT: por %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
index b0c225dd1ee0e..ec2efcd82395a 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
@@ -1056,32 +1056,18 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
}
define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind {
-; SSE2-LABEL: splatvar_funnnel_v16i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: psllw %xmm1, %xmm2
-; SSE2-NEXT: psrlw $8, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psllw %xmm1, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: splatvar_funnnel_v16i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; SSE41-NEXT: psllw %xmm1, %xmm2
-; SSE41-NEXT: psrlw $8, %xmm2
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE41-NEXT: psllw %xmm1, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: packuswb %xmm2, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: splatvar_funnnel_v16i8:
+; SSE: # %bb.0:
+; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE-NEXT: psllw %xmm1, %xmm2
+; SSE-NEXT: psrlw $8, %xmm2
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: psllw %xmm1, %xmm0
+; SSE-NEXT: psrlw $8, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: splatvar_funnnel_v16i8:
; AVX: # %bb.0:
@@ -1182,10 +1168,9 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind
;
; X86-SSE2-LABEL: splatvar_funnnel_v16i8:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT: psllw %xmm1, %xmm2
; X86-SSE2-NEXT: psrlw $8, %xmm2
; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -1319,7 +1304,7 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind {
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,64,128]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,32,128,128]
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,u,128,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -1331,7 +1316,7 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind {
; SSE41-LABEL: constant_funnnel_v4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,32,128,128]
+; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,u,128,u]
; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,64,128]
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
@@ -1343,7 +1328,7 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind {
; AVX1-LABEL: constant_funnnel_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,32,128,128]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,u,128,u]
; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16,32,64,128]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
@@ -1411,7 +1396,7 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind {
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [16,32,64,128]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [32,32,128,128]
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [32,u,128,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
index d24b8d380ef21..5f7e4070b3783 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
@@ -1082,10 +1082,10 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x) nounwind {
; AVX1-LABEL: constant_funnnel_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,32,128,128]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,u,128,u]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [512,512,2048,2048]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [512,u,2048,u]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16,32,64,128]
; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [256,512,1024,2048]
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll
index 65b8c5b2002e8..304daab6d17a9 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll
@@ -321,7 +321,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,1,1]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,32,1,1]
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,u,1,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -333,7 +333,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; SSE41-LABEL: constant_funnnel_v2i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,32,1,1]
+; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,u,1,u]
; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,1,1]
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
@@ -345,7 +345,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; AVX1-LABEL: constant_funnnel_v2i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,32,1,1]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,u,1,u]
; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16,32,1,1]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
@@ -413,7 +413,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [16,32,1,1]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [32,32,1,1]
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [32,u,1,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
diff --git a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll
index 7bccfc3582927..ae5dd18d4b663 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll
@@ -502,7 +502,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,1,1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [32,32,1,1]
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [32,u,1,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: por %xmm1, %xmm0
@@ -600,7 +600,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [16,32,1,1]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 # [32,32,1,1]
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 # [32,u,1,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X86-SSE2-NEXT: por %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index a5efd829db000..33a6a7679bb9a 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -1446,7 +1446,6 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind {
; SSE2-LABEL: splatvar_funnnel_v16i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
@@ -1588,7 +1587,6 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
;
; X86-SSE2-LABEL: splatvar_funnnel_v16i8:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE2-NEXT: movdqa %xmm1, %xmm4
; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
@@ -1745,7 +1743,7 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,134217728,67108864,33554432]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [134217728,134217728,33554432,33554432]
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [134217728,u,33554432,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: por %xmm1, %xmm0
@@ -1860,7 +1858,7 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [268435456,134217728,67108864,33554432]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 # [134217728,134217728,33554432,33554432]
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 # [134217728,u,33554432,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X86-SSE2-NEXT: por %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
index 560a590a1b091..5d01dfd459f8c 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
@@ -1101,10 +1101,9 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind {
; SSE2-LABEL: splatvar_funnnel_v16i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: psrlw %xmm1, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm3, %xmm2
@@ -1246,10 +1245,9 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind
;
; X86-SSE2-LABEL: splatvar_funnnel_v16i8:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT: psrlw %xmm1, %xmm2
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; X86-SSE2-NEXT: pand %xmm3, %xmm2
@@ -1384,7 +1382,7 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind {
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,134217728,67108864,33554432]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [134217728,134217728,33554432,33554432]
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [134217728,u,33554432,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -1396,7 +1394,7 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind {
; SSE41-LABEL: constant_funnnel_v4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [134217728,134217728,33554432,33554432]
+; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [134217728,u,33554432,u]
; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,134217728,67108864,33554432]
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
@@ -1408,7 +1406,7 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind {
; AVX1-LABEL: constant_funnnel_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [134217728,134217728,33554432,33554432]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [134217728,u,33554432,u]
; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [268435456,134217728,67108864,33554432]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
@@ -1476,7 +1474,7 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind {
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [268435456,134217728,67108864,33554432]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [134217728,134217728,33554432,33554432]
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [134217728,u,33554432,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
index 3172ecb8ae08f..4dc931dd304fb 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
@@ -1134,10 +1134,10 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x) nounwind {
; AVX1-LABEL: constant_funnnel_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [134217728,134217728,33554432,33554432]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [134217728,u,33554432,u]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [8388608,8388608,2097152,2097152]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [8388608,u,2097152,u]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [268435456,134217728,67108864,33554432]
; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [16777216,8388608,4194304,2097152]
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll
index d8569e31835e8..4b42b189538ac 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll
@@ -343,7 +343,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,134217728,1,1]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [134217728,134217728,1,1]
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [134217728,u,1,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -355,7 +355,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; SSE41-LABEL: constant_funnnel_v2i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [134217728,134217728,1,1]
+; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [134217728,u,1,u]
; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,134217728,1,1]
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
@@ -367,7 +367,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; AVX1-LABEL: constant_funnnel_v2i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [134217728,134217728,1,1]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [134217728,u,1,u]
; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [268435456,134217728,1,1]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
@@ -435,7 +435,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [268435456,134217728,1,1]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [134217728,134217728,1,1]
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [134217728,u,1,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
index 09b97df9afead..3acc94d6e1fc4 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
@@ -3422,9 +3422,9 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT: vmovdqa (%r10), %xmm1
-; AVX512-NEXT: vmovdqa 16(%r10), %xmm12
+; AVX512-NEXT: vmovdqa 16(%r10), %xmm11
; AVX512-NEXT: vmovdqa (%rax), %xmm5
-; AVX512-NEXT: vmovdqa 16(%rax), %xmm13
+; AVX512-NEXT: vmovdqa 16(%rax), %xmm12
; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
; AVX512-NEXT: vmovdqa64 %xmm1, %xmm22
; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
@@ -3436,117 +3436,114 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vmovdqa (%r8), %xmm7
; AVX512-NEXT: vmovdqa 16(%r8), %xmm15
; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,5,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,7]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,6,5,7,7]
-; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm4, %ymm4
-; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7],ymm4[8,9,10],ymm1[11],ymm4[12,13,14],ymm1[15]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,0,2,1,4,5,6,7]
+; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm3
+; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,0,2,1,4,5,6,7]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,1,1,3,4,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,1,3,4,5,6,7]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7]
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15]
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16
; AVX512-NEXT: vmovdqa (%rsi), %xmm1
; AVX512-NEXT: vmovdqa (%rdi), %xmm2
; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; AVX512-NEXT: vmovdqa64 %xmm2, %xmm20
; AVX512-NEXT: vmovdqa64 %xmm1, %xmm21
+; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512-NEXT: vmovdqa (%rcx), %xmm8
; AVX512-NEXT: vmovdqa (%rdx), %xmm9
-; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,6,5]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm11 = xmm1[0,1,2,3,4,6,6,7]
-; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm4, %ymm4
-; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm11 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,4,5,2,3,0,1,4,5,4,5,6,7]
-; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3,4],ymm1[5],ymm11[6,7,8],ymm1[9],ymm11[10,11,12],ymm1[13],ymm11[14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7]
+; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm10[0,0,1,1,2,2,3,3]
+; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm13 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero
+; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1],ymm3[2,3,4],ymm13[5],ymm3[6,7,8],ymm13[9],ymm3[10,11,12],ymm13[13],ymm3[14,15]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,4,4,6,5]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,6,7]
+; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm13, %ymm10
+; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,1,3,3,6,5,7,7]
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7,8],ymm4[9],ymm0[10,11,12],ymm4[13],ymm0[14,15]
-; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm19
-; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7,8],ymm10[9],ymm0[10,11,12],ymm10[13],ymm0[14,15]
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm18
+; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,4,6,5]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,4,6,6,7]
+; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,5,5,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,6,5,7,7]
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-NEXT: vmovdqa 16(%rcx), %xmm2
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,0,2,1,4,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[0,2,2,3,4,6,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6],ymm3[7],ymm1[8,9,10],ymm3[11],ymm1[12,13,14],ymm3[15]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,1,1,3,4,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm13[2,1,3,3,4,5,6,7]
+; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm3, %ymm3
+; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15]
+; AVX512-NEXT: vmovdqa 16(%rdx), %xmm10
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17
+; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7]
+; AVX512-NEXT: vinserti32x4 $1, %xmm3, %ymm1, %ymm19
+; AVX512-NEXT: vmovdqa 16(%rsi), %xmm3
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm13
+; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3],xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7]
+; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm4 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3,4],ymm0[5],ymm4[6,7,8],ymm0[9],ymm4[10,11,12],ymm0[13],ymm4[14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm19[2,1,3,3,6,5,7,7]
+; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7,8],ymm4[9],ymm1[10,11,12],ymm4[13],ymm1[14,15]
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19
+; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,6,6,7]
; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
-; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
+; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm11 = xmm4[0,1,2,3,4,5,5,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm4[0,1,2,3,6,5,7,7]
-; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm0[0,0,2,1,4,5,6,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm12 = xmm4[0,1,2,3,6,5,7,7]
+; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm11, %ymm11
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[0,0,2,1,4,5,6,7]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm11, %ymm0
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm12, %ymm0
; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,2,2,3,4,6,6,7]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2],ymm1[3],ymm10[4,5,6],ymm1[7],ymm10[8,9,10],ymm1[11],ymm10[12,13,14],ymm1[15]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,1,1,3,4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,2,2,3,4,6,6,7]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0,1,2],ymm1[3],ymm11[4,5,6],ymm1[7],ymm11[8,9,10],ymm1[11],ymm11[12,13,14],ymm1[15]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm4[0,1,1,3,4,5,6,7]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,3,3,4,5,6,7]
-; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm11, %ymm4
; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15]
-; AVX512-NEXT: vmovdqa 16(%rcx), %xmm2
-; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm18
-; AVX512-NEXT: vmovdqa 16(%rdx), %xmm11
-; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3],xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,4,6,5]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm0[0,1,2,3,4,6,6,7]
-; AVX512-NEXT: vinserti32x4 $1, %xmm10, %ymm4, %ymm16
-; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa64 %xmm3, %xmm23
-; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512-NEXT: vmovdqa 16(%rsi), %xmm4
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero,xmm10[2],zero,zero,zero,xmm10[3],zero,zero,zero
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3,4],ymm0[5],ymm3[6,7,8],ymm0[9],ymm3[10,11,12],ymm0[13],ymm3[14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[2,3,2,3]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm16[2,1,3,3,6,5,7,7]
-; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
-; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2,3,4],ymm10[5],ymm3[6,7,8],ymm10[9],ymm3[10,11,12],ymm10[13],ymm3[14,15]
-; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm16
-; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,4,6,5]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm0[0,1,2,3,4,6,6,7]
-; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm3, %ymm3
-; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm12 = xmm10[0,1,2,3,4,5,5,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,6,5,7,7]
-; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm12
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[0,0,2,1,4,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm13, %ymm0
-; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,2,2,3,4,6,6,7]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0,1,2],ymm3[3],ymm12[4,5,6],ymm3[7],ymm12[8,9,10],ymm3[11],ymm12[12,13,14],ymm3[15]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm10[0,1,1,3,4,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,1,3,3,4,5,6,7]
-; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm12, %ymm10
-; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,0,2,1,4,4,6,5]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3],ymm10[4,5,6],ymm0[7],ymm10[8,9,10],ymm0[11],ymm10[12,13,14],ymm0[15]
-; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm12
-; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm2[8],xmm11[9],xmm2[9],xmm11[10],xmm2[10],xmm11[11],xmm2[11],xmm11[12],xmm2[12],xmm11[13],xmm2[13],xmm11[14],xmm2[14],xmm11[15],xmm2[15]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7]
-; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
-; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX512-NEXT: vmovdqa64 %xmm23, %xmm10
-; AVX512-NEXT: vpshufb %xmm10, %xmm0, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7],ymm4[8,9,10],ymm0[11],ymm4[12,13,14],ymm0[15]
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm11
+; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7]
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm13[8],xmm3[8],xmm13[9],xmm3[9],xmm13[10],xmm3[10],xmm13[11],xmm3[11],xmm13[12],xmm3[12],xmm13[13],xmm3[13],xmm13[14],xmm3[14],xmm13[15],xmm3[15]
+; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3,4],ymm0[5],ymm3[6,7,8],ymm0[9],ymm3[10,11,12],ymm0[13],ymm3[14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15]
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vmovdqa64 %xmm22, %xmm1
; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
@@ -3577,8 +3574,8 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vmovdqa64 %xmm20, %xmm4
; AVX512-NEXT: vmovdqa64 %xmm21, %xmm5
; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
-; AVX512-NEXT: vpshufb %xmm10, %xmm2, %xmm2
; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3,4],ymm2[5],ymm5[6,7,8],ymm2[9],ymm5[10,11,12],ymm2[13],ymm5[14,15]
; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7]
@@ -3589,14 +3586,14 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movw $-21846, %cx # imm = 0xAAAA
; AVX512-NEXT: kmovw %ecx, %k1
+; AVX512-NEXT: vmovdqa32 %zmm16, %zmm18 {%k1}
; AVX512-NEXT: vmovdqa32 %zmm17, %zmm19 {%k1}
-; AVX512-NEXT: vmovdqa32 %zmm18, %zmm16 {%k1}
-; AVX512-NEXT: vmovdqa32 %zmm12, %zmm0 {%k1}
+; AVX512-NEXT: vmovdqa32 %zmm11, %zmm0 {%k1}
; AVX512-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1}
; AVX512-NEXT: vmovdqa64 %zmm2, (%rax)
; AVX512-NEXT: vmovdqa64 %zmm0, 192(%rax)
-; AVX512-NEXT: vmovdqa64 %zmm16, 128(%rax)
-; AVX512-NEXT: vmovdqa64 %zmm19, 64(%rax)
+; AVX512-NEXT: vmovdqa64 %zmm19, 128(%rax)
+; AVX512-NEXT: vmovdqa64 %zmm18, 64(%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
@@ -3769,165 +3766,160 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-NEXT: vmovdqa (%r10), %xmm0
-; AVX512DQ-NEXT: vmovdqa 16(%r10), %xmm10
+; AVX512DQ-NEXT: vmovdqa 16(%r10), %xmm9
; AVX512DQ-NEXT: vmovdqa (%rax), %xmm2
-; AVX512DQ-NEXT: vmovdqa 16(%rax), %xmm11
+; AVX512DQ-NEXT: vmovdqa 16(%rax), %xmm10
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm23
-; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm24
+; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm22
+; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm23
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,4,6,5]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,6,6,7]
; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT: vmovdqa (%r9), %xmm2
-; AVX512DQ-NEXT: vmovdqa 16(%r9), %xmm12
+; AVX512DQ-NEXT: vmovdqa (%r9), %xmm4
+; AVX512DQ-NEXT: vmovdqa 16(%r9), %xmm11
; AVX512DQ-NEXT: vmovdqa (%r8), %xmm6
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
-; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm25
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,5,5,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm5[0,1,2,3,6,5,7,7]
; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7],ymm7[8,9,10],ymm0[11],ymm7[12,13,14],ymm0[15]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7],ymm7[8,9,10],ymm0[11],ymm7[12,13,14],ymm0[15]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[0,0,2,1,4,5,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm7, %ymm13
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,1,1,3,4,5,6,7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm7, %ymm1
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,1,1,3,4,5,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,3,3,4,5,6,7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm5
-; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm0
-; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm21
-; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm22
+; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm5
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm1[3],ymm5[4,5,6],ymm1[7],ymm5[8,9,10],ymm1[11],ymm5[12,13,14],ymm1[15]
+; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm1
+; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2
; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm7
; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm8
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,4,4,6,5]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm15[0,1,2,3,4,6,6,7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero,xmm14[2],zero,zero,zero,xmm14[3],zero,zero,zero
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,0,1,4,5,2,3,0,1,4,5,4,5,6,7]
-; AVX512DQ-NEXT: vpshufb %xmm4, %xmm15, %xmm15
-; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm15 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7,8],ymm15[9],ymm1[10,11,12],ymm15[13],ymm1[14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7]
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm13 = xmm12[0,1,2,3,4,4,6,5]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm14 = xmm12[0,1,2,3,4,6,6,7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm13, %ymm13
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm24
+; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm25
+; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm15 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero,xmm14[2],zero,zero,zero,xmm14[3],zero,zero,zero
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0,0,1,1,2,2,3,3]
+; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm12 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0],ymm12[1],ymm15[2,3,4],ymm12[5],ymm15[6,7,8],ymm12[9],ymm15[10,11,12],ymm12[13],ymm15[14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,1,3,3,6,5,7,7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,3,2,3]
; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero,xmm14[2],zero,zero,zero,xmm14[3],zero,zero,zero
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm0[1],ymm14[2,3,4],ymm0[5],ymm14[6,7,8],ymm0[9],ymm14[10,11,12],ymm0[13],ymm14[14,15]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7,8],ymm13[9],ymm14[10,11,12],ymm13[13],ymm14[14,15]
; AVX512DQ-NEXT: vmovdqa 16(%r8), %xmm14
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm16
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm13[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm16
; AVX512DQ-NEXT: movw $-21846, %ax # imm = 0xAAAA
; AVX512DQ-NEXT: kmovw %eax, %k1
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm16 {%k1}
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7]
-; AVX512DQ-NEXT: vinserti32x4 $1, %xmm2, %ymm1, %ymm20
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,5,5,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm13 = xmm2[0,1,2,3,6,5,7,7]
-; AVX512DQ-NEXT: vinserti32x4 $1, %xmm13, %ymm5, %ymm18
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,0,2,1,4,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm5, %ymm19
-; AVX512DQ-NEXT: vmovdqa 16(%rcx), %xmm15
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,1,1,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT: vmovdqa 16(%rdx), %xmm3
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm13 = xmm5[0,1,2,3,4,4,6,5]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm5[0,1,2,3,4,6,6,7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm13, %ymm9
-; AVX512DQ-NEXT: vpshufb %xmm4, %xmm5, %xmm5
-; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm26
-; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
-; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm5
-; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm13
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3],xmm13[4],xmm5[4],xmm13[5],xmm5[5],xmm13[6],xmm5[6],xmm13[7],xmm5[7]
-; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3,4],ymm1[5],ymm4[6,7,8],ymm1[9],ymm4[10,11,12],ymm1[13],ymm4[14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[2,1,3,3,6,5,7,7]
-; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7,8],ymm4[9],ymm0[10,11,12],ymm4[13],ymm0[14,15]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm17
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm20[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm19[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15]
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm1, %zmm17 {%k1}
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,5,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,6,5,7,7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm4
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[0,0,2,1,4,5,6,7]
+; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm5, %zmm16 {%k1}
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,4,6,5]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm0[0,1,2,3,4,6,6,7]
+; AVX512DQ-NEXT: vinserti32x4 $1, %xmm12, %ymm5, %ymm17
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3],xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm5[0,1,2,3,4,5,5,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm13 = xmm5[0,1,2,3,6,5,7,7]
+; AVX512DQ-NEXT: vinserti32x4 $1, %xmm13, %ymm12, %ymm18
+; AVX512DQ-NEXT: vmovdqa 16(%rcx), %xmm13
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[0,0,2,1,4,5,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm0
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[0,1,1,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm9, %ymm2
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm15[8],xmm3[9],xmm15[9],xmm3[10],xmm15[10],xmm3[11],xmm15[11],xmm3[12],xmm15[12],xmm3[13],xmm15[13],xmm3[14],xmm15[14],xmm3[15],xmm15[15]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm3[0,1,2,3,4,4,6,5]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,4,6,6,7]
+; AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm12, %ymm19
+; AVX512DQ-NEXT: vmovdqa 16(%rdx), %xmm15
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,1,1,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,3,3,4,5,6,7]
+; AVX512DQ-NEXT: vinserti32x4 $1, %xmm5, %ymm0, %ymm20
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3],xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,4,6,5]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm5[0,1,2,3,4,6,6,7]
+; AVX512DQ-NEXT: vinserti32x4 $1, %xmm12, %ymm0, %ymm21
+; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm1
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
+; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
+; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm5
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[2,3,2,3]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm21[2,1,3,3,6,5,7,7]
+; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm12[1],ymm2[2,3,4],ymm12[5],ymm2[6,7,8],ymm12[9],ymm2[10,11,12],ymm12[13],ymm2[14,15]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm12
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm20[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15]
+; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm2, %zmm12 {%k1}
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm14[8],xmm11[8],xmm14[9],xmm11[9],xmm14[10],xmm11[10],xmm14[11],xmm11[11],xmm14[12],xmm11[12],xmm14[13],xmm11[13],xmm14[14],xmm11[14],xmm14[15],xmm11[15]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm3[0,1,2,3,4,5,5,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,6,5,7,7]
; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7],ymm4[8,9,10],ymm1[11],ymm4[12,13,14],ymm1[15]
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm13[8],xmm5[8],xmm13[9],xmm5[9],xmm13[10],xmm5[10],xmm13[11],xmm5[11],xmm13[12],xmm5[12],xmm13[13],xmm5[13],xmm13[14],xmm5[14],xmm13[15],xmm5[15]
-; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
-; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm10
-; AVX512DQ-NEXT: vpshufb %xmm10, %xmm3, %xmm3
-; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7,8],ymm3[9],ymm5[10,11,12],ymm3[13],ymm5[14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm9[2,1,3,3,6,5,7,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
-; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7,8],ymm5[9],ymm4[10,11,12],ymm5[13],ymm4[14,15]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm0[0,0,2,1,4,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm10, %ymm0
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[0,1,1,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm10, %ymm3
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm11 = xmm10[0,1,2,3,4,4,6,5]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,4,6,6,7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm11, %ymm11
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3],ymm9[4,5,6],ymm2[7],ymm9[8,9,10],ymm2[11],ymm9[12,13,14],ymm2[15]
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
+; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0,0,1,1,2,2,3,3]
+; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm9 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm9[1],ymm5[2,3,4],ymm9[5],ymm5[6,7,8],ymm9[9],ymm5[10,11,12],ymm9[13],ymm5[14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm11[2,1,3,3,6,5,7,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7,8],ymm9[9],ymm1[10,11,12],ymm9[13],ymm1[14,15]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15]
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3 {%k1}
-; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm0
-; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm1
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm2
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,5,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,6,5,7,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15]
+; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm1 {%k1}
+; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm0
+; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm2
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,6,5,7,7]
; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,0,2,1,4,5,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,1,1,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,1,1,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,6,5]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,6,6,7]
; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7],ymm4[8,9,10],ymm1[11],ymm4[12,13,14],ymm1[15]
-; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm4
-; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm7
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15]
+; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm4
+; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm7
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
-; AVX512DQ-NEXT: vpshufb %xmm10, %xmm5, %xmm5
; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm7 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7,8],ymm5[9],ymm7[10,11,12],ymm5[13],ymm7[14,15]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,1,3,3,6,5,7,7]
@@ -3936,13 +3928,13 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7,8],ymm6[9],ymm4[10,11,12],ymm6[13],ymm4[14,15]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15]
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm4 {%k1}
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15]
+; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm4 {%k1}
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rax)
-; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rax)
-; AVX512DQ-NEXT: vmovdqa64 %zmm17, 128(%rax)
+; AVX512DQ-NEXT: vmovdqa64 %zmm1, 192(%rax)
+; AVX512DQ-NEXT: vmovdqa64 %zmm12, 128(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm16, 64(%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll
index 1b6fb8d499498..d0bb90c5fc8ab 100644
--- a/llvm/test/CodeGen/X86/vector-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-mul.ll
@@ -132,7 +132,7 @@ define <4 x i32> @mul_v4i32_1_2_4_8(<4 x i32> %a0) nounwind {
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,2,4,8]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [2,2,8,8]
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [2,u,8,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE2-NEXT: retl
@@ -147,7 +147,7 @@ define <4 x i32> @mul_v4i32_1_2_4_8(<4 x i32> %a0) nounwind {
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8]
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2,2,8,8]
+; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2,u,8,u]
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE2-NEXT: retq
@@ -1001,7 +1001,7 @@ define <4 x i32> @mul_v4i32_5_17_33_65(<4 x i32> %a0) nounwind {
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [5,17,33,65]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [17,17,65,65]
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [17,u,65,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE2-NEXT: retl
@@ -1016,7 +1016,7 @@ define <4 x i32> @mul_v4i32_5_17_33_65(<4 x i32> %a0) nounwind {
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [5,17,33,65]
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [17,17,65,65]
+; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [17,u,65,u]
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE2-NEXT: retq
@@ -1775,7 +1775,7 @@ define <4 x i32> @mul_v4i32_0_15_31_7(<4 x i32> %a0) nounwind {
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,15,31,7]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [15,15,7,7]
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [15,u,7,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE2-NEXT: retl
@@ -1790,7 +1790,7 @@ define <4 x i32> @mul_v4i32_0_15_31_7(<4 x i32> %a0) nounwind {
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,15,31,7]
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [15,15,7,7]
+; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [15,u,7,u]
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE2-NEXT: retq
@@ -2027,8 +2027,8 @@ define <2 x i64> @mul_v2i64_zext_cross_bb(ptr %in, ptr %y) {
; X86-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X86-SSE2-NEXT: pxor %xmm1, %xmm1
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,1,3]
+; X86-SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0,0,1,1]
; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0
; X86-SSE2-NEXT: retl
;
diff --git a/llvm/test/CodeGen/X86/vector-pcmp.ll b/llvm/test/CodeGen/X86/vector-pcmp.ll
index 084a1faa516fe..30eb2279bda85 100644
--- a/llvm/test/CodeGen/X86/vector-pcmp.ll
+++ b/llvm/test/CodeGen/X86/vector-pcmp.ll
@@ -473,10 +473,9 @@ define <2 x i64> @cmpgt_zext_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll b/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll
index b0ad27e3a790a..008e3e4c217cb 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll
@@ -1087,13 +1087,11 @@ define double @test_v4f64(<4 x double> %a0) {
;
; SSE41-LABEL: test_v4f64:
; SSE41: # %bb.0:
-; SSE41-NEXT: movaps %xmm0, %xmm2
-; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: movaps %xmm2, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movapd %xmm0, %xmm2
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; SSE41-NEXT: blendvpd %xmm0, %xmm0, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm3
-; SSE41-NEXT: maxpd %xmm4, %xmm3
+; SSE41-NEXT: maxpd %xmm2, %xmm3
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: cmpunordpd %xmm1, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
@@ -1273,45 +1271,45 @@ define double @test_v8f64(<8 x double> %a0) {
;
; SSE41-LABEL: test_v8f64:
; SSE41: # %bb.0:
-; SSE41-NEXT: movaps %xmm0, %xmm4
-; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT: movaps %xmm1, %xmm5
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5
+; SSE41-NEXT: movapd %xmm0, %xmm4
+; SSE41-NEXT: movapd %xmm1, %xmm6
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
-; SSE41-NEXT: movapd %xmm3, %xmm1
-; SSE41-NEXT: maxpd %xmm5, %xmm1
+; SSE41-NEXT: movapd %xmm3, %xmm5
+; SSE41-NEXT: maxpd %xmm6, %xmm5
; SSE41-NEXT: movapd %xmm3, %xmm0
; SSE41-NEXT: cmpunordpd %xmm3, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: movaps %xmm4, %xmm3
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5
+; SSE41-NEXT: movapd %xmm4, %xmm3
+; SSE41-NEXT: movapd %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
-; SSE41-NEXT: movapd %xmm2, %xmm4
-; SSE41-NEXT: maxpd %xmm3, %xmm4
+; SSE41-NEXT: movapd %xmm2, %xmm1
+; SSE41-NEXT: maxpd %xmm3, %xmm1
; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: cmpunordpd %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4
-; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: movapd %xmm4, %xmm2
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm3
-; SSE41-NEXT: maxpd %xmm2, %xmm3
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movapd %xmm1, %xmm2
; SSE41-NEXT: movapd %xmm1, %xmm0
-; SSE41-NEXT: cmpunordpd %xmm1, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
-; SSE41-NEXT: movapd %xmm3, %xmm2
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
-; SSE41-NEXT: movq %xmm3, %rax
+; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5
+; SSE41-NEXT: movapd %xmm5, %xmm1
+; SSE41-NEXT: maxpd %xmm2, %xmm1
+; SSE41-NEXT: movapd %xmm5, %xmm0
+; SSE41-NEXT: cmpunordpd %xmm5, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1
+; SSE41-NEXT: movapd %xmm1, %xmm2
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE41-NEXT: movq %xmm1, %rax
; SSE41-NEXT: testq %rax, %rax
; SSE41-NEXT: js .LBB7_1
; SSE41-NEXT: # %bb.2:
-; SSE41-NEXT: movapd %xmm3, %xmm0
+; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: jmp .LBB7_3
; SSE41-NEXT: .LBB7_1:
; SSE41-NEXT: movapd %xmm2, %xmm0
-; SSE41-NEXT: movapd %xmm3, %xmm2
+; SSE41-NEXT: movapd %xmm1, %xmm2
; SSE41-NEXT: .LBB7_3:
; SSE41-NEXT: movapd %xmm2, %xmm1
; SSE41-NEXT: maxsd %xmm0, %xmm1
@@ -1570,81 +1568,82 @@ define double @test_v16f64(<16 x double> %a0) {
;
; SSE41-LABEL: test_v16f64:
; SSE41: # %bb.0:
-; SSE41-NEXT: movaps %xmm0, %xmm8
-; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT: movaps %xmm3, %xmm9
-; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm9
+; SSE41-NEXT: movapd %xmm1, %xmm8
+; SSE41-NEXT: movapd %xmm0, %xmm1
+; SSE41-NEXT: movapd %xmm3, %xmm10
+; SSE41-NEXT: movapd %xmm3, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm10
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7
-; SSE41-NEXT: movapd %xmm7, %xmm3
-; SSE41-NEXT: maxpd %xmm9, %xmm3
+; SSE41-NEXT: movapd %xmm7, %xmm9
+; SSE41-NEXT: maxpd %xmm10, %xmm9
; SSE41-NEXT: movapd %xmm7, %xmm0
; SSE41-NEXT: cmpunordpd %xmm7, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm3
-; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT: movaps %xmm1, %xmm7
+; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm9
+; SSE41-NEXT: movapd %xmm8, %xmm7
+; SSE41-NEXT: movapd %xmm8, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5
-; SSE41-NEXT: movapd %xmm5, %xmm1
-; SSE41-NEXT: maxpd %xmm7, %xmm1
+; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm5
+; SSE41-NEXT: movapd %xmm5, %xmm3
+; SSE41-NEXT: maxpd %xmm7, %xmm3
; SSE41-NEXT: movapd %xmm5, %xmm0
; SSE41-NEXT: cmpunordpd %xmm5, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1
-; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT: movapd %xmm1, %xmm5
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
-; SSE41-NEXT: movapd %xmm3, %xmm1
-; SSE41-NEXT: maxpd %xmm5, %xmm1
+; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm3
+; SSE41-NEXT: movapd %xmm3, %xmm5
; SSE41-NEXT: movapd %xmm3, %xmm0
-; SSE41-NEXT: cmpunordpd %xmm3, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT: movaps %xmm2, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm3
+; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm5
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm9
+; SSE41-NEXT: movapd %xmm9, %xmm3
+; SSE41-NEXT: maxpd %xmm5, %xmm3
+; SSE41-NEXT: movapd %xmm9, %xmm0
+; SSE41-NEXT: cmpunordpd %xmm9, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm3
+; SSE41-NEXT: movapd %xmm2, %xmm5
+; SSE41-NEXT: movapd %xmm2, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm5
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm6
; SSE41-NEXT: movapd %xmm6, %xmm2
-; SSE41-NEXT: maxpd %xmm3, %xmm2
+; SSE41-NEXT: maxpd %xmm5, %xmm2
; SSE41-NEXT: movapd %xmm6, %xmm0
; SSE41-NEXT: cmpunordpd %xmm6, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm2
-; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm8[1,1,3,3]
-; SSE41-NEXT: movaps %xmm8, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4
-; SSE41-NEXT: movapd %xmm4, %xmm5
-; SSE41-NEXT: maxpd %xmm3, %xmm5
+; SSE41-NEXT: movapd %xmm1, %xmm5
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm5
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4
+; SSE41-NEXT: movapd %xmm4, %xmm1
+; SSE41-NEXT: maxpd %xmm5, %xmm1
; SSE41-NEXT: movapd %xmm4, %xmm0
; SSE41-NEXT: cmpunordpd %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm5
-; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm5[1,1,3,3]
-; SSE41-NEXT: movapd %xmm5, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2
-; SSE41-NEXT: movapd %xmm2, %xmm4
-; SSE41-NEXT: maxpd %xmm3, %xmm4
-; SSE41-NEXT: movapd %xmm2, %xmm0
-; SSE41-NEXT: cmpunordpd %xmm2, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1
+; SSE41-NEXT: movapd %xmm1, %xmm4
+; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4
-; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: movapd %xmm4, %xmm2
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm3
-; SSE41-NEXT: maxpd %xmm2, %xmm3
+; SSE41-NEXT: movapd %xmm2, %xmm1
+; SSE41-NEXT: maxpd %xmm4, %xmm1
+; SSE41-NEXT: movapd %xmm2, %xmm0
+; SSE41-NEXT: cmpunordpd %xmm2, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movapd %xmm1, %xmm2
; SSE41-NEXT: movapd %xmm1, %xmm0
-; SSE41-NEXT: cmpunordpd %xmm1, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
-; SSE41-NEXT: movapd %xmm3, %xmm2
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
-; SSE41-NEXT: movq %xmm3, %rax
+; SSE41-NEXT: movapd %xmm3, %xmm1
+; SSE41-NEXT: maxpd %xmm2, %xmm1
+; SSE41-NEXT: movapd %xmm3, %xmm0
+; SSE41-NEXT: cmpunordpd %xmm3, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; SSE41-NEXT: movapd %xmm1, %xmm2
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE41-NEXT: movq %xmm1, %rax
; SSE41-NEXT: testq %rax, %rax
; SSE41-NEXT: js .LBB8_1
; SSE41-NEXT: # %bb.2:
-; SSE41-NEXT: movapd %xmm3, %xmm0
+; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: jmp .LBB8_3
; SSE41-NEXT: .LBB8_1:
; SSE41-NEXT: movapd %xmm2, %xmm0
-; SSE41-NEXT: movapd %xmm3, %xmm2
+; SSE41-NEXT: movapd %xmm1, %xmm2
; SSE41-NEXT: .LBB8_3:
; SSE41-NEXT: movapd %xmm2, %xmm1
; SSE41-NEXT: maxsd %xmm0, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-reduce-mul.ll b/llvm/test/CodeGen/X86/vector-reduce-mul.ll
index 5e3ef32ef7e4a..f9b37a068862c 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-mul.ll
@@ -19,8 +19,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; SSE-NEXT: pmuludq %xmm0, %xmm3
; SSE-NEXT: paddq %xmm2, %xmm3
; SSE-NEXT: psllq $32, %xmm3
@@ -34,7 +33,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1OR2-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX1OR2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX1OR2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX1OR2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX1OR2-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX1OR2-NEXT: vpsllq $32, %xmm2, %xmm2
@@ -48,7 +47,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
@@ -62,7 +61,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
@@ -107,8 +106,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; SSE-NEXT: pmuludq %xmm0, %xmm3
; SSE-NEXT: paddq %xmm2, %xmm3
; SSE-NEXT: psllq $32, %xmm3
@@ -131,7 +129,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
@@ -155,7 +153,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
@@ -179,7 +177,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
@@ -203,7 +201,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
@@ -274,8 +272,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; SSE-NEXT: pmuludq %xmm0, %xmm3
; SSE-NEXT: paddq %xmm2, %xmm3
; SSE-NEXT: psllq $32, %xmm3
@@ -315,7 +312,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
@@ -347,7 +344,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
@@ -380,7 +377,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
@@ -413,7 +410,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
@@ -527,8 +524,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; SSE-NEXT: pmuludq %xmm0, %xmm3
; SSE-NEXT: paddq %xmm2, %xmm3
; SSE-NEXT: psllq $32, %xmm3
@@ -602,7 +598,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
@@ -650,7 +646,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
@@ -691,7 +687,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
@@ -732,7 +728,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
@@ -808,10 +804,8 @@ define i32 @test_v4i32(<4 x i32> %a0) {
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm3
; SSE2-NEXT: pmuludq %xmm0, %xmm1
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0]
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pmuludq %xmm3, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4i32:
@@ -838,21 +832,15 @@ define i32 @test_v4i32(<4 x i32> %a0) {
define i32 @test_v8i32(<8 x i32> %a0) {
; SSE2-LABEL: test_v8i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm3
; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; SSE2-NEXT: pmuludq %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: pmuludq %xmm0, %xmm1
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0]
-; SSE2-NEXT: pmuludq %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
+; SSE2-NEXT: pmuludq %xmm3, %xmm0
+; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
@@ -910,32 +898,18 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm4, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm4, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,2,2]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm4, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm5, %xmm2
; SSE2-NEXT: pmuludq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: pmuludq %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
-; SSE2-NEXT: pmuludq %xmm0, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
@@ -1000,64 +974,34 @@ define i32 @test_v16i32(<16 x i32> %a0) {
define i32 @test_v32i32(<32 x i32> %a0) {
; SSE2-LABEL: test_v32i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm6[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm8, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm8, %xmm10
+; SSE2-NEXT: pmuludq %xmm9, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm8, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm8, %xmm11
+; SSE2-NEXT: pmuludq %xmm9, %xmm11
+; SSE2-NEXT: pmuludq %xmm10, %xmm11
; SSE2-NEXT: pmuludq %xmm6, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm8, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm8, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
-; SSE2-NEXT: pmuludq %xmm9, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: pmuludq %xmm7, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm4, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm4, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSE2-NEXT: pmuludq %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,2,2]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,0,2,2]
-; SSE2-NEXT: pmuludq %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
-; SSE2-NEXT: pmuludq %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE2-NEXT: pmuludq %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,2,2]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
-; SSE2-NEXT: pmuludq %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT: pmuludq %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE2-NEXT: pmuludq %xmm2, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3]
+; SSE2-NEXT: pmuludq %xmm11, %xmm1
+; SSE2-NEXT: pmuludq %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v32i32:
diff --git a/llvm/test/CodeGen/X86/vector-reduce-smax.ll b/llvm/test/CodeGen/X86/vector-reduce-smax.ll
index 95aea6f524023..4ef13e935d8c9 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-smax.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-smax.ll
@@ -45,10 +45,9 @@ define i64 @test_v2i64(<2 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
; SSE41-NEXT: pmovsxdq %xmm4, %xmm5
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movq %xmm2, %rax
; SSE41-NEXT: retq
@@ -138,10 +137,9 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE41-NEXT: pand %xmm6, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT: movdqa %xmm1, %xmm0
@@ -151,10 +149,9 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
; SSE41-NEXT: pmovsxdq %xmm4, %xmm5
; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movq %xmm2, %rax
; SSE41-NEXT: retq
@@ -298,10 +295,9 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE41-NEXT: pand %xmm8, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm7, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
@@ -311,10 +307,9 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE41-NEXT: pand %xmm7, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
; SSE41-NEXT: movapd %xmm3, %xmm0
; SSE41-NEXT: xorpd %xmm5, %xmm0
@@ -324,10 +319,9 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pand %xmm6, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; SSE41-NEXT: movdqa %xmm3, %xmm0
@@ -337,10 +331,9 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm5, %xmm2
; SSE41-NEXT: pmovsxdq %xmm2, %xmm4
; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT: pand %xmm4, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
; SSE41-NEXT: movq %xmm1, %rax
; SSE41-NEXT: retq
@@ -561,10 +554,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm11
; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
-; SSE41-NEXT: pand %xmm12, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,3,3]
-; SSE41-NEXT: por %xmm10, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
+; SSE41-NEXT: pand %xmm12, %xmm0
+; SSE41-NEXT: por %xmm11, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5
; SSE41-NEXT: movdqa %xmm7, %xmm0
; SSE41-NEXT: pxor %xmm9, %xmm0
@@ -574,10 +566,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm10
; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pand %xmm11, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: pand %xmm11, %xmm0
+; SSE41-NEXT: por %xmm10, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: pxor %xmm9, %xmm0
@@ -587,10 +578,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pand %xmm10, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: pand %xmm10, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4
; SSE41-NEXT: movdqa %xmm6, %xmm0
; SSE41-NEXT: pxor %xmm9, %xmm0
@@ -600,10 +590,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pand %xmm8, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm6
; SSE41-NEXT: movapd %xmm6, %xmm0
; SSE41-NEXT: xorpd %xmm9, %xmm0
@@ -613,10 +602,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pand %xmm3, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6
; SSE41-NEXT: movapd %xmm7, %xmm0
; SSE41-NEXT: xorpd %xmm9, %xmm0
@@ -626,10 +614,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pand %xmm3, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7
; SSE41-NEXT: movapd %xmm7, %xmm0
; SSE41-NEXT: xorpd %xmm9, %xmm0
@@ -639,10 +626,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pand %xmm3, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3]
; SSE41-NEXT: movdqa %xmm7, %xmm0
@@ -652,10 +638,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm9, %xmm2
; SSE41-NEXT: pmovsxdq %xmm2, %xmm3
; SSE41-NEXT: pcmpeqd %xmm0, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3]
-; SSE41-NEXT: pand %xmm3, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,3,3]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1
; SSE41-NEXT: movq %xmm1, %rax
; SSE41-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-reduce-smin.ll b/llvm/test/CodeGen/X86/vector-reduce-smin.ll
index f86d3c1dd6e3f..b9a079b42621a 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-smin.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-smin.ll
@@ -45,10 +45,9 @@ define i64 @test_v2i64(<2 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
; SSE41-NEXT: pmovsxdq %xmm4, %xmm5
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movq %xmm2, %rax
; SSE41-NEXT: retq
@@ -138,10 +137,9 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE41-NEXT: pand %xmm6, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT: movdqa %xmm1, %xmm0
@@ -151,10 +149,9 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
; SSE41-NEXT: pmovsxdq %xmm4, %xmm5
; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movq %xmm2, %rax
; SSE41-NEXT: retq
@@ -299,10 +296,9 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE41-NEXT: pand %xmm8, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm7, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
@@ -312,10 +308,9 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pand %xmm7, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: xorpd %xmm5, %xmm0
@@ -325,10 +320,9 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pand %xmm6, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; SSE41-NEXT: movdqa %xmm3, %xmm0
@@ -338,10 +332,9 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
; SSE41-NEXT: pmovsxdq %xmm2, %xmm4
; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT: pand %xmm4, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
; SSE41-NEXT: movq %xmm1, %rax
; SSE41-NEXT: retq
@@ -562,10 +555,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm11
; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
-; SSE41-NEXT: pand %xmm12, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,3,3]
-; SSE41-NEXT: por %xmm10, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
+; SSE41-NEXT: pand %xmm12, %xmm0
+; SSE41-NEXT: por %xmm11, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm6
; SSE41-NEXT: movdqa %xmm8, %xmm0
; SSE41-NEXT: pxor %xmm9, %xmm0
@@ -575,10 +567,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm10
; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE41-NEXT: pand %xmm11, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE41-NEXT: pand %xmm11, %xmm0
+; SSE41-NEXT: por %xmm10, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm9, %xmm0
@@ -588,10 +579,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm8
; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE41-NEXT: pand %xmm10, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3]
-; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE41-NEXT: pand %xmm10, %xmm0
+; SSE41-NEXT: por %xmm8, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm9, %xmm0
@@ -601,10 +591,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE41-NEXT: pand %xmm8, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5
; SSE41-NEXT: movapd %xmm5, %xmm0
; SSE41-NEXT: xorpd %xmm9, %xmm0
@@ -614,10 +603,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pand %xmm3, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7
; SSE41-NEXT: movapd %xmm4, %xmm0
; SSE41-NEXT: xorpd %xmm9, %xmm0
@@ -627,10 +615,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pand %xmm3, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6
; SSE41-NEXT: movapd %xmm6, %xmm0
; SSE41-NEXT: xorpd %xmm9, %xmm0
@@ -640,10 +627,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pand %xmm3, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3]
; SSE41-NEXT: movdqa %xmm7, %xmm0
@@ -653,10 +639,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
; SSE41-NEXT: pmovsxdq %xmm2, %xmm3
; SSE41-NEXT: pcmpeqd %xmm0, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3]
-; SSE41-NEXT: pand %xmm3, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,3,3]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1
; SSE41-NEXT: movq %xmm1, %rax
; SSE41-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-reduce-umax.ll b/llvm/test/CodeGen/X86/vector-reduce-umax.ll
index 2f3c1e09ea78e..7f813675aa8af 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-umax.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-umax.ll
@@ -45,10 +45,9 @@ define i64 @test_v2i64(<2 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
; SSE41-NEXT: pmovsxdq %xmm4, %xmm5
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movq %xmm2, %rax
; SSE41-NEXT: retq
@@ -156,10 +155,9 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE41-NEXT: pand %xmm6, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT: movdqa %xmm1, %xmm0
@@ -169,10 +167,9 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
; SSE41-NEXT: pmovsxdq %xmm4, %xmm5
; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movq %xmm2, %rax
; SSE41-NEXT: retq
@@ -333,10 +330,9 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE41-NEXT: pand %xmm8, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm7, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
@@ -346,10 +342,9 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE41-NEXT: pand %xmm7, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
; SSE41-NEXT: movapd %xmm3, %xmm0
; SSE41-NEXT: xorpd %xmm5, %xmm0
@@ -359,10 +354,9 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pand %xmm6, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; SSE41-NEXT: movdqa %xmm3, %xmm0
@@ -372,10 +366,9 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm5, %xmm2
; SSE41-NEXT: pmovsxdq %xmm2, %xmm4
; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT: pand %xmm4, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
; SSE41-NEXT: movq %xmm1, %rax
; SSE41-NEXT: retq
@@ -624,10 +617,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm11
; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
-; SSE41-NEXT: pand %xmm12, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,3,3]
-; SSE41-NEXT: por %xmm10, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
+; SSE41-NEXT: pand %xmm12, %xmm0
+; SSE41-NEXT: por %xmm11, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5
; SSE41-NEXT: movdqa %xmm7, %xmm0
; SSE41-NEXT: pxor %xmm9, %xmm0
@@ -637,10 +629,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm10
; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pand %xmm11, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: pand %xmm11, %xmm0
+; SSE41-NEXT: por %xmm10, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: pxor %xmm9, %xmm0
@@ -650,10 +641,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pand %xmm10, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: pand %xmm10, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4
; SSE41-NEXT: movdqa %xmm6, %xmm0
; SSE41-NEXT: pxor %xmm9, %xmm0
@@ -663,10 +653,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pand %xmm8, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm6
; SSE41-NEXT: movapd %xmm6, %xmm0
; SSE41-NEXT: xorpd %xmm9, %xmm0
@@ -676,10 +665,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pand %xmm3, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6
; SSE41-NEXT: movapd %xmm7, %xmm0
; SSE41-NEXT: xorpd %xmm9, %xmm0
@@ -689,10 +677,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pand %xmm3, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7
; SSE41-NEXT: movapd %xmm7, %xmm0
; SSE41-NEXT: xorpd %xmm9, %xmm0
@@ -702,10 +689,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pand %xmm3, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3]
; SSE41-NEXT: movdqa %xmm7, %xmm0
@@ -715,10 +701,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm9, %xmm2
; SSE41-NEXT: pmovsxdq %xmm2, %xmm3
; SSE41-NEXT: pcmpeqd %xmm0, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3]
-; SSE41-NEXT: pand %xmm3, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,3,3]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1
; SSE41-NEXT: movq %xmm1, %rax
; SSE41-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-reduce-umin.ll b/llvm/test/CodeGen/X86/vector-reduce-umin.ll
index dc8d0d53a91a1..8926d162514ac 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-umin.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-umin.ll
@@ -45,10 +45,9 @@ define i64 @test_v2i64(<2 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
; SSE41-NEXT: pmovsxdq %xmm4, %xmm5
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movq %xmm2, %rax
; SSE41-NEXT: retq
@@ -156,10 +155,9 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE41-NEXT: pand %xmm6, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT: movdqa %xmm1, %xmm0
@@ -169,10 +167,9 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
; SSE41-NEXT: pmovsxdq %xmm4, %xmm5
; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movq %xmm2, %rax
; SSE41-NEXT: retq
@@ -335,10 +332,9 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE41-NEXT: pand %xmm8, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm7, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
@@ -348,10 +344,9 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pand %xmm7, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: xorpd %xmm5, %xmm0
@@ -361,10 +356,9 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pand %xmm6, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; SSE41-NEXT: movdqa %xmm3, %xmm0
@@ -374,10 +368,9 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
; SSE41-NEXT: pmovsxdq %xmm2, %xmm4
; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT: pand %xmm4, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
; SSE41-NEXT: movq %xmm1, %rax
; SSE41-NEXT: retq
@@ -628,10 +621,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm11
; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
-; SSE41-NEXT: pand %xmm12, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,3,3]
-; SSE41-NEXT: por %xmm10, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
+; SSE41-NEXT: pand %xmm12, %xmm0
+; SSE41-NEXT: por %xmm11, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm6
; SSE41-NEXT: movdqa %xmm8, %xmm0
; SSE41-NEXT: pxor %xmm9, %xmm0
@@ -641,10 +633,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm10
; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE41-NEXT: pand %xmm11, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE41-NEXT: pand %xmm11, %xmm0
+; SSE41-NEXT: por %xmm10, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm9, %xmm0
@@ -654,10 +645,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm8
; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE41-NEXT: pand %xmm10, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3]
-; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE41-NEXT: pand %xmm10, %xmm0
+; SSE41-NEXT: por %xmm8, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm9, %xmm0
@@ -667,10 +657,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE41-NEXT: pand %xmm8, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5
; SSE41-NEXT: movapd %xmm5, %xmm0
; SSE41-NEXT: xorpd %xmm9, %xmm0
@@ -680,10 +669,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pand %xmm3, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7
; SSE41-NEXT: movapd %xmm4, %xmm0
; SSE41-NEXT: xorpd %xmm9, %xmm0
@@ -693,10 +681,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pand %xmm3, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6
; SSE41-NEXT: movapd %xmm6, %xmm0
; SSE41-NEXT: xorpd %xmm9, %xmm0
@@ -706,10 +693,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pand %xmm3, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3]
; SSE41-NEXT: movdqa %xmm7, %xmm0
@@ -719,10 +705,9 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
; SSE41-NEXT: pmovsxdq %xmm2, %xmm3
; SSE41-NEXT: pcmpeqd %xmm0, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3]
-; SSE41-NEXT: pand %xmm3, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,3,3]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1
; SSE41-NEXT: movq %xmm1, %rax
; SSE41-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll
index 764b815f539ff..0bf5a8d6daeae 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-128.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll
@@ -929,32 +929,18 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
}
define <16 x i8> @splatvar_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
-; SSE2-LABEL: splatvar_rotate_v16i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: psllw %xmm1, %xmm2
-; SSE2-NEXT: psrlw $8, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psllw %xmm1, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: splatvar_rotate_v16i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; SSE41-NEXT: psllw %xmm1, %xmm2
-; SSE41-NEXT: psrlw $8, %xmm2
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE41-NEXT: psllw %xmm1, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: packuswb %xmm2, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: splatvar_rotate_v16i8:
+; SSE: # %bb.0:
+; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE-NEXT: psllw %xmm1, %xmm2
+; SSE-NEXT: psrlw $8, %xmm2
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: psllw %xmm1, %xmm0
+; SSE-NEXT: psrlw $8, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: splatvar_rotate_v16i8:
; AVX: # %bb.0:
@@ -995,10 +981,9 @@ define <16 x i8> @splatvar_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
;
; X86-SSE2-LABEL: splatvar_rotate_v16i8:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT: psllw %xmm1, %xmm2
; X86-SSE2-NEXT: psrlw $8, %xmm2
; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -1109,7 +1094,7 @@ define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind {
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,64,128]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,32,128,128]
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,u,128,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -1121,7 +1106,7 @@ define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind {
; SSE41-LABEL: constant_rotate_v4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,32,128,128]
+; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,u,128,u]
; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,64,128]
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
@@ -1133,7 +1118,7 @@ define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind {
; AVX1-LABEL: constant_rotate_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,32,128,128]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,u,128,u]
; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16,32,64,128]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
@@ -1173,7 +1158,7 @@ define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind {
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [16,32,64,128]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [32,32,128,128]
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [32,u,128,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll
index c112538f26131..5ae3e2f5d7621 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-256.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll
@@ -895,10 +895,10 @@ define <8 x i32> @constant_rotate_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: constant_rotate_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,32,128,128]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,u,128,u]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [512,512,2048,2048]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [512,u,2048,u]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16,32,64,128]
; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [256,512,1024,2048]
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
index 9cb1a982519c7..3085c325e0968 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -989,7 +989,7 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind {
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,64,128]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,32,128,128]
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,u,128,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
@@ -1034,7 +1034,7 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind {
; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [16,32,64,128]
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [32,32,128,128]
+; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [32,u,128,u]
; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
index a7313fb35ee17..7a08f3ef116bd 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -387,9 +387,9 @@ define void @PR39483() {
; X86-AVX1-NEXT: vmovups 64, %ymm1
; X86-AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
; X86-AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[0,3],ymm2[4,5],ymm1[4,7]
-; X86-AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; X86-AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,0],ymm2[2,0],ymm3[5,4],ymm2[6,4]
+; X86-AVX1-NEXT: vmovups 16, %xmm2
+; X86-AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; X86-AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0],ymm3[2,0],ymm2[5,4],ymm3[6,4]
; X86-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[0,3],ymm2[6,4],ymm0[4,7]
; X86-AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; X86-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
@@ -427,9 +427,9 @@ define void @PR39483() {
; X64-AVX1-NEXT: vmovups 64, %ymm1
; X64-AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
; X64-AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[0,3],ymm2[4,5],ymm1[4,7]
-; X64-AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; X64-AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,0],ymm2[2,0],ymm3[5,4],ymm2[6,4]
+; X64-AVX1-NEXT: vmovups 16, %xmm2
+; X64-AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; X64-AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0],ymm3[2,0],ymm2[5,4],ymm3[6,4]
; X64-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[0,3],ymm2[6,4],ymm0[4,7]
; X64-AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; X64-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
index f344b9c624c77..18d79b67ea5bc 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -780,7 +780,7 @@ define <16 x i8> @combine_shl_pshufb(<4 x i32> %a0) {
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSSE3-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,256,65536,65536]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSSE3-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [256,256,65536,65536]
+; SSSE3-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [256,u,65536,u]
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,2,3,0,5,6,7,4,9,10,11,8,12,13,14,15]
@@ -846,20 +846,23 @@ define <16 x i8> @constant_fold_pshufb_2() {
define i32 @mask_zzz3_v16i8(<16 x i8> %a0) {
; SSSE3-LABEL: mask_zzz3_v16i8:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[14,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: movd %xmm0, %eax
+; SSSE3-NEXT: andl $-16777216, %eax # imm = 0xFF000000
; SSSE3-NEXT: retq
;
; SSE41-LABEL: mask_zzz3_v16i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[14]
+; SSE41-NEXT: psllw $8, %xmm0
; SSE41-NEXT: pextrd $3, %xmm0, %eax
+; SSE41-NEXT: andl $-16777216, %eax # imm = 0xFF000000
; SSE41-NEXT: retq
;
; AVX-LABEL: mask_zzz3_v16i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[14]
+; AVX-NEXT: vpsllw $8, %xmm0, %xmm0
; AVX-NEXT: vpextrd $3, %xmm0, %eax
+; AVX-NEXT: andl $-16777216, %eax # imm = 0xFF000000
; AVX-NEXT: retq
%1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14>)
%2 = bitcast <16 x i8> %1 to <4 x i32>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
index e3a2080234251..ee9d8a55aeb3e 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -2854,16 +2854,13 @@ define <4 x float> @PR30264(<4 x float> %x) {
define <8 x i16> @PR39549(<16 x i8> %x) {
; SSE-LABEL: PR39549:
; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; SSE-NEXT: psraw $8, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE-NEXT: psraw $8, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: PR39549:
; AVX: # %bb.0:
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX-NEXT: vpsraw $8, %xmm0, %xmm0
; AVX-NEXT: retq
%a = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 8, i32 undef, i32 9, i32 undef, i32 10, i32 undef, i32 11, i32 undef, i32 12, i32 undef, i32 13, i32 undef, i32 14, i32 undef, i32 15, i32 undef>
diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll
index 2f07166df813a..423537777a0e1 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-math.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll
@@ -2373,25 +2373,25 @@ define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,1,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [1,1,3,3]
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [1,u,3,u]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [5,5,7,7]
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [5,u,7,u]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [8,9,10,11]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [9,9,11,11]
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [9,u,11,u]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [12,13,14,15]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [13,13,15,15]
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [13,u,15,u]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
index 9481d9ae70471..48874e6e2ff27 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
@@ -61,10 +61,9 @@ define <2 x i32> @trunc_packus_v2i64_v2i32(<2 x i64> %a0) {
; SSE41-NEXT: pcmpeqd %xmm5, %xmm4
; SSE41-NEXT: pmovsxdq {{.*#+}} xmm6 = [2147483647,2147483647]
; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT: por %xmm7, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: xorpd %xmm3, %xmm0
@@ -72,10 +71,9 @@ define <2 x i32> @trunc_packus_v2i64_v2i32(<2 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm3, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm4, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
; SSE41-NEXT: retq
@@ -182,10 +180,9 @@ define void @trunc_packus_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) {
; SSE41-NEXT: pcmpeqd %xmm5, %xmm4
; SSE41-NEXT: pmovsxdq {{.*#+}} xmm6 = [2147483647,2147483647]
; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT: por %xmm7, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: xorpd %xmm3, %xmm0
@@ -193,10 +190,9 @@ define void @trunc_packus_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) {
; SSE41-NEXT: pcmpgtd %xmm3, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm4, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
; SSE41-NEXT: movq %xmm0, (%rdi)
@@ -330,10 +326,9 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) {
; SSE41-NEXT: pmovsxdq {{.*#+}} xmm7 = [2147483647,2147483647]
; SSE41-NEXT: movdqa %xmm7, %xmm8
; SSE41-NEXT: pcmpgtd %xmm0, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3]
-; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm8, %xmm0
; SSE41-NEXT: movapd %xmm5, %xmm6
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6
; SSE41-NEXT: movdqa %xmm1, %xmm0
@@ -341,10 +336,9 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) {
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE41-NEXT: pcmpeqd %xmm2, %xmm3
; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE41-NEXT: por %xmm8, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm7, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5
; SSE41-NEXT: movapd %xmm5, %xmm0
; SSE41-NEXT: xorpd %xmm4, %xmm0
@@ -352,10 +346,9 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm4, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm3, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT: por %xmm7, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1
; SSE41-NEXT: movapd %xmm6, %xmm0
@@ -364,10 +357,9 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm4, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm2
; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
; SSE41-NEXT: movaps %xmm2, %xmm0
@@ -604,10 +596,9 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="25
; SSE41-NEXT: pmovsxdq {{.*#+}} xmm7 = [2147483647,2147483647]
; SSE41-NEXT: movdqa %xmm7, %xmm10
; SSE41-NEXT: pcmpgtd %xmm0, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm11
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE41-NEXT: por %xmm11, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm10, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm4
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4
; SSE41-NEXT: movdqa %xmm9, %xmm0
@@ -616,10 +607,9 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="25
; SSE41-NEXT: pcmpeqd %xmm2, %xmm5
; SSE41-NEXT: movdqa %xmm7, %xmm10
; SSE41-NEXT: pcmpgtd %xmm0, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm11
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE41-NEXT: por %xmm11, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm10, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm5
; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm5
; SSE41-NEXT: movdqa %xmm8, %xmm0
@@ -628,10 +618,9 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="25
; SSE41-NEXT: pcmpeqd %xmm2, %xmm9
; SSE41-NEXT: movdqa %xmm7, %xmm10
; SSE41-NEXT: pcmpgtd %xmm0, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE41-NEXT: pand %xmm9, %xmm11
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE41-NEXT: por %xmm11, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
+; SSE41-NEXT: pand %xmm9, %xmm0
+; SSE41-NEXT: por %xmm10, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm9
; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm9
; SSE41-NEXT: movdqa %xmm6, %xmm0
@@ -639,10 +628,9 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="25
; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
; SSE41-NEXT: pcmpeqd %xmm2, %xmm8
; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm8, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE41-NEXT: por %xmm10, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm7, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: xorpd %xmm3, %xmm0
@@ -650,10 +638,9 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="25
; SSE41-NEXT: pcmpgtd %xmm3, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm7, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT: por %xmm8, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: pxor %xmm6, %xmm6
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6
; SSE41-NEXT: movapd %xmm9, %xmm0
@@ -662,10 +649,9 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="25
; SSE41-NEXT: pcmpgtd %xmm3, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm7, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT: por %xmm8, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm1
; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[0,2]
@@ -675,10 +661,9 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="25
; SSE41-NEXT: pcmpgtd %xmm3, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm7, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT: por %xmm8, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: pxor %xmm6, %xmm6
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm6
; SSE41-NEXT: movapd %xmm4, %xmm0
@@ -687,10 +672,9 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="25
; SSE41-NEXT: pcmpgtd %xmm3, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm7, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm6[0,2]
; SSE41-NEXT: movaps %xmm2, %xmm0
@@ -816,10 +800,9 @@ define <2 x i16> @trunc_packus_v2i64_v2i16(<2 x i64> %a0) {
; SSE41-NEXT: pcmpeqd %xmm5, %xmm4
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147549183,2147549183]
; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT: por %xmm7, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: xorpd %xmm3, %xmm0
@@ -827,10 +810,9 @@ define <2 x i16> @trunc_packus_v2i64_v2i16(<2 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm3, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm4, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
@@ -962,10 +944,9 @@ define void @trunc_packus_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) {
; SSE41-NEXT: pcmpeqd %xmm5, %xmm4
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147549183,2147549183]
; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT: por %xmm7, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: xorpd %xmm3, %xmm0
@@ -973,10 +954,9 @@ define void @trunc_packus_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) {
; SSE41-NEXT: pcmpgtd %xmm3, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm4, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
@@ -1140,10 +1120,9 @@ define <4 x i16> @trunc_packus_v4i64_v4i16(<4 x i64> %a0) {
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm7 = [2147549183,2147549183]
; SSE41-NEXT: movdqa %xmm7, %xmm8
; SSE41-NEXT: pcmpgtd %xmm0, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3]
-; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm8, %xmm0
; SSE41-NEXT: movapd %xmm5, %xmm6
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6
; SSE41-NEXT: movdqa %xmm1, %xmm0
@@ -1151,10 +1130,9 @@ define <4 x i16> @trunc_packus_v4i64_v4i16(<4 x i64> %a0) {
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE41-NEXT: pcmpeqd %xmm2, %xmm3
; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE41-NEXT: por %xmm8, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm7, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5
; SSE41-NEXT: movapd %xmm5, %xmm0
; SSE41-NEXT: xorpd %xmm4, %xmm0
@@ -1162,10 +1140,9 @@ define <4 x i16> @trunc_packus_v4i64_v4i16(<4 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm4, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm3, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT: por %xmm7, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1
; SSE41-NEXT: movapd %xmm6, %xmm0
@@ -1174,10 +1151,9 @@ define <4 x i16> @trunc_packus_v4i64_v4i16(<4 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm4, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm2
; SSE41-NEXT: packusdw %xmm1, %xmm2
; SSE41-NEXT: packusdw %xmm2, %xmm2
@@ -1336,10 +1312,9 @@ define void @trunc_packus_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm7 = [2147549183,2147549183]
; SSE41-NEXT: movdqa %xmm7, %xmm8
; SSE41-NEXT: pcmpgtd %xmm0, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3]
-; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm8, %xmm0
; SSE41-NEXT: movapd %xmm5, %xmm6
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm6
; SSE41-NEXT: movdqa %xmm1, %xmm0
@@ -1347,10 +1322,9 @@ define void @trunc_packus_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT: pcmpeqd %xmm3, %xmm2
; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE41-NEXT: por %xmm8, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: por %xmm7, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5
; SSE41-NEXT: movapd %xmm5, %xmm0
; SSE41-NEXT: xorpd %xmm4, %xmm0
@@ -1358,10 +1332,9 @@ define void @trunc_packus_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
; SSE41-NEXT: pcmpgtd %xmm4, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm2, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT: por %xmm7, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1
; SSE41-NEXT: movapd %xmm6, %xmm0
@@ -1370,10 +1343,9 @@ define void @trunc_packus_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
; SSE41-NEXT: pcmpgtd %xmm4, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm3
; SSE41-NEXT: packusdw %xmm1, %xmm3
; SSE41-NEXT: packusdw %xmm3, %xmm3
@@ -1593,10 +1565,9 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="25
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm8 = [2147549183,2147549183]
; SSE41-NEXT: movdqa %xmm8, %xmm10
; SSE41-NEXT: pcmpgtd %xmm0, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm11
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE41-NEXT: por %xmm11, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm10, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm4
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4
; SSE41-NEXT: movdqa %xmm9, %xmm0
@@ -1605,10 +1576,9 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="25
; SSE41-NEXT: pcmpeqd %xmm2, %xmm5
; SSE41-NEXT: movdqa %xmm8, %xmm10
; SSE41-NEXT: pcmpgtd %xmm0, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm11
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE41-NEXT: por %xmm11, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm10, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm5
; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm5
; SSE41-NEXT: movdqa %xmm7, %xmm0
@@ -1617,10 +1587,9 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="25
; SSE41-NEXT: pcmpeqd %xmm2, %xmm9
; SSE41-NEXT: movdqa %xmm8, %xmm10
; SSE41-NEXT: pcmpgtd %xmm0, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE41-NEXT: pand %xmm9, %xmm11
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE41-NEXT: por %xmm11, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
+; SSE41-NEXT: pand %xmm9, %xmm0
+; SSE41-NEXT: por %xmm10, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm9
; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm9
; SSE41-NEXT: movdqa %xmm6, %xmm0
@@ -1628,10 +1597,9 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="25
; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
; SSE41-NEXT: pcmpeqd %xmm2, %xmm7
; SSE41-NEXT: pcmpgtd %xmm0, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2]
-; SSE41-NEXT: pand %xmm7, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3]
-; SSE41-NEXT: por %xmm10, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm8, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: xorpd %xmm3, %xmm0
@@ -1639,10 +1607,9 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="25
; SSE41-NEXT: pcmpgtd %xmm3, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm7, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT: por %xmm8, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: pxor %xmm6, %xmm6
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6
; SSE41-NEXT: movapd %xmm9, %xmm0
@@ -1651,10 +1618,9 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="25
; SSE41-NEXT: pcmpgtd %xmm3, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm7, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT: por %xmm8, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm1
; SSE41-NEXT: packusdw %xmm6, %xmm1
@@ -1664,10 +1630,9 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="25
; SSE41-NEXT: pcmpgtd %xmm3, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm7, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT: por %xmm8, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: pxor %xmm6, %xmm6
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm6
; SSE41-NEXT: movapd %xmm4, %xmm0
@@ -1676,10 +1641,9 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="25
; SSE41-NEXT: pcmpgtd %xmm3, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm7, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE41-NEXT: packusdw %xmm6, %xmm2
; SSE41-NEXT: packusdw %xmm2, %xmm1
@@ -2248,33 +2212,31 @@ define <2 x i8> @trunc_packus_v2i64_v2i8(<2 x i64> %a0) {
;
; SSE41-LABEL: trunc_packus_v2i64_v2i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movapd {{.*#+}} xmm3 = [255,255]
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483648,2147483648]
; SSE41-NEXT: pxor %xmm4, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm5
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm5
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483903,2147483903]
; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT: por %xmm7, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
; SSE41-NEXT: movapd %xmm3, %xmm0
; SSE41-NEXT: xorpd %xmm4, %xmm0
-; SSE41-NEXT: movapd %xmm0, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
+; SSE41-NEXT: movapd %xmm0, %xmm1
+; SSE41-NEXT: pcmpgtd %xmm4, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc_packus_v2i64_v2i8:
@@ -2407,31 +2369,29 @@ define void @trunc_packus_v2i64_v2i8_store(<2 x i64> %a0, ptr%p1) {
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255]
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483648,2147483648]
-; SSE41-NEXT: pxor %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm5
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
+; SSE41-NEXT: pxor %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; SSE41-NEXT: pxor %xmm5, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm4
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483903,2147483903]
; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT: por %xmm7, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movapd %xmm2, %xmm0
-; SSE41-NEXT: xorpd %xmm4, %xmm0
+; SSE41-NEXT: xorpd %xmm3, %xmm0
; SSE41-NEXT: movapd %xmm0, %xmm1
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; SSE41-NEXT: pshufb {{.*#+}} xmm3 = xmm3[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; SSE41-NEXT: pextrw $0, %xmm3, (%rdi)
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
+; SSE41-NEXT: pshufb {{.*#+}} xmm5 = xmm5[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE41-NEXT: pextrw $0, %xmm5, (%rdi)
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc_packus_v2i64_v2i8_store:
@@ -2566,10 +2526,9 @@ define <4 x i8> @trunc_packus_v4i64_v4i8(<4 x i64> %a0) {
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm7 = [2147483903,2147483903]
; SSE41-NEXT: movdqa %xmm7, %xmm8
; SSE41-NEXT: pcmpgtd %xmm0, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3]
-; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm8, %xmm0
; SSE41-NEXT: movapd %xmm5, %xmm6
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6
; SSE41-NEXT: movdqa %xmm1, %xmm0
@@ -2577,10 +2536,9 @@ define <4 x i8> @trunc_packus_v4i64_v4i8(<4 x i64> %a0) {
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE41-NEXT: pcmpeqd %xmm2, %xmm3
; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE41-NEXT: por %xmm8, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm7, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5
; SSE41-NEXT: movapd %xmm5, %xmm0
; SSE41-NEXT: xorpd %xmm4, %xmm0
@@ -2588,10 +2546,9 @@ define <4 x i8> @trunc_packus_v4i64_v4i8(<4 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm4, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm3, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT: por %xmm7, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1
; SSE41-NEXT: movapd %xmm6, %xmm0
@@ -2600,10 +2557,9 @@ define <4 x i8> @trunc_packus_v4i64_v4i8(<4 x i64> %a0) {
; SSE41-NEXT: pcmpgtd %xmm4, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm2
; SSE41-NEXT: packusdw %xmm1, %xmm2
; SSE41-NEXT: packusdw %xmm2, %xmm2
@@ -2766,10 +2722,9 @@ define void @trunc_packus_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm7 = [2147483903,2147483903]
; SSE41-NEXT: movdqa %xmm7, %xmm8
; SSE41-NEXT: pcmpgtd %xmm0, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3]
-; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm8, %xmm0
; SSE41-NEXT: movapd %xmm5, %xmm6
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm6
; SSE41-NEXT: movdqa %xmm1, %xmm0
@@ -2777,10 +2732,9 @@ define void @trunc_packus_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT: pcmpeqd %xmm3, %xmm2
; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE41-NEXT: por %xmm8, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: por %xmm7, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5
; SSE41-NEXT: movapd %xmm5, %xmm0
; SSE41-NEXT: xorpd %xmm4, %xmm0
@@ -2788,10 +2742,9 @@ define void @trunc_packus_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
; SSE41-NEXT: pcmpgtd %xmm4, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm2, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT: por %xmm7, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1
; SSE41-NEXT: movapd %xmm6, %xmm0
@@ -2800,10 +2753,9 @@ define void @trunc_packus_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
; SSE41-NEXT: pcmpgtd %xmm4, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm3
; SSE41-NEXT: packusdw %xmm1, %xmm3
; SSE41-NEXT: packusdw %xmm3, %xmm3
@@ -3027,10 +2979,9 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256"
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm8 = [2147483903,2147483903]
; SSE41-NEXT: movdqa %xmm8, %xmm10
; SSE41-NEXT: pcmpgtd %xmm0, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm11
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE41-NEXT: por %xmm11, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm10, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm4
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4
; SSE41-NEXT: movdqa %xmm9, %xmm0
@@ -3039,10 +2990,9 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256"
; SSE41-NEXT: pcmpeqd %xmm2, %xmm5
; SSE41-NEXT: movdqa %xmm8, %xmm10
; SSE41-NEXT: pcmpgtd %xmm0, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm11
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE41-NEXT: por %xmm11, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm10, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm5
; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm5
; SSE41-NEXT: movdqa %xmm7, %xmm0
@@ -3051,10 +3001,9 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256"
; SSE41-NEXT: pcmpeqd %xmm2, %xmm9
; SSE41-NEXT: movdqa %xmm8, %xmm10
; SSE41-NEXT: pcmpgtd %xmm0, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE41-NEXT: pand %xmm9, %xmm11
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE41-NEXT: por %xmm11, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
+; SSE41-NEXT: pand %xmm9, %xmm0
+; SSE41-NEXT: por %xmm10, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm9
; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm9
; SSE41-NEXT: movdqa %xmm6, %xmm0
@@ -3062,10 +3011,9 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256"
; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
; SSE41-NEXT: pcmpeqd %xmm2, %xmm7
; SSE41-NEXT: pcmpgtd %xmm0, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2]
-; SSE41-NEXT: pand %xmm7, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3]
-; SSE41-NEXT: por %xmm10, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm8, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: xorpd %xmm3, %xmm0
@@ -3073,10 +3021,9 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256"
; SSE41-NEXT: pcmpgtd %xmm3, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm7, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT: por %xmm8, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: pxor %xmm6, %xmm6
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6
; SSE41-NEXT: movapd %xmm9, %xmm0
@@ -3085,10 +3032,9 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256"
; SSE41-NEXT: pcmpgtd %xmm3, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm7, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT: por %xmm8, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm1
; SSE41-NEXT: packusdw %xmm6, %xmm1
@@ -3098,10 +3044,9 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256"
; SSE41-NEXT: pcmpgtd %xmm3, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm7, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT: por %xmm8, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: pxor %xmm6, %xmm6
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm6
; SSE41-NEXT: movapd %xmm4, %xmm0
@@ -3110,10 +3055,9 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256"
; SSE41-NEXT: pcmpgtd %xmm3, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm7, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE41-NEXT: packusdw %xmm6, %xmm2
; SSE41-NEXT: packusdw %xmm2, %xmm1
@@ -3329,10 +3273,9 @@ define void @trunc_packus_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-wi
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm8 = [2147483903,2147483903]
; SSE41-NEXT: movdqa %xmm8, %xmm10
; SSE41-NEXT: pcmpgtd %xmm0, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm11
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE41-NEXT: por %xmm11, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm10, %xmm0
; SSE41-NEXT: movapd %xmm5, %xmm3
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3
; SSE41-NEXT: movdqa %xmm9, %xmm0
@@ -3341,10 +3284,9 @@ define void @trunc_packus_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-wi
; SSE41-NEXT: pcmpeqd %xmm1, %xmm4
; SSE41-NEXT: movdqa %xmm8, %xmm10
; SSE41-NEXT: pcmpgtd %xmm0, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm11
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE41-NEXT: por %xmm11, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm10, %xmm0
; SSE41-NEXT: movapd %xmm5, %xmm4
; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm4
; SSE41-NEXT: movdqa %xmm7, %xmm0
@@ -3353,10 +3295,9 @@ define void @trunc_packus_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-wi
; SSE41-NEXT: pcmpeqd %xmm1, %xmm9
; SSE41-NEXT: movdqa %xmm8, %xmm10
; SSE41-NEXT: pcmpgtd %xmm0, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE41-NEXT: pand %xmm9, %xmm11
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE41-NEXT: por %xmm11, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
+; SSE41-NEXT: pand %xmm9, %xmm0
+; SSE41-NEXT: por %xmm10, %xmm0
; SSE41-NEXT: movapd %xmm5, %xmm9
; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm9
; SSE41-NEXT: movdqa %xmm6, %xmm0
@@ -3364,10 +3305,9 @@ define void @trunc_packus_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-wi
; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
; SSE41-NEXT: pcmpeqd %xmm1, %xmm7
; SSE41-NEXT: pcmpgtd %xmm0, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2]
-; SSE41-NEXT: pand %xmm7, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3]
-; SSE41-NEXT: por %xmm10, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm8, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm5
; SSE41-NEXT: movapd %xmm5, %xmm0
; SSE41-NEXT: xorpd %xmm2, %xmm0
@@ -3375,10 +3315,9 @@ define void @trunc_packus_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-wi
; SSE41-NEXT: pcmpgtd %xmm2, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm7, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT: por %xmm8, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: pxor %xmm6, %xmm6
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm6
; SSE41-NEXT: movapd %xmm9, %xmm0
@@ -3387,10 +3326,9 @@ define void @trunc_packus_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-wi
; SSE41-NEXT: pcmpgtd %xmm2, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm7, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
-; SSE41-NEXT: por %xmm8, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm5
; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm5
; SSE41-NEXT: packusdw %xmm6, %xmm5
@@ -3400,10 +3338,9 @@ define void @trunc_packus_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-wi
; SSE41-NEXT: pcmpgtd %xmm2, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm7, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT: por %xmm8, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: pxor %xmm6, %xmm6
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6
; SSE41-NEXT: movapd %xmm3, %xmm0
@@ -3412,10 +3349,9 @@ define void @trunc_packus_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-wi
; SSE41-NEXT: pcmpgtd %xmm2, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm7, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
; SSE41-NEXT: packusdw %xmm6, %xmm1
; SSE41-NEXT: packusdw %xmm1, %xmm5
@@ -3741,10 +3677,9 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="2
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm10 = [2147483903,2147483903]
; SSE41-NEXT: movdqa %xmm10, %xmm14
; SSE41-NEXT: pcmpgtd %xmm0, %xmm14
-; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm15
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,3,3]
-; SSE41-NEXT: por %xmm15, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm14, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm4
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4
; SSE41-NEXT: movdqa %xmm6, %xmm0
@@ -3753,10 +3688,9 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="2
; SSE41-NEXT: pcmpeqd %xmm2, %xmm5
; SSE41-NEXT: movdqa %xmm10, %xmm14
; SSE41-NEXT: pcmpgtd %xmm0, %xmm14
-; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm15
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,3,3]
-; SSE41-NEXT: por %xmm15, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,2,2]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm14, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm5
; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm5
; SSE41-NEXT: movdqa %xmm7, %xmm0
@@ -3765,10 +3699,9 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="2
; SSE41-NEXT: pcmpeqd %xmm2, %xmm6
; SSE41-NEXT: movdqa %xmm10, %xmm14
; SSE41-NEXT: pcmpgtd %xmm0, %xmm14
-; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm15
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,3,3]
-; SSE41-NEXT: por %xmm15, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,2,2]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm14, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm6
; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm6
; SSE41-NEXT: movdqa %xmm11, %xmm0
@@ -3777,10 +3710,9 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="2
; SSE41-NEXT: pcmpeqd %xmm2, %xmm7
; SSE41-NEXT: movdqa %xmm10, %xmm14
; SSE41-NEXT: pcmpgtd %xmm0, %xmm14
-; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
-; SSE41-NEXT: pand %xmm7, %xmm15
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,3,3]
-; SSE41-NEXT: por %xmm15, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,2,2]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm14, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm7
; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm7
; SSE41-NEXT: movdqa %xmm13, %xmm0
@@ -3789,10 +3721,9 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="2
; SSE41-NEXT: pcmpeqd %xmm2, %xmm11
; SSE41-NEXT: movdqa %xmm10, %xmm14
; SSE41-NEXT: pcmpgtd %xmm0, %xmm14
-; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
-; SSE41-NEXT: pand %xmm11, %xmm15
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,3,3]
-; SSE41-NEXT: por %xmm15, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,2,2]
+; SSE41-NEXT: pand %xmm11, %xmm0
+; SSE41-NEXT: por %xmm14, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm11
; SSE41-NEXT: blendvpd %xmm0, %xmm13, %xmm11
; SSE41-NEXT: movdqa %xmm12, %xmm0
@@ -3801,10 +3732,9 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="2
; SSE41-NEXT: pcmpeqd %xmm2, %xmm13
; SSE41-NEXT: movdqa %xmm10, %xmm14
; SSE41-NEXT: pcmpgtd %xmm0, %xmm14
-; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
-; SSE41-NEXT: pand %xmm13, %xmm15
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,3,3]
-; SSE41-NEXT: por %xmm15, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,2,2]
+; SSE41-NEXT: pand %xmm13, %xmm0
+; SSE41-NEXT: por %xmm14, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm13
; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm13
; SSE41-NEXT: movdqa %xmm9, %xmm0
@@ -3813,10 +3743,9 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="2
; SSE41-NEXT: pcmpeqd %xmm2, %xmm12
; SSE41-NEXT: movdqa %xmm10, %xmm14
; SSE41-NEXT: pcmpgtd %xmm0, %xmm14
-; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
-; SSE41-NEXT: pand %xmm12, %xmm15
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,3,3]
-; SSE41-NEXT: por %xmm15, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,2,2]
+; SSE41-NEXT: pand %xmm12, %xmm0
+; SSE41-NEXT: por %xmm14, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm12
; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm12
; SSE41-NEXT: movdqa %xmm8, %xmm0
@@ -3824,10 +3753,9 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="2
; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3]
; SSE41-NEXT: pcmpeqd %xmm2, %xmm9
; SSE41-NEXT: pcmpgtd %xmm0, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm10[0,0,2,2]
-; SSE41-NEXT: pand %xmm9, %xmm14
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE41-NEXT: por %xmm14, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
+; SSE41-NEXT: pand %xmm9, %xmm0
+; SSE41-NEXT: por %xmm10, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: xorpd %xmm3, %xmm0
@@ -3835,10 +3763,9 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="2
; SSE41-NEXT: pcmpgtd %xmm3, %xmm8
; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm9, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3]
-; SSE41-NEXT: por %xmm10, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm9, %xmm0
+; SSE41-NEXT: por %xmm8, %xmm0
; SSE41-NEXT: pxor %xmm8, %xmm8
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm8
; SSE41-NEXT: movapd %xmm12, %xmm0
@@ -3847,10 +3774,9 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="2
; SSE41-NEXT: pcmpgtd %xmm3, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm1[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm9, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT: por %xmm10, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm9, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm1
; SSE41-NEXT: packusdw %xmm8, %xmm1
@@ -3860,10 +3786,9 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="2
; SSE41-NEXT: pcmpgtd %xmm3, %xmm8
; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm9, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3]
-; SSE41-NEXT: por %xmm10, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm9, %xmm0
+; SSE41-NEXT: por %xmm8, %xmm0
; SSE41-NEXT: pxor %xmm8, %xmm8
; SSE41-NEXT: blendvpd %xmm0, %xmm13, %xmm8
; SSE41-NEXT: movapd %xmm11, %xmm0
@@ -3872,10 +3797,9 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="2
; SSE41-NEXT: pcmpgtd %xmm3, %xmm9
; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm10, %xmm12
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,3,3]
-; SSE41-NEXT: por %xmm12, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm10, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
; SSE41-NEXT: pxor %xmm9, %xmm9
; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm9
; SSE41-NEXT: packusdw %xmm8, %xmm9
@@ -3886,10 +3810,9 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="2
; SSE41-NEXT: pcmpgtd %xmm3, %xmm8
; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm9, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3]
-; SSE41-NEXT: por %xmm10, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm9, %xmm0
+; SSE41-NEXT: por %xmm8, %xmm0
; SSE41-NEXT: pxor %xmm8, %xmm8
; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8
; SSE41-NEXT: movapd %xmm6, %xmm0
@@ -3898,10 +3821,9 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="2
; SSE41-NEXT: pcmpgtd %xmm3, %xmm7
; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm9, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE41-NEXT: por %xmm10, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm9, %xmm0
+; SSE41-NEXT: por %xmm7, %xmm0
; SSE41-NEXT: pxor %xmm7, %xmm7
; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7
; SSE41-NEXT: packusdw %xmm8, %xmm7
@@ -3911,10 +3833,9 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="2
; SSE41-NEXT: pcmpgtd %xmm3, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm8, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: pxor %xmm6, %xmm6
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm6
; SSE41-NEXT: movapd %xmm4, %xmm0
@@ -3923,10 +3844,9 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="2
; SSE41-NEXT: pcmpgtd %xmm3, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm8, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE41-NEXT: packusdw %xmm6, %xmm2
; SSE41-NEXT: packusdw %xmm2, %xmm7
diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
index 5bfe2c1702880..d3c6b7538ac03 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
@@ -63,10 +63,9 @@ define <2 x i32> @trunc_ssat_v2i64_v2i32(<2 x i64> %a0) {
; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm4 = [4294967295,0,4294967295,0]
; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movapd {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
; SSE41-NEXT: pxor %xmm2, %xmm3
@@ -74,10 +73,9 @@ define <2 x i32> @trunc_ssat_v2i64_v2i32(<2 x i64> %a0) {
; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SSE41-NEXT: retq
@@ -189,10 +187,9 @@ define void @trunc_ssat_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) {
; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm4 = [4294967295,0,4294967295,0]
; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movapd {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
; SSE41-NEXT: pxor %xmm2, %xmm3
@@ -200,10 +197,9 @@ define void @trunc_ssat_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) {
; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SSE41-NEXT: movq %xmm0, (%rdi)
@@ -347,10 +343,9 @@ define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) {
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm7 = [4294967295,0,4294967295,0]
; SSE41-NEXT: movdqa %xmm7, %xmm8
; SSE41-NEXT: pcmpgtd %xmm0, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3]
-; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm8, %xmm0
; SSE41-NEXT: movapd %xmm4, %xmm5
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
; SSE41-NEXT: movdqa %xmm1, %xmm0
@@ -358,33 +353,30 @@ define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) {
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT: pcmpeqd %xmm6, %xmm2
; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: por %xmm7, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4
; SSE41-NEXT: movapd {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
-; SSE41-NEXT: movapd %xmm4, %xmm0
-; SSE41-NEXT: xorpd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm2
-; SSE41-NEXT: pmovsxbd {{.*#+}} xmm7 = [0,4294967295,0,4294967295]
-; SSE41-NEXT: pcmpgtd %xmm7, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: por %xmm8, %xmm0
+; SSE41-NEXT: movapd %xmm4, %xmm2
+; SSE41-NEXT: xorpd %xmm3, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
+; SSE41-NEXT: pcmpeqd %xmm7, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm7, %xmm6
+; SSE41-NEXT: pmovsxbd {{.*#+}} xmm8 = [0,4294967295,0,4294967295]
+; SSE41-NEXT: pcmpgtd %xmm8, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm2
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE41-NEXT: xorpd %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm7, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; SSE41-NEXT: pcmpeqd %xmm7, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm8, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1
; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; SSE41-NEXT: movaps %xmm1, %xmm0
@@ -624,10 +616,9 @@ define <8 x i32> @trunc_ssat_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="256"
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm6 = [4294967295,0,4294967295,0]
; SSE41-NEXT: movdqa %xmm6, %xmm10
; SSE41-NEXT: pcmpgtd %xmm0, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm11
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE41-NEXT: por %xmm11, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm10, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm4
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4
; SSE41-NEXT: movdqa %xmm8, %xmm0
@@ -636,10 +627,9 @@ define <8 x i32> @trunc_ssat_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="256"
; SSE41-NEXT: pcmpeqd %xmm9, %xmm5
; SSE41-NEXT: movdqa %xmm6, %xmm10
; SSE41-NEXT: pcmpgtd %xmm0, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm11
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE41-NEXT: por %xmm11, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm10, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm5
; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm5
; SSE41-NEXT: movdqa %xmm7, %xmm0
@@ -648,10 +638,9 @@ define <8 x i32> @trunc_ssat_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="256"
; SSE41-NEXT: pcmpeqd %xmm9, %xmm8
; SSE41-NEXT: movdqa %xmm6, %xmm10
; SSE41-NEXT: pcmpgtd %xmm0, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE41-NEXT: pand %xmm8, %xmm11
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE41-NEXT: por %xmm11, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm10, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm8
; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8
; SSE41-NEXT: movdqa %xmm2, %xmm0
@@ -659,56 +648,51 @@ define <8 x i32> @trunc_ssat_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="256"
; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
; SSE41-NEXT: pcmpeqd %xmm9, %xmm7
; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm7, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movapd {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968]
-; SSE41-NEXT: movapd %xmm1, %xmm0
-; SSE41-NEXT: xorpd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3]
+; SSE41-NEXT: movapd %xmm1, %xmm9
+; SSE41-NEXT: xorpd %xmm3, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm9[1,1,3,3]
; SSE41-NEXT: pcmpeqd %xmm6, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm10
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm7 = [0,4294967295,0,4294967295]
-; SSE41-NEXT: pcmpgtd %xmm7, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
-; SSE41-NEXT: pand %xmm9, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: por %xmm10, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm7, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pand %xmm10, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
; SSE41-NEXT: movapd %xmm2, %xmm9
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm9
-; SSE41-NEXT: movapd %xmm8, %xmm0
-; SSE41-NEXT: xorpd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm1
-; SSE41-NEXT: pcmpgtd %xmm7, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
-; SSE41-NEXT: pand %xmm1, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: por %xmm10, %xmm0
+; SSE41-NEXT: movapd %xmm8, %xmm1
+; SSE41-NEXT: xorpd %xmm3, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3]
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm10
+; SSE41-NEXT: pcmpgtd %xmm7, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
+; SSE41-NEXT: pand %xmm10, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: movapd %xmm2, %xmm1
; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1
; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm9[0,2]
-; SSE41-NEXT: movapd %xmm5, %xmm0
-; SSE41-NEXT: xorpd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm8
-; SSE41-NEXT: pcmpgtd %xmm7, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2]
-; SSE41-NEXT: pand %xmm8, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: movapd %xmm5, %xmm8
+; SSE41-NEXT: xorpd %xmm3, %xmm8
+; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3]
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm7, %xmm8
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2]
+; SSE41-NEXT: pand %xmm9, %xmm0
+; SSE41-NEXT: por %xmm8, %xmm0
; SSE41-NEXT: movapd %xmm2, %xmm8
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm8
; SSE41-NEXT: xorpd %xmm4, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm5
; SSE41-NEXT: pcmpgtd %xmm7, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm8[0,2]
; SSE41-NEXT: movaps %xmm2, %xmm0
@@ -835,10 +819,9 @@ define <2 x i16> @trunc_ssat_v2i64_v2i16(<2 x i64> %a0) {
; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147516415,2147516415]
; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movapd {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
; SSE41-NEXT: pxor %xmm2, %xmm3
@@ -846,10 +829,9 @@ define <2 x i16> @trunc_ssat_v2i64_v2i16(<2 x i64> %a0) {
; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
@@ -972,10 +954,9 @@ define void @trunc_ssat_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) {
; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147516415,2147516415]
; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movapd {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
; SSE41-NEXT: pxor %xmm2, %xmm3
@@ -983,10 +964,9 @@ define void @trunc_ssat_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) {
; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
@@ -1144,10 +1124,9 @@ define <4 x i16> @trunc_ssat_v4i64_v4i16(<4 x i64> %a0) {
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm7 = [2147516415,2147516415]
; SSE41-NEXT: movdqa %xmm7, %xmm8
; SSE41-NEXT: pcmpgtd %xmm0, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3]
-; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm8, %xmm0
; SSE41-NEXT: movapd %xmm4, %xmm5
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
; SSE41-NEXT: movdqa %xmm1, %xmm0
@@ -1155,33 +1134,30 @@ define <4 x i16> @trunc_ssat_v4i64_v4i16(<4 x i64> %a0) {
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT: pcmpeqd %xmm6, %xmm2
; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: por %xmm7, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4
; SSE41-NEXT: movapd {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
-; SSE41-NEXT: movapd %xmm4, %xmm0
-; SSE41-NEXT: xorpd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562035200,18446744071562035200]
-; SSE41-NEXT: pcmpgtd %xmm7, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: por %xmm8, %xmm0
+; SSE41-NEXT: movapd %xmm4, %xmm2
+; SSE41-NEXT: xorpd %xmm3, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
+; SSE41-NEXT: pcmpeqd %xmm7, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm7, %xmm6
+; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [18446744071562035200,18446744071562035200]
+; SSE41-NEXT: pcmpgtd %xmm8, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm2
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE41-NEXT: xorpd %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm7, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; SSE41-NEXT: pcmpeqd %xmm7, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm8, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1
; SSE41-NEXT: packssdw %xmm2, %xmm1
; SSE41-NEXT: packssdw %xmm1, %xmm1
@@ -1334,10 +1310,9 @@ define void @trunc_ssat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm7 = [2147516415,2147516415]
; SSE41-NEXT: movdqa %xmm7, %xmm8
; SSE41-NEXT: pcmpgtd %xmm0, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3]
-; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm8, %xmm0
; SSE41-NEXT: movapd %xmm4, %xmm5
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
; SSE41-NEXT: movdqa %xmm1, %xmm0
@@ -1345,33 +1320,30 @@ define void @trunc_ssat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT: pcmpeqd %xmm6, %xmm2
; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: por %xmm7, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4
; SSE41-NEXT: movapd {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
-; SSE41-NEXT: movapd %xmm4, %xmm0
-; SSE41-NEXT: xorpd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562035200,18446744071562035200]
-; SSE41-NEXT: pcmpgtd %xmm7, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: por %xmm8, %xmm0
+; SSE41-NEXT: movapd %xmm4, %xmm2
+; SSE41-NEXT: xorpd %xmm3, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
+; SSE41-NEXT: pcmpeqd %xmm7, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm7, %xmm6
+; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [18446744071562035200,18446744071562035200]
+; SSE41-NEXT: pcmpgtd %xmm8, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm2
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE41-NEXT: xorpd %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm7, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; SSE41-NEXT: pcmpeqd %xmm7, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm8, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1
; SSE41-NEXT: packssdw %xmm2, %xmm1
; SSE41-NEXT: packssdw %xmm1, %xmm1
@@ -1587,10 +1559,9 @@ define <8 x i16> @trunc_ssat_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256"
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm7 = [2147516415,2147516415]
; SSE41-NEXT: movdqa %xmm7, %xmm10
; SSE41-NEXT: pcmpgtd %xmm0, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm11
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE41-NEXT: por %xmm11, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm10, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm3
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3
; SSE41-NEXT: movdqa %xmm8, %xmm0
@@ -1599,10 +1570,9 @@ define <8 x i16> @trunc_ssat_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256"
; SSE41-NEXT: pcmpeqd %xmm9, %xmm4
; SSE41-NEXT: movdqa %xmm7, %xmm10
; SSE41-NEXT: pcmpgtd %xmm0, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm11
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE41-NEXT: por %xmm11, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm10, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm4
; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4
; SSE41-NEXT: movdqa %xmm6, %xmm0
@@ -1611,10 +1581,9 @@ define <8 x i16> @trunc_ssat_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256"
; SSE41-NEXT: pcmpeqd %xmm9, %xmm8
; SSE41-NEXT: movdqa %xmm7, %xmm10
; SSE41-NEXT: pcmpgtd %xmm0, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE41-NEXT: pand %xmm8, %xmm11
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE41-NEXT: por %xmm11, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm10, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm8
; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm8
; SSE41-NEXT: movdqa %xmm5, %xmm0
@@ -1622,56 +1591,51 @@ define <8 x i16> @trunc_ssat_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256"
; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
; SSE41-NEXT: pcmpeqd %xmm9, %xmm6
; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm7, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1
; SSE41-NEXT: movapd {{.*#+}} xmm5 = [18446744073709518848,18446744073709518848]
-; SSE41-NEXT: movapd %xmm1, %xmm0
-; SSE41-NEXT: xorpd %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3]
+; SSE41-NEXT: movapd %xmm1, %xmm9
+; SSE41-NEXT: xorpd %xmm2, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm9[1,1,3,3]
; SSE41-NEXT: pcmpeqd %xmm6, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm10
; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562035200,18446744071562035200]
-; SSE41-NEXT: pcmpgtd %xmm7, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
-; SSE41-NEXT: pand %xmm9, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: por %xmm10, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm7, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pand %xmm10, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
; SSE41-NEXT: movapd %xmm5, %xmm9
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm9
-; SSE41-NEXT: movapd %xmm8, %xmm0
-; SSE41-NEXT: xorpd %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm1
-; SSE41-NEXT: pcmpgtd %xmm7, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
-; SSE41-NEXT: pand %xmm1, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: por %xmm10, %xmm0
+; SSE41-NEXT: movapd %xmm8, %xmm1
+; SSE41-NEXT: xorpd %xmm2, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3]
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm10
+; SSE41-NEXT: pcmpgtd %xmm7, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
+; SSE41-NEXT: pand %xmm10, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: movapd %xmm5, %xmm1
; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1
; SSE41-NEXT: packssdw %xmm9, %xmm1
-; SSE41-NEXT: movapd %xmm4, %xmm0
-; SSE41-NEXT: xorpd %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm8
-; SSE41-NEXT: pcmpgtd %xmm7, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2]
-; SSE41-NEXT: pand %xmm8, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: movapd %xmm4, %xmm8
+; SSE41-NEXT: xorpd %xmm2, %xmm8
+; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3]
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm7, %xmm8
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2]
+; SSE41-NEXT: pand %xmm9, %xmm0
+; SSE41-NEXT: por %xmm8, %xmm0
; SSE41-NEXT: movapd %xmm5, %xmm8
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm8
; SSE41-NEXT: xorpd %xmm3, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm4
; SSE41-NEXT: pcmpgtd %xmm7, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSE41-NEXT: pand %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5
; SSE41-NEXT: packssdw %xmm8, %xmm5
; SSE41-NEXT: packssdw %xmm5, %xmm1
@@ -2016,10 +1980,9 @@ define <2 x i8> @trunc_ssat_v2i64_v2i8(<2 x i64> %a0) {
; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483775,2147483775]
; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movapd {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
; SSE41-NEXT: pxor %xmm2, %xmm3
@@ -2027,10 +1990,9 @@ define <2 x i8> @trunc_ssat_v2i64_v2i8(<2 x i64> %a0) {
; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE41-NEXT: movdqa %xmm1, %xmm0
@@ -2165,10 +2127,9 @@ define void @trunc_ssat_v2i64_v2i8_store(<2 x i64> %a0, ptr%p1) {
; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483775,2147483775]
; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movapd {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
; SSE41-NEXT: pxor %xmm2, %xmm3
@@ -2176,10 +2137,9 @@ define void @trunc_ssat_v2i64_v2i8_store(<2 x i64> %a0, ptr%p1) {
; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE41-NEXT: pextrw $0, %xmm1, (%rdi)
@@ -2311,10 +2271,9 @@ define <4 x i8> @trunc_ssat_v4i64_v4i8(<4 x i64> %a0) {
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm7 = [2147483775,2147483775]
; SSE41-NEXT: movdqa %xmm7, %xmm8
; SSE41-NEXT: pcmpgtd %xmm0, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3]
-; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm8, %xmm0
; SSE41-NEXT: movapd %xmm4, %xmm5
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
; SSE41-NEXT: movdqa %xmm1, %xmm0
@@ -2322,33 +2281,30 @@ define <4 x i8> @trunc_ssat_v4i64_v4i8(<4 x i64> %a0) {
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT: pcmpeqd %xmm6, %xmm2
; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: por %xmm7, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4
; SSE41-NEXT: movapd {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
-; SSE41-NEXT: movapd %xmm4, %xmm0
-; SSE41-NEXT: xorpd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562067840,18446744071562067840]
-; SSE41-NEXT: pcmpgtd %xmm7, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: por %xmm8, %xmm0
+; SSE41-NEXT: movapd %xmm4, %xmm2
+; SSE41-NEXT: xorpd %xmm3, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
+; SSE41-NEXT: pcmpeqd %xmm7, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm7, %xmm6
+; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [18446744071562067840,18446744071562067840]
+; SSE41-NEXT: pcmpgtd %xmm8, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm2
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE41-NEXT: xorpd %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm7, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; SSE41-NEXT: pcmpeqd %xmm7, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm8, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1
; SSE41-NEXT: packssdw %xmm2, %xmm1
; SSE41-NEXT: packssdw %xmm1, %xmm1
@@ -2505,10 +2461,9 @@ define void @trunc_ssat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm7 = [2147483775,2147483775]
; SSE41-NEXT: movdqa %xmm7, %xmm8
; SSE41-NEXT: pcmpgtd %xmm0, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3]
-; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm8, %xmm0
; SSE41-NEXT: movapd %xmm4, %xmm5
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
; SSE41-NEXT: movdqa %xmm1, %xmm0
@@ -2516,33 +2471,30 @@ define void @trunc_ssat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT: pcmpeqd %xmm6, %xmm2
; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: por %xmm7, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4
; SSE41-NEXT: movapd {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
-; SSE41-NEXT: movapd %xmm4, %xmm0
-; SSE41-NEXT: xorpd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562067840,18446744071562067840]
-; SSE41-NEXT: pcmpgtd %xmm7, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: por %xmm8, %xmm0
+; SSE41-NEXT: movapd %xmm4, %xmm2
+; SSE41-NEXT: xorpd %xmm3, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
+; SSE41-NEXT: pcmpeqd %xmm7, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm7, %xmm6
+; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [18446744071562067840,18446744071562067840]
+; SSE41-NEXT: pcmpgtd %xmm8, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm2
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE41-NEXT: xorpd %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm7, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; SSE41-NEXT: pcmpeqd %xmm7, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm8, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1
; SSE41-NEXT: packssdw %xmm2, %xmm1
; SSE41-NEXT: packssdw %xmm1, %xmm1
@@ -2762,10 +2714,9 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" {
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm7 = [2147483775,2147483775]
; SSE41-NEXT: movdqa %xmm7, %xmm10
; SSE41-NEXT: pcmpgtd %xmm0, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm11
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE41-NEXT: por %xmm11, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm10, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm3
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3
; SSE41-NEXT: movdqa %xmm8, %xmm0
@@ -2774,10 +2725,9 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" {
; SSE41-NEXT: pcmpeqd %xmm9, %xmm4
; SSE41-NEXT: movdqa %xmm7, %xmm10
; SSE41-NEXT: pcmpgtd %xmm0, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm11
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE41-NEXT: por %xmm11, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm10, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm4
; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4
; SSE41-NEXT: movdqa %xmm6, %xmm0
@@ -2786,10 +2736,9 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" {
; SSE41-NEXT: pcmpeqd %xmm9, %xmm8
; SSE41-NEXT: movdqa %xmm7, %xmm10
; SSE41-NEXT: pcmpgtd %xmm0, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE41-NEXT: pand %xmm8, %xmm11
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE41-NEXT: por %xmm11, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm10, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm8
; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm8
; SSE41-NEXT: movdqa %xmm5, %xmm0
@@ -2797,56 +2746,51 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" {
; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
; SSE41-NEXT: pcmpeqd %xmm9, %xmm6
; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm7, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1
; SSE41-NEXT: movapd {{.*#+}} xmm5 = [18446744073709551488,18446744073709551488]
-; SSE41-NEXT: movapd %xmm1, %xmm0
-; SSE41-NEXT: xorpd %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3]
+; SSE41-NEXT: movapd %xmm1, %xmm9
+; SSE41-NEXT: xorpd %xmm2, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm9[1,1,3,3]
; SSE41-NEXT: pcmpeqd %xmm6, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm10
; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562067840,18446744071562067840]
-; SSE41-NEXT: pcmpgtd %xmm7, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
-; SSE41-NEXT: pand %xmm9, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: por %xmm10, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm7, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pand %xmm10, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
; SSE41-NEXT: movapd %xmm5, %xmm9
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm9
-; SSE41-NEXT: movapd %xmm8, %xmm0
-; SSE41-NEXT: xorpd %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm1
-; SSE41-NEXT: pcmpgtd %xmm7, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
-; SSE41-NEXT: pand %xmm1, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: por %xmm10, %xmm0
+; SSE41-NEXT: movapd %xmm8, %xmm1
+; SSE41-NEXT: xorpd %xmm2, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3]
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm10
+; SSE41-NEXT: pcmpgtd %xmm7, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
+; SSE41-NEXT: pand %xmm10, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: movapd %xmm5, %xmm1
; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1
; SSE41-NEXT: packssdw %xmm9, %xmm1
-; SSE41-NEXT: movapd %xmm4, %xmm0
-; SSE41-NEXT: xorpd %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm8
-; SSE41-NEXT: pcmpgtd %xmm7, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2]
-; SSE41-NEXT: pand %xmm8, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: movapd %xmm4, %xmm8
+; SSE41-NEXT: xorpd %xmm2, %xmm8
+; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3]
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm7, %xmm8
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2]
+; SSE41-NEXT: pand %xmm9, %xmm0
+; SSE41-NEXT: por %xmm8, %xmm0
; SSE41-NEXT: movapd %xmm5, %xmm8
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm8
; SSE41-NEXT: xorpd %xmm3, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm4
; SSE41-NEXT: pcmpgtd %xmm7, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSE41-NEXT: pand %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5
; SSE41-NEXT: packssdw %xmm8, %xmm5
; SSE41-NEXT: packssdw %xmm5, %xmm1
@@ -3068,10 +3012,9 @@ define void @trunc_ssat_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-widt
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm7 = [2147483775,2147483775]
; SSE41-NEXT: movdqa %xmm7, %xmm10
; SSE41-NEXT: pcmpgtd %xmm0, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm11
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE41-NEXT: por %xmm11, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: por %xmm10, %xmm0
; SSE41-NEXT: movapd %xmm4, %xmm2
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
; SSE41-NEXT: movdqa %xmm8, %xmm0
@@ -3080,10 +3023,9 @@ define void @trunc_ssat_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-widt
; SSE41-NEXT: pcmpeqd %xmm9, %xmm3
; SSE41-NEXT: movdqa %xmm7, %xmm10
; SSE41-NEXT: pcmpgtd %xmm0, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm11
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE41-NEXT: por %xmm11, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm10, %xmm0
; SSE41-NEXT: movapd %xmm4, %xmm3
; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3
; SSE41-NEXT: movdqa %xmm6, %xmm0
@@ -3092,10 +3034,9 @@ define void @trunc_ssat_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-widt
; SSE41-NEXT: pcmpeqd %xmm9, %xmm8
; SSE41-NEXT: movdqa %xmm7, %xmm10
; SSE41-NEXT: pcmpgtd %xmm0, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE41-NEXT: pand %xmm8, %xmm11
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
-; SSE41-NEXT: por %xmm11, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm10, %xmm0
; SSE41-NEXT: movapd %xmm4, %xmm8
; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm8
; SSE41-NEXT: movdqa %xmm5, %xmm0
@@ -3103,56 +3044,51 @@ define void @trunc_ssat_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-widt
; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
; SSE41-NEXT: pcmpeqd %xmm9, %xmm6
; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
-; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm7, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4
; SSE41-NEXT: movapd {{.*#+}} xmm5 = [18446744073709551488,18446744073709551488]
-; SSE41-NEXT: movapd %xmm4, %xmm0
-; SSE41-NEXT: xorpd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3]
+; SSE41-NEXT: movapd %xmm4, %xmm9
+; SSE41-NEXT: xorpd %xmm1, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm9[1,1,3,3]
; SSE41-NEXT: pcmpeqd %xmm6, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm10
; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562067840,18446744071562067840]
-; SSE41-NEXT: pcmpgtd %xmm7, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
-; SSE41-NEXT: pand %xmm9, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: por %xmm10, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm7, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pand %xmm10, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
; SSE41-NEXT: movapd %xmm5, %xmm9
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm9
-; SSE41-NEXT: movapd %xmm8, %xmm0
-; SSE41-NEXT: xorpd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm7, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: por %xmm10, %xmm0
+; SSE41-NEXT: movapd %xmm8, %xmm4
+; SSE41-NEXT: xorpd %xmm1, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,1,3,3]
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm10
+; SSE41-NEXT: pcmpgtd %xmm7, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
+; SSE41-NEXT: pand %xmm10, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: movapd %xmm5, %xmm4
; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4
; SSE41-NEXT: packssdw %xmm9, %xmm4
-; SSE41-NEXT: movapd %xmm3, %xmm0
-; SSE41-NEXT: xorpd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm8
-; SSE41-NEXT: pcmpgtd %xmm7, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2]
-; SSE41-NEXT: pand %xmm8, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: movapd %xmm3, %xmm8
+; SSE41-NEXT: xorpd %xmm1, %xmm8
+; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3]
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm7, %xmm8
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2]
+; SSE41-NEXT: pand %xmm9, %xmm0
+; SSE41-NEXT: por %xmm8, %xmm0
; SSE41-NEXT: movapd %xmm5, %xmm8
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm8
; SSE41-NEXT: xorpd %xmm2, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm3
; SSE41-NEXT: pcmpgtd %xmm7, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
; SSE41-NEXT: packssdw %xmm8, %xmm5
; SSE41-NEXT: packssdw %xmm5, %xmm4
@@ -3486,10 +3422,9 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm9 = [2147483775,2147483775]
; SSE41-NEXT: movdqa %xmm9, %xmm14
; SSE41-NEXT: pcmpgtd %xmm0, %xmm14
-; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm15
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,3,3]
-; SSE41-NEXT: por %xmm15, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,2,2]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm14, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm3
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3
; SSE41-NEXT: movdqa %xmm5, %xmm0
@@ -3498,10 +3433,9 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256
; SSE41-NEXT: pcmpeqd %xmm11, %xmm4
; SSE41-NEXT: movdqa %xmm9, %xmm14
; SSE41-NEXT: pcmpgtd %xmm0, %xmm14
-; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm15
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,3,3]
-; SSE41-NEXT: por %xmm15, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm14, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm4
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4
; SSE41-NEXT: movdqa %xmm6, %xmm0
@@ -3510,10 +3444,9 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256
; SSE41-NEXT: pcmpeqd %xmm11, %xmm5
; SSE41-NEXT: movdqa %xmm9, %xmm14
; SSE41-NEXT: pcmpgtd %xmm0, %xmm14
-; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm15
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,3,3]
-; SSE41-NEXT: por %xmm15, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,2,2]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm14, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm5
; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm5
; SSE41-NEXT: movdqa %xmm10, %xmm0
@@ -3522,10 +3455,9 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256
; SSE41-NEXT: pcmpeqd %xmm11, %xmm6
; SSE41-NEXT: movdqa %xmm9, %xmm14
; SSE41-NEXT: pcmpgtd %xmm0, %xmm14
-; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm15
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,3,3]
-; SSE41-NEXT: por %xmm15, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,2,2]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm14, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm6
; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm6
; SSE41-NEXT: movdqa %xmm13, %xmm0
@@ -3534,10 +3466,9 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256
; SSE41-NEXT: pcmpeqd %xmm11, %xmm10
; SSE41-NEXT: movdqa %xmm9, %xmm14
; SSE41-NEXT: pcmpgtd %xmm0, %xmm14
-; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
-; SSE41-NEXT: pand %xmm10, %xmm15
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,3,3]
-; SSE41-NEXT: por %xmm15, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,2,2]
+; SSE41-NEXT: pand %xmm10, %xmm0
+; SSE41-NEXT: por %xmm14, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm10
; SSE41-NEXT: blendvpd %xmm0, %xmm13, %xmm10
; SSE41-NEXT: movdqa %xmm12, %xmm0
@@ -3546,10 +3477,9 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256
; SSE41-NEXT: pcmpeqd %xmm11, %xmm13
; SSE41-NEXT: movdqa %xmm9, %xmm14
; SSE41-NEXT: pcmpgtd %xmm0, %xmm14
-; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
-; SSE41-NEXT: pand %xmm13, %xmm15
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,3,3]
-; SSE41-NEXT: por %xmm15, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,2,2]
+; SSE41-NEXT: pand %xmm13, %xmm0
+; SSE41-NEXT: por %xmm14, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm13
; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm13
; SSE41-NEXT: movdqa %xmm8, %xmm0
@@ -3558,10 +3488,9 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256
; SSE41-NEXT: pcmpeqd %xmm11, %xmm12
; SSE41-NEXT: movdqa %xmm9, %xmm14
; SSE41-NEXT: pcmpgtd %xmm0, %xmm14
-; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2]
-; SSE41-NEXT: pand %xmm12, %xmm15
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,3,3]
-; SSE41-NEXT: por %xmm15, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,2,2]
+; SSE41-NEXT: pand %xmm12, %xmm0
+; SSE41-NEXT: por %xmm14, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm12
; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm12
; SSE41-NEXT: movdqa %xmm7, %xmm0
@@ -3569,103 +3498,94 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256
; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
; SSE41-NEXT: pcmpeqd %xmm11, %xmm8
; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2]
-; SSE41-NEXT: pand %xmm8, %xmm11
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,3,3]
-; SSE41-NEXT: por %xmm11, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1
; SSE41-NEXT: movapd {{.*#+}} xmm7 = [18446744073709551488,18446744073709551488]
-; SSE41-NEXT: movapd %xmm1, %xmm0
-; SSE41-NEXT: xorpd %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3]
+; SSE41-NEXT: movapd %xmm1, %xmm11
+; SSE41-NEXT: xorpd %xmm2, %xmm11
+; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm11[1,1,3,3]
; SSE41-NEXT: pcmpeqd %xmm8, %xmm8
-; SSE41-NEXT: pcmpeqd %xmm8, %xmm11
+; SSE41-NEXT: pcmpeqd %xmm8, %xmm14
; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840]
-; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,2,2]
-; SSE41-NEXT: pand %xmm11, %xmm14
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: por %xmm14, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm9, %xmm11
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2]
+; SSE41-NEXT: pand %xmm14, %xmm0
+; SSE41-NEXT: por %xmm11, %xmm0
; SSE41-NEXT: movapd %xmm7, %xmm11
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm11
-; SSE41-NEXT: movapd %xmm12, %xmm0
-; SSE41-NEXT: xorpd %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm8, %xmm1
-; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,2,2]
-; SSE41-NEXT: pand %xmm1, %xmm14
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: por %xmm14, %xmm0
+; SSE41-NEXT: movapd %xmm12, %xmm1
+; SSE41-NEXT: xorpd %xmm2, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm1[1,1,3,3]
+; SSE41-NEXT: pcmpeqd %xmm8, %xmm14
+; SSE41-NEXT: pcmpgtd %xmm9, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
+; SSE41-NEXT: pand %xmm14, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: movapd %xmm7, %xmm1
; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm1
; SSE41-NEXT: packssdw %xmm11, %xmm1
-; SSE41-NEXT: movapd %xmm13, %xmm0
-; SSE41-NEXT: xorpd %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm8, %xmm11
-; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2]
-; SSE41-NEXT: pand %xmm11, %xmm12
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: por %xmm12, %xmm0
+; SSE41-NEXT: movapd %xmm13, %xmm11
+; SSE41-NEXT: xorpd %xmm2, %xmm11
+; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm11[1,1,3,3]
+; SSE41-NEXT: pcmpeqd %xmm8, %xmm12
+; SSE41-NEXT: pcmpgtd %xmm9, %xmm11
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2]
+; SSE41-NEXT: pand %xmm12, %xmm0
+; SSE41-NEXT: por %xmm11, %xmm0
; SSE41-NEXT: movapd %xmm7, %xmm11
; SSE41-NEXT: blendvpd %xmm0, %xmm13, %xmm11
-; SSE41-NEXT: movapd %xmm10, %xmm0
-; SSE41-NEXT: xorpd %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm8, %xmm12
-; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2]
-; SSE41-NEXT: pand %xmm12, %xmm13
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: por %xmm13, %xmm0
+; SSE41-NEXT: movapd %xmm10, %xmm12
+; SSE41-NEXT: xorpd %xmm2, %xmm12
+; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm12[1,1,3,3]
+; SSE41-NEXT: pcmpeqd %xmm8, %xmm13
+; SSE41-NEXT: pcmpgtd %xmm9, %xmm12
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2]
+; SSE41-NEXT: pand %xmm13, %xmm0
+; SSE41-NEXT: por %xmm12, %xmm0
; SSE41-NEXT: movapd %xmm7, %xmm12
; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm12
; SSE41-NEXT: packssdw %xmm11, %xmm12
; SSE41-NEXT: packssdw %xmm12, %xmm1
-; SSE41-NEXT: movapd %xmm6, %xmm0
-; SSE41-NEXT: xorpd %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm8, %xmm10
-; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
-; SSE41-NEXT: pand %xmm10, %xmm11
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: por %xmm11, %xmm0
+; SSE41-NEXT: movapd %xmm6, %xmm10
+; SSE41-NEXT: xorpd %xmm2, %xmm10
+; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[1,1,3,3]
+; SSE41-NEXT: pcmpeqd %xmm8, %xmm11
+; SSE41-NEXT: pcmpgtd %xmm9, %xmm10
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
+; SSE41-NEXT: pand %xmm11, %xmm0
+; SSE41-NEXT: por %xmm10, %xmm0
; SSE41-NEXT: movapd %xmm7, %xmm10
; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm10
-; SSE41-NEXT: movapd %xmm5, %xmm0
-; SSE41-NEXT: xorpd %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm8, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm11
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: por %xmm11, %xmm0
+; SSE41-NEXT: movapd %xmm5, %xmm6
+; SSE41-NEXT: xorpd %xmm2, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,3,3]
+; SSE41-NEXT: pcmpeqd %xmm8, %xmm11
+; SSE41-NEXT: pcmpgtd %xmm9, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT: pand %xmm11, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: movapd %xmm7, %xmm6
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm6
; SSE41-NEXT: packssdw %xmm10, %xmm6
-; SSE41-NEXT: movapd %xmm4, %xmm0
-; SSE41-NEXT: xorpd %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm8, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: por %xmm10, %xmm0
+; SSE41-NEXT: movapd %xmm4, %xmm5
+; SSE41-NEXT: xorpd %xmm2, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,3,3]
+; SSE41-NEXT: pcmpeqd %xmm8, %xmm10
+; SSE41-NEXT: pcmpgtd %xmm9, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
+; SSE41-NEXT: pand %xmm10, %xmm0
+; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: movapd %xmm7, %xmm5
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm5
; SSE41-NEXT: xorpd %xmm3, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm8, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE41-NEXT: pcmpeqd %xmm8, %xmm4
; SSE41-NEXT: pcmpgtd %xmm9, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSE41-NEXT: pand %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7
; SSE41-NEXT: packssdw %xmm5, %xmm7
; SSE41-NEXT: packssdw %xmm7, %xmm6
diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
index 9c82aad5fa6a3..79cdaca56d46f 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
@@ -40,11 +40,10 @@ define <2 x i32> @trunc_usat_v2i64_v2i32(<2 x i64> %a0) {
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movapd {{.*#+}} xmm2 = [4294967295,4294967295]
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT: pxor %xmm0, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: pandn %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
@@ -117,11 +116,10 @@ define void @trunc_usat_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) {
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movapd {{.*#+}} xmm2 = [4294967295,4294967295]
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT: pxor %xmm0, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: pandn %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
@@ -209,21 +207,20 @@ define <4 x i32> @trunc_usat_v4i64_v4i32(<4 x i64> %a0) {
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: pxor %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
-; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647,2147483647,2147483647]
-; SSE41-NEXT: movdqa %xmm5, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm3
-; SSE41-NEXT: pand %xmm6, %xmm3
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: pxor %xmm4, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
+; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483647,2147483647,2147483647,2147483647]
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: pcmpgtd %xmm6, %xmm3
+; SSE41-NEXT: pand %xmm5, %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm5
+; SSE41-NEXT: pxor %xmm4, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2]
+; SSE41-NEXT: pcmpgtd %xmm5, %xmm0
+; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: movapd {{.*#+}} xmm4 = [4294967295,4294967295]
; SSE41-NEXT: movapd {{.*#+}} xmm5 = [4294967295,429496729]
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5
@@ -404,54 +401,51 @@ define <8 x i32> @trunc_usat_v8i64_v8i32(ptr %p0) {
;
; SSE41-LABEL: trunc_usat_v8i64_v8i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa (%rdi), %xmm3
+; SSE41-NEXT: movdqa (%rdi), %xmm4
; SSE41-NEXT: movdqa 16(%rdi), %xmm6
; SSE41-NEXT: movdqa 32(%rdi), %xmm7
; SSE41-NEXT: movdqa 48(%rdi), %xmm1
-; SSE41-NEXT: movapd {{.*#+}} xmm2 = [4294967295,4294967295]
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
-; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647,2147483647,2147483647]
-; SSE41-NEXT: movdqa %xmm5, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm8, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE41-NEXT: pand %xmm9, %xmm0
-; SSE41-NEXT: movapd %xmm2, %xmm8
+; SSE41-NEXT: movapd {{.*#+}} xmm3 = [4294967295,4294967295]
+; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT: movdqa %xmm1, %xmm8
+; SSE41-NEXT: pxor %xmm5, %xmm8
+; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm8
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647]
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: movapd %xmm3, %xmm8
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm8
-; SSE41-NEXT: movdqa %xmm7, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE41-NEXT: movdqa %xmm5, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE41-NEXT: pand %xmm9, %xmm0
-; SSE41-NEXT: movapd %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm7, %xmm1
+; SSE41-NEXT: pxor %xmm5, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm1[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm1
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
+; SSE41-NEXT: pand %xmm1, %xmm0
+; SSE41-NEXT: movapd %xmm3, %xmm1
; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1
; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm8[0,2]
-; SSE41-NEXT: movdqa %xmm6, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
-; SSE41-NEXT: movdqa %xmm5, %xmm8
-; SSE41-NEXT: pcmpgtd %xmm7, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE41-NEXT: pand %xmm8, %xmm0
-; SSE41-NEXT: movapd %xmm2, %xmm7
+; SSE41-NEXT: movdqa %xmm6, %xmm7
+; SSE41-NEXT: pxor %xmm5, %xmm7
+; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm7
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm8, %xmm0
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: movapd %xmm3, %xmm7
; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm7[0,2]
-; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: pxor %xmm5, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT: pand %xmm5, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3
+; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm7[0,2]
+; SSE41-NEXT: movaps %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_usat_v8i64_v8i32:
@@ -544,11 +538,10 @@ define <2 x i16> @trunc_usat_v2i64_v2i16(<2 x i64> %a0) {
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movapd {{.*#+}} xmm2 = [65535,65535]
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT: pxor %xmm0, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: pandn %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
@@ -639,11 +632,10 @@ define void @trunc_usat_v2i64_v2i16_store(<2 x i64> %a0, ptr %p1) {
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movapd {{.*#+}} xmm2 = [65535,65535]
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT: pxor %xmm0, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: pandn %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
@@ -754,31 +746,30 @@ define <4 x i16> @trunc_usat_v4i64_v4i16(<4 x i64> %a0) {
;
; SSE41-LABEL: trunc_usat_v4i64_v4i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: movapd {{.*#+}} xmm2 = [65535,65535]
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
-; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
-; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm5, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: movapd %xmm2, %xmm5
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: movapd {{.*#+}} xmm3 = [65535,65535]
+; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT: movdqa %xmm1, %xmm6
+; SSE41-NEXT: pxor %xmm5, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm6
+; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183]
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm7, %xmm0
; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: packusdw %xmm5, %xmm2
-; SSE41-NEXT: packusdw %xmm2, %xmm2
+; SSE41-NEXT: movapd %xmm3, %xmm6
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6
; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm5, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT: pand %xmm5, %xmm4
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; SSE41-NEXT: packusdw %xmm6, %xmm3
+; SSE41-NEXT: packusdw %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_usat_v4i64_v4i16:
@@ -889,30 +880,29 @@ define void @trunc_usat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
; SSE41-LABEL: trunc_usat_v4i64_v4i16_store:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: movapd {{.*#+}} xmm3 = [65535,65535]
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
-; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
-; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm5, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: movapd %xmm3, %xmm5
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
+; SSE41-NEXT: movapd {{.*#+}} xmm4 = [65535,65535]
+; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT: movdqa %xmm1, %xmm6
+; SSE41-NEXT: pxor %xmm5, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm6
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183,2147549183,2147549183]
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm7, %xmm0
; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; SSE41-NEXT: packusdw %xmm5, %xmm3
-; SSE41-NEXT: packusdw %xmm3, %xmm3
-; SSE41-NEXT: movq %xmm3, (%rdi)
+; SSE41-NEXT: movapd %xmm4, %xmm6
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm5, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT: pand %xmm5, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4
+; SSE41-NEXT: packusdw %xmm6, %xmm4
+; SSE41-NEXT: packusdw %xmm4, %xmm4
+; SSE41-NEXT: movq %xmm4, (%rdi)
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_usat_v4i64_v4i16_store:
@@ -1056,54 +1046,51 @@ define <8 x i16> @trunc_usat_v8i64_v8i16(ptr %p0) {
; SSE41-LABEL: trunc_usat_v8i64_v8i16:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa (%rdi), %xmm7
-; SSE41-NEXT: movdqa 16(%rdi), %xmm1
-; SSE41-NEXT: movdqa 32(%rdi), %xmm3
+; SSE41-NEXT: movdqa 16(%rdi), %xmm2
+; SSE41-NEXT: movdqa 32(%rdi), %xmm4
; SSE41-NEXT: movdqa 48(%rdi), %xmm6
-; SSE41-NEXT: movapd {{.*#+}} xmm2 = [65535,65535]
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT: movapd {{.*#+}} xmm3 = [65535,65535]
+; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT: movdqa %xmm2, %xmm8
+; SSE41-NEXT: pxor %xmm5, %xmm8
+; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm8
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147549183,2147549183,2147549183,2147549183]
; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
-; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
-; SSE41-NEXT: movdqa %xmm5, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm8, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE41-NEXT: pand %xmm9, %xmm0
-; SSE41-NEXT: movapd %xmm2, %xmm8
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm8
-; SSE41-NEXT: movdqa %xmm7, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE41-NEXT: movdqa %xmm5, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE41-NEXT: pand %xmm9, %xmm0
-; SSE41-NEXT: movapd %xmm2, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1
-; SSE41-NEXT: packusdw %xmm8, %xmm1
-; SSE41-NEXT: movdqa %xmm6, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
-; SSE41-NEXT: movdqa %xmm5, %xmm8
-; SSE41-NEXT: pcmpgtd %xmm7, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
; SSE41-NEXT: pand %xmm8, %xmm0
-; SSE41-NEXT: movapd %xmm2, %xmm7
+; SSE41-NEXT: movapd %xmm3, %xmm8
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm8
+; SSE41-NEXT: movdqa %xmm7, %xmm2
+; SSE41-NEXT: pxor %xmm5, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm2
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: movapd %xmm3, %xmm2
+; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm2
+; SSE41-NEXT: packusdw %xmm8, %xmm2
+; SSE41-NEXT: movdqa %xmm6, %xmm7
+; SSE41-NEXT: pxor %xmm5, %xmm7
+; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm7
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm8, %xmm0
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: movapd %xmm3, %xmm7
; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: packusdw %xmm7, %xmm2
-; SSE41-NEXT: packusdw %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: pxor %xmm5, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
+; SSE41-NEXT: pand %xmm5, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3
+; SSE41-NEXT: packusdw %xmm7, %xmm3
+; SSE41-NEXT: packusdw %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_usat_v8i64_v8i16:
@@ -1647,11 +1634,10 @@ define <2 x i8> @trunc_usat_v2i64_v2i8(<2 x i64> %a0) {
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255]
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT: pxor %xmm0, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: pandn %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
@@ -1743,11 +1729,10 @@ define void @trunc_usat_v2i64_v2i8_store(<2 x i64> %a0, ptr %p1) {
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255]
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT: pxor %xmm0, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: pandn %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
@@ -1837,27 +1822,26 @@ define <4 x i8> @trunc_usat_v4i64_v4i8(<4 x i64> %a0) {
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255]
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
-; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903,2147483903,2147483903]
-; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm5, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: movapd %xmm2, %xmm5
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT: movdqa %xmm1, %xmm6
+; SSE41-NEXT: pxor %xmm5, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm6
+; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903,2147483903,2147483903]
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm7, %xmm0
; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: movapd %xmm2, %xmm6
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pxor %xmm5, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT: pand %xmm5, %xmm4
+; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: packusdw %xmm5, %xmm2
+; SSE41-NEXT: packusdw %xmm6, %xmm2
; SSE41-NEXT: packusdw %xmm2, %xmm2
; SSE41-NEXT: packuswb %xmm2, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
@@ -1972,31 +1956,30 @@ define void @trunc_usat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
; SSE41-LABEL: trunc_usat_v4i64_v4i8_store:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: movapd {{.*#+}} xmm3 = [255,255]
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
-; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903,2147483903,2147483903]
-; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm5, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: movapd %xmm3, %xmm5
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
+; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255]
+; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT: movdqa %xmm1, %xmm6
+; SSE41-NEXT: pxor %xmm5, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm6
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903,2147483903,2147483903]
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm7, %xmm0
; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; SSE41-NEXT: packusdw %xmm5, %xmm3
-; SSE41-NEXT: packusdw %xmm3, %xmm3
-; SSE41-NEXT: packuswb %xmm3, %xmm3
-; SSE41-NEXT: movd %xmm3, (%rdi)
+; SSE41-NEXT: movapd %xmm4, %xmm6
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm5, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT: pand %xmm5, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4
+; SSE41-NEXT: packusdw %xmm6, %xmm4
+; SSE41-NEXT: packusdw %xmm4, %xmm4
+; SSE41-NEXT: packuswb %xmm4, %xmm4
+; SSE41-NEXT: movd %xmm4, (%rdi)
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_usat_v4i64_v4i8_store:
@@ -2139,55 +2122,52 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(ptr %p0) {
; SSE41-LABEL: trunc_usat_v8i64_v8i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa (%rdi), %xmm7
-; SSE41-NEXT: movdqa 16(%rdi), %xmm1
-; SSE41-NEXT: movdqa 32(%rdi), %xmm3
+; SSE41-NEXT: movdqa 16(%rdi), %xmm2
+; SSE41-NEXT: movdqa 32(%rdi), %xmm4
; SSE41-NEXT: movdqa 48(%rdi), %xmm6
-; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255]
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT: movapd {{.*#+}} xmm3 = [255,255]
+; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT: movdqa %xmm2, %xmm8
+; SSE41-NEXT: pxor %xmm5, %xmm8
+; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm8
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483903,2147483903,2147483903,2147483903]
; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
-; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903]
-; SSE41-NEXT: movdqa %xmm5, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm8, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE41-NEXT: pand %xmm9, %xmm0
-; SSE41-NEXT: movapd %xmm2, %xmm8
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm8
-; SSE41-NEXT: movdqa %xmm7, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE41-NEXT: movdqa %xmm5, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE41-NEXT: pand %xmm9, %xmm0
-; SSE41-NEXT: movapd %xmm2, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1
-; SSE41-NEXT: packusdw %xmm8, %xmm1
-; SSE41-NEXT: movdqa %xmm6, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
-; SSE41-NEXT: movdqa %xmm5, %xmm8
-; SSE41-NEXT: pcmpgtd %xmm7, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
; SSE41-NEXT: pand %xmm8, %xmm0
-; SSE41-NEXT: movapd %xmm2, %xmm7
+; SSE41-NEXT: movapd %xmm3, %xmm8
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm8
+; SSE41-NEXT: movdqa %xmm7, %xmm2
+; SSE41-NEXT: pxor %xmm5, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm2
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: movapd %xmm3, %xmm2
+; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm2
+; SSE41-NEXT: packusdw %xmm8, %xmm2
+; SSE41-NEXT: movdqa %xmm6, %xmm7
+; SSE41-NEXT: pxor %xmm5, %xmm7
+; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm7
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm8, %xmm0
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: movapd %xmm3, %xmm7
; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: packusdw %xmm7, %xmm2
-; SSE41-NEXT: packusdw %xmm2, %xmm1
-; SSE41-NEXT: packuswb %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: pxor %xmm5, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
+; SSE41-NEXT: pand %xmm5, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3
+; SSE41-NEXT: packusdw %xmm7, %xmm3
+; SSE41-NEXT: packusdw %xmm3, %xmm2
+; SSE41-NEXT: packuswb %xmm2, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_usat_v8i64_v8i8:
@@ -2325,52 +2305,49 @@ define void @trunc_usat_v8i64_v8i8_store(ptr %p0, ptr%p1) {
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa (%rdi), %xmm7
; SSE41-NEXT: movdqa 16(%rdi), %xmm6
-; SSE41-NEXT: movdqa 32(%rdi), %xmm2
+; SSE41-NEXT: movdqa 32(%rdi), %xmm3
; SSE41-NEXT: movdqa 48(%rdi), %xmm5
-; SSE41-NEXT: movapd {{.*#+}} xmm1 = [255,255]
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm6, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903,2147483903,2147483903]
-; SSE41-NEXT: movdqa %xmm4, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm8, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pand %xmm9, %xmm0
-; SSE41-NEXT: movapd %xmm1, %xmm8
+; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255]
+; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT: movdqa %xmm6, %xmm8
+; SSE41-NEXT: pxor %xmm4, %xmm8
+; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm8
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483903,2147483903,2147483903,2147483903]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: movapd %xmm2, %xmm8
; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm8
-; SSE41-NEXT: movdqa %xmm7, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
-; SSE41-NEXT: movdqa %xmm4, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pand %xmm9, %xmm0
-; SSE41-NEXT: movapd %xmm1, %xmm6
+; SSE41-NEXT: movdqa %xmm7, %xmm6
+; SSE41-NEXT: pxor %xmm4, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm6
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: movapd %xmm2, %xmm6
; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm6
; SSE41-NEXT: packusdw %xmm8, %xmm6
-; SSE41-NEXT: movdqa %xmm5, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
-; SSE41-NEXT: movdqa %xmm4, %xmm8
-; SSE41-NEXT: pcmpgtd %xmm7, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pand %xmm8, %xmm0
-; SSE41-NEXT: movapd %xmm1, %xmm7
+; SSE41-NEXT: movdqa %xmm5, %xmm7
+; SSE41-NEXT: pxor %xmm4, %xmm7
+; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm7
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm8, %xmm0
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: movapd %xmm2, %xmm7
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
-; SSE41-NEXT: pcmpgtd %xmm5, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: packusdw %xmm7, %xmm1
-; SSE41-NEXT: packusdw %xmm1, %xmm6
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pxor %xmm4, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
+; SSE41-NEXT: pand %xmm4, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: packusdw %xmm7, %xmm2
+; SSE41-NEXT: packusdw %xmm2, %xmm6
; SSE41-NEXT: packuswb %xmm6, %xmm6
; SSE41-NEXT: movq %xmm6, (%rsi)
; SSE41-NEXT: retq
@@ -2561,103 +2538,96 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(ptr %p0) {
;
; SSE41-LABEL: trunc_usat_v16i64_v16i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa 96(%rdi), %xmm2
-; SSE41-NEXT: movdqa 112(%rdi), %xmm4
+; SSE41-NEXT: movdqa 96(%rdi), %xmm3
+; SSE41-NEXT: movdqa 112(%rdi), %xmm6
; SSE41-NEXT: movdqa 64(%rdi), %xmm7
; SSE41-NEXT: movdqa 80(%rdi), %xmm8
; SSE41-NEXT: movdqa (%rdi), %xmm11
-; SSE41-NEXT: movdqa 16(%rdi), %xmm1
+; SSE41-NEXT: movdqa 16(%rdi), %xmm2
; SSE41-NEXT: movdqa 32(%rdi), %xmm9
; SSE41-NEXT: movdqa 48(%rdi), %xmm10
-; SSE41-NEXT: movapd {{.*#+}} xmm3 = [255,255]
+; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255]
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT: movdqa %xmm2, %xmm12
+; SSE41-NEXT: pxor %xmm5, %xmm12
+; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm12
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483903,2147483903,2147483903,2147483903]
; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2]
-; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903,2147483903,2147483903]
-; SSE41-NEXT: movdqa %xmm6, %xmm13
-; SSE41-NEXT: pcmpgtd %xmm12, %xmm13
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm0
-; SSE41-NEXT: pand %xmm13, %xmm0
-; SSE41-NEXT: movapd %xmm3, %xmm12
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm12
-; SSE41-NEXT: movdqa %xmm11, %xmm0
-; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE41-NEXT: movdqa %xmm6, %xmm13
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm13
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm0
-; SSE41-NEXT: pand %xmm13, %xmm0
-; SSE41-NEXT: movapd %xmm3, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm1
-; SSE41-NEXT: packusdw %xmm12, %xmm1
-; SSE41-NEXT: movdqa %xmm10, %xmm0
-; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
-; SSE41-NEXT: movdqa %xmm6, %xmm12
-; SSE41-NEXT: pcmpgtd %xmm11, %xmm12
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm13, %xmm0
; SSE41-NEXT: pand %xmm12, %xmm0
-; SSE41-NEXT: movapd %xmm3, %xmm11
+; SSE41-NEXT: movapd %xmm4, %xmm12
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm12
+; SSE41-NEXT: movdqa %xmm11, %xmm2
+; SSE41-NEXT: pxor %xmm5, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm2[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm2
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm13, %xmm0
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: movapd %xmm4, %xmm2
+; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm2
+; SSE41-NEXT: packusdw %xmm12, %xmm2
+; SSE41-NEXT: movdqa %xmm10, %xmm11
+; SSE41-NEXT: pxor %xmm5, %xmm11
+; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm11
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm12, %xmm0
+; SSE41-NEXT: pand %xmm11, %xmm0
+; SSE41-NEXT: movapd %xmm4, %xmm11
; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm11
-; SSE41-NEXT: movdqa %xmm9, %xmm0
-; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
-; SSE41-NEXT: movdqa %xmm6, %xmm12
-; SSE41-NEXT: pcmpgtd %xmm10, %xmm12
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm0
-; SSE41-NEXT: pand %xmm12, %xmm0
-; SSE41-NEXT: movapd %xmm3, %xmm10
+; SSE41-NEXT: movdqa %xmm9, %xmm10
+; SSE41-NEXT: pxor %xmm5, %xmm10
+; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm10[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm10
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm12, %xmm0
+; SSE41-NEXT: pand %xmm10, %xmm0
+; SSE41-NEXT: movapd %xmm4, %xmm10
; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm10
; SSE41-NEXT: packusdw %xmm11, %xmm10
-; SSE41-NEXT: packusdw %xmm10, %xmm1
-; SSE41-NEXT: movdqa %xmm8, %xmm0
-; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2]
-; SSE41-NEXT: movdqa %xmm6, %xmm10
-; SSE41-NEXT: pcmpgtd %xmm9, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm0
-; SSE41-NEXT: pand %xmm10, %xmm0
-; SSE41-NEXT: movapd %xmm3, %xmm9
+; SSE41-NEXT: packusdw %xmm10, %xmm2
+; SSE41-NEXT: movdqa %xmm8, %xmm9
+; SSE41-NEXT: pxor %xmm5, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm9
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm10, %xmm0
+; SSE41-NEXT: pand %xmm9, %xmm0
+; SSE41-NEXT: movapd %xmm4, %xmm9
; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm9
-; SSE41-NEXT: movdqa %xmm7, %xmm0
-; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
-; SSE41-NEXT: movdqa %xmm6, %xmm10
-; SSE41-NEXT: pcmpgtd %xmm8, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm0
-; SSE41-NEXT: pand %xmm10, %xmm0
-; SSE41-NEXT: movapd %xmm3, %xmm8
+; SSE41-NEXT: movdqa %xmm7, %xmm8
+; SSE41-NEXT: pxor %xmm5, %xmm8
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm8
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm10, %xmm0
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: movapd %xmm4, %xmm8
; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8
; SSE41-NEXT: packusdw %xmm9, %xmm8
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
-; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm7, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm0
-; SSE41-NEXT: pand %xmm9, %xmm0
-; SSE41-NEXT: movapd %xmm3, %xmm7
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm7
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm7
+; SSE41-NEXT: pxor %xmm5, %xmm7
+; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm7
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: movapd %xmm4, %xmm7
+; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm0
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; SSE41-NEXT: packusdw %xmm7, %xmm3
-; SSE41-NEXT: packusdw %xmm3, %xmm8
-; SSE41-NEXT: packuswb %xmm8, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
+; SSE41-NEXT: pand %xmm5, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm4
+; SSE41-NEXT: packusdw %xmm7, %xmm4
+; SSE41-NEXT: packusdw %xmm4, %xmm8
+; SSE41-NEXT: packuswb %xmm8, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_usat_v16i64_v16i8:
diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll
index dea2446ecc320..4837532aaf5c5 100644
--- a/llvm/test/CodeGen/X86/vselect.ll
+++ b/llvm/test/CodeGen/X86/vselect.ll
@@ -740,14 +740,24 @@ define i64 @vselect_any_extend_vector_inreg_crash(ptr %x) {
; SSE-NEXT: shll $15, %eax
; SSE-NEXT: retq
;
-; AVX-LABEL: vselect_any_extend_vector_inreg_crash:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: andl $1, %eax
-; AVX-NEXT: shll $15, %eax
-; AVX-NEXT: retq
+; AVX1-LABEL: vselect_any_extend_vector_inreg_crash:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT: vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: shll $15, %eax
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vselect_any_extend_vector_inreg_crash:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [49,49,49,49]
+; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: shll $15, %eax
+; AVX2-NEXT: retq
0:
%1 = load <8 x i8>, ptr %x
%2 = icmp eq <8 x i8> %1, <i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49>
>From d544325c809890172321880a97d06a82c4c2b897 Mon Sep 17 00:00:00 2001
From: Bjorn Pettersson <bjorn.a.pettersson at ericsson.com>
Date: Fri, 26 Sep 2025 13:54:08 +0200
Subject: [PATCH 3/4] [SelectionDAG] Add DoNotPoisonEltMask to
SimplifyMultipleUseDemandedBits/VectorElts
Add DoNotPoisonEltMask to SimplifyMultipleUseDemandedBits and
SimplifyMultipleUseDemandedVectorElts.
The goal is to reduce the number of regressions after the fix for #138513.
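
For context, the DoNotPoisonEltMask operand lets a caller distinguish lanes whose
values it actually needs from lanes it merely must not make more poisonous (for
example, lanes still read through a wider bitcast). A minimal sketch of a
hypothetical caller follows; only the SimplifyMultipleUseDemandedVectorElts
signature is taken from this patch, all other names (Src, Op, Opc, VT, TLO,
Depth) are placeholders:

  // Demand only the low half of Src, but keep the upper half from being
  // turned into poison because another user still reads those lanes.
  unsigned NumElts = Src.getValueType().getVectorNumElements();
  APInt DemandedElts = APInt::getLowBitsSet(NumElts, NumElts / 2);
  APInt DoNotPoisonEltMask = APInt::getHighBitsSet(NumElts, NumElts / 2);
  if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
          Src, DemandedElts, DoNotPoisonEltMask, TLO.DAG, Depth + 1))
    return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
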
---
llvm/include/llvm/CodeGen/TargetLowering.h | 11 +
.../CodeGen/SelectionDAG/TargetLowering.cpp | 141 +-
llvm/lib/Target/X86/X86ISelLowering.cpp | 25 +-
.../AArch64/vecreduce-and-legalization.ll | 19 +-
.../AMDGPU/shufflevector.v3bf16.v2bf16.ll | 45 +-
.../AMDGPU/shufflevector.v3bf16.v3bf16.ll | 45 +-
.../AMDGPU/shufflevector.v3bf16.v4bf16.ll | 45 +-
.../AMDGPU/shufflevector.v3f16.v2f16.ll | 45 +-
.../AMDGPU/shufflevector.v3f16.v3f16.ll | 45 +-
.../AMDGPU/shufflevector.v3f16.v4f16.ll | 45 +-
llvm/test/CodeGen/Thumb2/mve-vst3.ll | 38 +-
llvm/test/CodeGen/X86/avx512fp16-mov.ll | 32 +-
llvm/test/CodeGen/X86/extractelement-load.ll | 9 +-
llvm/test/CodeGen/X86/half.ll | 6 +-
.../test/CodeGen/X86/ifma-combine-vpmadd52.ll | 24 +-
llvm/test/CodeGen/X86/movmsk-cmp.ll | 1 -
llvm/test/CodeGen/X86/pr107423.ll | 26 +-
.../vector-interleaved-load-i16-stride-7.ll | 1702 ++++++++---------
18 files changed, 1119 insertions(+), 1185 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index e8e61623137d3..043cdec038f4b 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -4231,6 +4231,16 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase {
/// More limited version of SimplifyDemandedBits that can be used to "look
/// through" ops that don't contribute to the DemandedBits/DemandedElts -
/// bitwise ops etc.
+ /// Vector elements that aren't demanded can be turned into poison unless the
+ /// corresponding bit in the \p DoNotPoisonEltMask is set.
+ SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits,
+ const APInt &DemandedElts,
+ const APInt &DoNotPoisonEltMask,
+ SelectionDAG &DAG,
+ unsigned Depth = 0) const;
+
+ /// Helper wrapper around SimplifyMultipleUseDemandedBits, with
+ /// DoNotPoisonEltMask being set to zero.
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits,
const APInt &DemandedElts,
SelectionDAG &DAG,
@@ -4246,6 +4256,7 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase {
/// bits from only some vector elements.
SDValue SimplifyMultipleUseDemandedVectorElts(SDValue Op,
const APInt &DemandedElts,
+ const APInt &DoNotPoisonEltMask,
SelectionDAG &DAG,
unsigned Depth = 0) const;
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index cd361b4086090..b787d13686c58 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -708,7 +708,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
// TODO: Under what circumstances can we create nodes? Constant folding?
SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
- SelectionDAG &DAG, unsigned Depth) const {
+ const APInt &DoNotPoisonEltMask, SelectionDAG &DAG, unsigned Depth) const {
EVT VT = Op.getValueType();
// Limit search depth.
@@ -742,43 +742,58 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
unsigned NumDstEltBits = DstVT.getScalarSizeInBits();
if (NumSrcEltBits == NumDstEltBits)
if (SDValue V = SimplifyMultipleUseDemandedBits(
- Src, DemandedBits, DemandedElts, DAG, Depth + 1))
+ Src, DemandedBits, DemandedElts, DoNotPoisonEltMask, DAG,
+ Depth + 1))
return DAG.getBitcast(DstVT, V);
+ // Bitcast from 'small element' src vector to 'large element' vector.
if (SrcVT.isVector() && (NumDstEltBits % NumSrcEltBits) == 0) {
unsigned Scale = NumDstEltBits / NumSrcEltBits;
unsigned NumSrcElts = SrcVT.getVectorNumElements();
APInt DemandedSrcBits = APInt::getZero(NumSrcEltBits);
+ APInt DemandedSrcElts = APInt::getZero(NumSrcElts);
for (unsigned i = 0; i != Scale; ++i) {
unsigned EltOffset = IsLE ? i : (Scale - 1 - i);
unsigned BitOffset = EltOffset * NumSrcEltBits;
- DemandedSrcBits |= DemandedBits.extractBits(NumSrcEltBits, BitOffset);
+ APInt Sub = DemandedBits.extractBits(NumSrcEltBits, BitOffset);
+ if (!Sub.isZero()) {
+ DemandedSrcBits |= Sub;
+ for (unsigned j = 0; j != NumElts; ++j)
+ if (DemandedElts[j])
+ DemandedSrcElts.setBit((j * Scale) + i);
+ }
}
- // Recursive calls below may turn not demanded elements into poison, so we
- // need to demand all smaller source elements that maps to a demanded
- // destination element.
- APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
+    // Need to semi-demand all smaller source elements that map to a demanded
+ // destination element or a destination element that must not be poisoned.
+ APInt DoNotPoisonSrcElts =
+ APIntOps::ScaleBitMask(DemandedElts | DoNotPoisonEltMask, NumSrcElts);
if (SDValue V = SimplifyMultipleUseDemandedBits(
- Src, DemandedSrcBits, DemandedSrcElts, DAG, Depth + 1))
+ Src, DemandedSrcBits, DemandedSrcElts, DoNotPoisonSrcElts, DAG,
+ Depth + 1))
return DAG.getBitcast(DstVT, V);
}
+ // Bitcast from 'large element' src vector to 'small element' vector.
// TODO - bigendian once we have test coverage.
if (IsLE && (NumSrcEltBits % NumDstEltBits) == 0) {
unsigned Scale = NumSrcEltBits / NumDstEltBits;
unsigned NumSrcElts = SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1;
APInt DemandedSrcBits = APInt::getZero(NumSrcEltBits);
APInt DemandedSrcElts = APInt::getZero(NumSrcElts);
+ APInt DoNotPoisonSrcElts = APInt::getZero(NumSrcElts);
for (unsigned i = 0; i != NumElts; ++i)
if (DemandedElts[i]) {
unsigned Offset = (i % Scale) * NumDstEltBits;
DemandedSrcBits.insertBits(DemandedBits, Offset);
DemandedSrcElts.setBit(i / Scale);
+ } else if (DoNotPoisonEltMask[i]) {
+ DoNotPoisonSrcElts.setBit(i / Scale);
}
if (SDValue V = SimplifyMultipleUseDemandedBits(
- Src, DemandedSrcBits, DemandedSrcElts, DAG, Depth + 1))
+ Src, DemandedSrcBits, DemandedSrcElts, DoNotPoisonSrcElts, DAG,
+ Depth + 1))
return DAG.getBitcast(DstVT, V);
}
@@ -835,12 +850,12 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
case ISD::SHL: {
// If we are only demanding sign bits then we can use the shift source
// directly.
- if (std::optional<unsigned> MaxSA =
- DAG.getValidMaximumShiftAmount(Op, DemandedElts, Depth + 1)) {
+ if (std::optional<unsigned> MaxSA = DAG.getValidMaximumShiftAmount(
+ Op, DemandedElts | DoNotPoisonEltMask, Depth + 1)) {
SDValue Op0 = Op.getOperand(0);
unsigned ShAmt = *MaxSA;
- unsigned NumSignBits =
- DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
+ unsigned NumSignBits = DAG.ComputeNumSignBits(
+ Op0, DemandedElts | DoNotPoisonEltMask, Depth + 1);
unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero();
if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= (UpperDemandedBits))
return Op0;
@@ -850,15 +865,15 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
case ISD::SRL: {
// If we are only demanding sign bits then we can use the shift source
// directly.
- if (std::optional<unsigned> MaxSA =
- DAG.getValidMaximumShiftAmount(Op, DemandedElts, Depth + 1)) {
+ if (std::optional<unsigned> MaxSA = DAG.getValidMaximumShiftAmount(
+ Op, DemandedElts | DoNotPoisonEltMask, Depth + 1)) {
SDValue Op0 = Op.getOperand(0);
unsigned ShAmt = *MaxSA;
// Must already be signbits in DemandedBits bounds, and can't demand any
// shifted in zeroes.
if (DemandedBits.countl_zero() >= ShAmt) {
- unsigned NumSignBits =
- DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
+ unsigned NumSignBits = DAG.ComputeNumSignBits(
+ Op0, DemandedElts | DoNotPoisonEltMask, Depth + 1);
if (DemandedBits.countr_zero() >= (BitWidth - NumSignBits))
return Op0;
}
@@ -895,7 +910,9 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
shouldRemoveRedundantExtend(Op))
return Op0;
// If the input is already sign extended, just drop the extension.
- unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
+ // FIXME: Can we skip DoNotPoisonEltMask here?
+ unsigned NumSignBits = DAG.ComputeNumSignBits(
+ Op0, DemandedElts | DoNotPoisonEltMask, Depth + 1);
if (NumSignBits >= (BitWidth - ExBits + 1))
return Op0;
break;
@@ -911,7 +928,8 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
SDValue Src = Op.getOperand(0);
EVT SrcVT = Src.getValueType();
EVT DstVT = Op.getValueType();
- if (IsLE && DemandedElts == 1 &&
+ // FIXME: Can we skip DoNotPoisonEltMask here?
+ if (IsLE && (DemandedElts | DoNotPoisonEltMask) == 1 &&
DstVT.getSizeInBits() == SrcVT.getSizeInBits() &&
DemandedBits.getActiveBits() <= SrcVT.getScalarSizeInBits()) {
return DAG.getBitcast(DstVT, Src);
@@ -926,8 +944,10 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
SDValue Vec = Op.getOperand(0);
auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
EVT VecVT = Vec.getValueType();
+ // FIXME: Handle DoNotPoisonEltMask better.
if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
- !DemandedElts[CIdx->getZExtValue()])
+ !DemandedElts[CIdx->getZExtValue()] &&
+ !DoNotPoisonEltMask[CIdx->getZExtValue()])
return Vec;
break;
}
@@ -940,8 +960,10 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
uint64_t Idx = Op.getConstantOperandVal(2);
unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
APInt DemandedSubElts = DemandedElts.extractBits(NumSubElts, Idx);
+ APInt DoNotPoisonSubElts = DoNotPoisonEltMask.extractBits(NumSubElts, Idx);
// If we don't demand the inserted subvector, return the base vector.
- if (DemandedSubElts == 0)
+ // FIXME: Handle DoNotPoisonEltMask better.
+ if (DemandedSubElts == 0 && DoNotPoisonSubElts == 0)
return Vec;
break;
}
@@ -954,9 +976,10 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
bool AllUndef = true, IdentityLHS = true, IdentityRHS = true;
for (unsigned i = 0; i != NumElts; ++i) {
int M = ShuffleMask[i];
- if (M < 0 || !DemandedElts[i])
+ if (M < 0 || (!DemandedElts[i] && !DoNotPoisonEltMask[i]))
continue;
- AllUndef = false;
+ if (DemandedElts[i])
+ AllUndef = false;
IdentityLHS &= (M == (int)i);
IdentityRHS &= ((M - NumElts) == i);
}
@@ -977,13 +1000,21 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
if (Op.getOpcode() >= ISD::BUILTIN_OP_END)
if (SDValue V = SimplifyMultipleUseDemandedBitsForTargetNode(
- Op, DemandedBits, DemandedElts, DAG, Depth))
+ Op, DemandedBits, DemandedElts | DoNotPoisonEltMask, DAG, Depth))
return V;
break;
}
return SDValue();
}
+SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
+ SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
+ SelectionDAG &DAG, unsigned Depth) const {
+ APInt DoNotPoisonEltMask = APInt::getZero(DemandedElts.getBitWidth());
+ return SimplifyMultipleUseDemandedBits(Op, DemandedBits, DemandedElts,
+ DoNotPoisonEltMask, DAG, Depth);
+}
+
SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
SDValue Op, const APInt &DemandedBits, SelectionDAG &DAG,
unsigned Depth) const {
@@ -994,15 +1025,17 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
APInt DemandedElts = VT.isFixedLengthVector()
? APInt::getAllOnes(VT.getVectorNumElements())
: APInt(1, 1);
- return SimplifyMultipleUseDemandedBits(Op, DemandedBits, DemandedElts, DAG,
- Depth);
+ APInt DoNotPoisonEltMask = APInt::getZero(DemandedElts.getBitWidth());
+ return SimplifyMultipleUseDemandedBits(Op, DemandedBits, DemandedElts,
+ DoNotPoisonEltMask, DAG, Depth);
}
SDValue TargetLowering::SimplifyMultipleUseDemandedVectorElts(
- SDValue Op, const APInt &DemandedElts, SelectionDAG &DAG,
- unsigned Depth) const {
+ SDValue Op, const APInt &DemandedElts, const APInt &DoNotPoisonEltMask,
+ SelectionDAG &DAG, unsigned Depth) const {
APInt DemandedBits = APInt::getAllOnes(Op.getScalarValueSizeInBits());
- return SimplifyMultipleUseDemandedBits(Op, DemandedBits, DemandedElts, DAG,
+ return SimplifyMultipleUseDemandedBits(Op, DemandedBits, DemandedElts,
+ DoNotPoisonEltMask, DAG,
Depth);
}
@@ -2766,8 +2799,7 @@ bool TargetLowering::SimplifyDemandedBits(
TLO.DAG.getNode(ISD::SHL, dl, VT, Sign, ShAmt));
}
}
-
- // Bitcast from a vector using SimplifyDemanded Bits/VectorElts.
+ // Bitcast from 'small element' src vector to 'large element' vector.
// Demand the elt/bit if any of the original elts/bits are demanded.
if (SrcVT.isVector() && (BitWidth % NumSrcEltBits) == 0) {
unsigned Scale = BitWidth / NumSrcEltBits;
@@ -2804,7 +2836,17 @@ bool TargetLowering::SimplifyDemandedBits(
DemandedSrcElts | DoNotPoisonSrcElts,
KnownSrcBits, TLO, Depth + 1))
return true;
+
+ if (!DemandedSrcBits.isAllOnes() || !DemandedSrcElts.isAllOnes()) {
+ if (SDValue DemandedSrc = SimplifyMultipleUseDemandedBits(
+ Src, DemandedSrcBits, DemandedSrcElts, DoNotPoisonSrcElts,
+ TLO.DAG, Depth + 1)) {
+ SDValue NewOp = TLO.DAG.getBitcast(VT, DemandedSrc);
+ return TLO.CombineTo(Op, NewOp);
+ }
+ }
} else if (IsLE && (NumSrcEltBits % BitWidth) == 0) {
+ // Bitcast from 'large element' src vector to 'small element' vector.
// TODO - bigendian once we have test coverage.
unsigned Scale = NumSrcEltBits / BitWidth;
unsigned NumSrcElts = SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1;
@@ -3237,9 +3279,9 @@ bool TargetLowering::SimplifyDemandedVectorElts(
// operands.
auto SimplifyDemandedVectorEltsBinOp = [&](SDValue Op0, SDValue Op1) {
SDValue NewOp0 = SimplifyMultipleUseDemandedVectorElts(
- Op0, DemandedEltsInclDoNotPoison, TLO.DAG, Depth + 1);
+ Op0, DemandedElts, DoNotPoisonEltMask, TLO.DAG, Depth + 1);
SDValue NewOp1 = SimplifyMultipleUseDemandedVectorElts(
- Op1, DemandedEltsInclDoNotPoison, TLO.DAG, Depth + 1);
+ Op1, DemandedElts, DoNotPoisonEltMask, TLO.DAG, Depth + 1);
if (NewOp0 || NewOp1) {
SDValue NewOp =
TLO.DAG.getNode(Opcode, SDLoc(Op), VT, NewOp0 ? NewOp0 : Op0,
@@ -3442,12 +3484,11 @@ bool TargetLowering::SimplifyDemandedVectorElts(
for (unsigned i = 0; i != NumSubVecs; ++i) {
SDValue SubOp = Op.getOperand(i);
APInt SubElts = DemandedElts.extractBits(NumSubElts, i * NumSubElts);
+ APInt DoNotPoisonSubElts =
+ DoNotPoisonEltMask.extractBits(NumSubElts, i * NumSubElts);
if (SubElts != 0) {
- APInt SubEltsInclDoNotPoison =
- DemandedEltsInclDoNotPoison.extractBits(NumSubElts,
- i * NumSubElts);
SDValue NewSubOp = SimplifyMultipleUseDemandedVectorElts(
- SubOp, SubEltsInclDoNotPoison, TLO.DAG, Depth + 1);
+ SubOp, SubElts, DoNotPoisonSubElts, TLO.DAG, Depth + 1);
DemandedSubOps.push_back(NewSubOp ? NewSubOp : SubOp);
FoundNewSub = NewSubOp ? true : FoundNewSub;
} else if (!SubOp.isUndef()) {
@@ -3509,9 +3550,9 @@ bool TargetLowering::SimplifyDemandedVectorElts(
// Attempt to avoid multi-use ops if we don't need anything from them.
if (!DemandedSrcElts.isAllOnes() || !DemandedSubElts.isAllOnes()) {
SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
- Src, DemandedSrcElts | DoNoPoisonSrcElts, TLO.DAG, Depth + 1);
+ Src, DemandedSrcElts, DoNoPoisonSrcElts, TLO.DAG, Depth + 1);
SDValue NewSub = SimplifyMultipleUseDemandedVectorElts(
- Sub, DemandedSubElts | DoNoPoisonSubElts, TLO.DAG, Depth + 1);
+ Sub, DemandedSubElts, DoNoPoisonSubElts, TLO.DAG, Depth + 1);
if (NewSrc || NewSub) {
NewSrc = NewSrc ? NewSrc : Src;
NewSub = NewSub ? NewSub : Sub;
@@ -3530,15 +3571,11 @@ bool TargetLowering::SimplifyDemandedVectorElts(
uint64_t Idx = Op.getConstantOperandVal(1);
unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
APInt DemandedSrcElts = DemandedElts.zext(NumSrcElts).shl(Idx);
- APInt DoNotPoisonDemandedSrcElts =
- DoNotPoisonEltMask.zext(NumSrcElts).shl(Idx);
- APInt DemandedSrcEltsInclDoNotPoison =
- DemandedEltsInclDoNotPoison.zext(NumSrcElts).shl(Idx);
+ APInt DoNotPoisonSrcElts = DoNotPoisonEltMask.zext(NumSrcElts).shl(Idx);
APInt SrcUndef, SrcZero;
- if (SimplifyDemandedVectorElts(Src, DemandedSrcElts,
- DoNotPoisonDemandedSrcElts, SrcUndef,
- SrcZero, TLO, Depth + 1))
+ if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, DoNotPoisonSrcElts,
+ SrcUndef, SrcZero, TLO, Depth + 1))
return true;
KnownUndef = SrcUndef.extractBits(NumElts, Idx);
KnownZero = SrcZero.extractBits(NumElts, Idx);
@@ -3546,7 +3583,7 @@ bool TargetLowering::SimplifyDemandedVectorElts(
// Attempt to avoid multi-use ops if we don't need anything from them.
if (!DemandedElts.isAllOnes()) {
SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
- Src, DemandedSrcEltsInclDoNotPoison, TLO.DAG, Depth + 1);
+ Src, DemandedSrcElts, DoNotPoisonSrcElts, TLO.DAG, Depth + 1);
if (NewSrc) {
SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, NewSrc,
Op.getOperand(1));
@@ -3805,7 +3842,7 @@ bool TargetLowering::SimplifyDemandedVectorElts(
// Attempt to avoid multi-use ops if we don't need anything from them.
// TODO - use KnownUndef to relax the demandedelts?
- if (!DemandedEltsInclDoNotPoison.isAllOnes())
+ if (!DemandedElts.isAllOnes())
if (SimplifyDemandedVectorEltsBinOp(Op0, Op1))
return true;
break;
@@ -3832,7 +3869,7 @@ bool TargetLowering::SimplifyDemandedVectorElts(
// Attempt to avoid multi-use ops if we don't need anything from them.
// TODO - use KnownUndef to relax the demandedelts?
- if (!DemandedEltsInclDoNotPoison.isAllOnes())
+ if (!DemandedElts.isAllOnes())
if (SimplifyDemandedVectorEltsBinOp(Op0, Op1))
return true;
break;
@@ -3873,7 +3910,7 @@ bool TargetLowering::SimplifyDemandedVectorElts(
KnownUndef &= ~KnownZero;
// Attempt to avoid multi-use ops if we don't need anything from them.
- if (!DemandedEltsInclDoNotPoison.isAllOnes())
+ if (!DemandedElts.isAllOnes())
if (SimplifyDemandedVectorEltsBinOp(Op0, Op1))
return true;
break;
@@ -3886,9 +3923,9 @@ bool TargetLowering::SimplifyDemandedVectorElts(
TLO, Depth + 1))
return true;
- if (!DemandedEltsInclDoNotPoison.isAllOnes())
+ if (!DemandedElts.isAllOnes())
if (SDValue NewOp = SimplifyMultipleUseDemandedVectorElts(
- Op.getOperand(0), DemandedEltsInclDoNotPoison, TLO.DAG,
+ Op.getOperand(0), DemandedElts, DoNotPoisonEltMask, TLO.DAG,
Depth + 1))
return TLO.CombineTo(Op, TLO.DAG.getNode(Opcode, SDLoc(Op), VT, NewOp));
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f3db2953e2b86..219c1b672417e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -41402,10 +41402,11 @@ static SDValue combineX86ShufflesRecursively(
// The Op itself may be of different VT, so we need to scale the mask.
unsigned NumOpElts = Op.getValueType().getVectorNumElements();
APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
+ APInt DoNotPoisonElts = APInt::getZero(NumOpElts);
// Can this operand be simplified any further, given it's demanded elements?
if (SDValue NewOp = TLI.SimplifyMultipleUseDemandedVectorElts(
- Op, OpScaledDemandedElts, DAG))
+ Op, OpScaledDemandedElts, DoNotPoisonElts, DAG))
Op = NewOp;
}
// FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
@@ -43418,12 +43419,13 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
// Aggressively peek through ops to get at the demanded elts.
if (!DemandedElts.isAllOnes()) {
unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
- APInt DemandedSrcElts =
- APIntOps::ScaleBitMask(DemandedElts | DoNotPoisonEltMask, NumSrcElts);
+ APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
+ APInt DoNotPoisonSrcElts =
+ APIntOps::ScaleBitMask(DoNotPoisonEltMask, NumSrcElts);
SDValue NewLHS = SimplifyMultipleUseDemandedVectorElts(
- LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
+ LHS, DemandedSrcElts, DoNotPoisonSrcElts, TLO.DAG, Depth + 1);
SDValue NewRHS = SimplifyMultipleUseDemandedVectorElts(
- RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
+ RHS, DemandedSrcElts, DoNotPoisonSrcElts, TLO.DAG, Depth + 1);
if (NewLHS || NewRHS) {
NewLHS = NewLHS ? NewLHS : LHS;
NewRHS = NewRHS ? NewRHS : RHS;
@@ -43476,7 +43478,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
// Aggressively peek through ops to get at the demanded elts.
if (!DemandedElts.isAllOnes())
if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
- Src, DemandedElts | DoNotPoisonEltMask, TLO.DAG, Depth + 1))
+ Src, DemandedElts, DoNotPoisonEltMask, TLO.DAG, Depth + 1))
return TLO.CombineTo(
Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
break;
@@ -43723,9 +43725,9 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
// TODO - we should do this for all target/faux shuffles ops.
if (!DemandedElts.isAllOnes()) {
SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(
- N0, DemandedLHS | DoNotPoisonLHS, TLO.DAG, Depth + 1);
+ N0, DemandedLHS, DoNotPoisonLHS, TLO.DAG, Depth + 1);
SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(
- N1, DemandedRHS | DoNotPoisonRHS, TLO.DAG, Depth + 1);
+ N1, DemandedRHS, DoNotPoisonRHS, TLO.DAG, Depth + 1);
if (NewN0 || NewN1) {
NewN0 = NewN0 ? NewN0 : N0;
NewN1 = NewN1 ? NewN1 : N1;
@@ -43763,9 +43765,9 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
// TODO: Handle repeated operands.
if (N0 != N1 && !DemandedElts.isAllOnes()) {
SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(
- N0, DemandedLHS | DoNotPoisonLHS, TLO.DAG, Depth + 1);
+ N0, DemandedLHS, DoNotPoisonLHS, TLO.DAG, Depth + 1);
SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(
- N1, DemandedRHS | DoNotPoisonRHS, TLO.DAG, Depth + 1);
+ N1, DemandedRHS, DoNotPoisonRHS, TLO.DAG, Depth + 1);
if (NewN0 || NewN1) {
NewN0 = NewN0 ? NewN0 : N0;
NewN1 = NewN1 ? NewN1 : N1;
@@ -43863,6 +43865,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
break;
APInt SrcUndef, SrcZero;
APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
+ // VBROADCAST only uses element zero. Allow poison in other elements.
APInt DoNotPoisonSrcElts = APInt::getZero(SrcVT.getVectorNumElements());
if (SimplifyDemandedVectorElts(Src, SrcElts, DoNotPoisonSrcElts, SrcUndef, SrcZero, TLO,
Depth + 1))
@@ -43870,7 +43873,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
// Aggressively peek through src to get at the demanded elt.
// TODO - we should do this for all target/faux shuffles ops.
if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
- Src, SrcElts, TLO.DAG, Depth + 1))
+ Src, SrcElts, DoNotPoisonSrcElts, TLO.DAG, Depth + 1))
return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
break;
}
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll
index ac54dd41b0962..d2f16721e6e47 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll
@@ -101,12 +101,13 @@ define i8 @test_v3i8(<3 x i8> %a) nounwind {
define i8 @test_v9i8(<9 x i8> %a) nounwind {
; CHECK-LABEL: test_v9i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v1.2d, #0xffffffffffffff00
+; CHECK-NEXT: movi v1.2d, #0xffffff00ffffff00
+; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: orr v1.16b, v0.16b, v1.16b
; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: and x8, x8, x8, lsr #32
+; CHECK-NEXT: fmov x9, d0
+; CHECK-NEXT: and x8, x9, x8, lsr #32
; CHECK-NEXT: and x8, x8, x8, lsr #16
; CHECK-NEXT: lsr x9, x8, #8
; CHECK-NEXT: and w0, w8, w9
@@ -118,14 +119,12 @@ define i8 @test_v9i8(<9 x i8> %a) nounwind {
define i32 @test_v3i32(<3 x i32> %a) nounwind {
; CHECK-LABEL: test_v3i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov v1.16b, v0.16b
-; CHECK-NEXT: mov w8, #-1 // =0xffffffff
-; CHECK-NEXT: mov v1.s[3], w8
-; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: lsr x9, x8, #32
-; CHECK-NEXT: and w0, w8, w9
+; CHECK-NEXT: lsr x8, x8, #32
+; CHECK-NEXT: and v1.8b, v0.8b, v1.8b
+; CHECK-NEXT: fmov x9, d1
+; CHECK-NEXT: and w0, w9, w8
; CHECK-NEXT: ret
%b = call i32 @llvm.vector.reduce.and.v3i32(<3 x i32> %a)
ret i32 %b
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v2bf16.ll
index 5914253b5f58e..8e24e581c7ea4 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v2bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v2bf16.ll
@@ -985,9 +985,8 @@ define void @v_shuffle_v3bf16_v2bf16__u_1_1(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v1
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v0, v1, s[16:17]
-; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -998,9 +997,8 @@ define void @v_shuffle_v3bf16_v2bf16__u_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v1
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v0, v1, s[16:17]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1011,9 +1009,8 @@ define void @v_shuffle_v3bf16_v2bf16__u_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v1
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX942-NEXT: global_store_short v0, v1, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x bfloat> asm "; def $0", "=v"()
@@ -1030,9 +1027,8 @@ define void @v_shuffle_v3bf16_v2bf16__0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v1
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v0, v1, s[16:17]
-; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1043,9 +1039,8 @@ define void @v_shuffle_v3bf16_v2bf16__0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v1
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v0, v1, s[16:17]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1056,9 +1051,8 @@ define void @v_shuffle_v3bf16_v2bf16__0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v1
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX942-NEXT: global_store_short v0, v1, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x bfloat> asm "; def $0", "=v"()
@@ -1126,9 +1120,8 @@ define void @v_shuffle_v3bf16_v2bf16__2_1_1(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v1
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v0, v1, s[16:17]
-; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1139,9 +1132,8 @@ define void @v_shuffle_v3bf16_v2bf16__2_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v1
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v0, v1, s[16:17]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1152,9 +1144,8 @@ define void @v_shuffle_v3bf16_v2bf16__2_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v1
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX942-NEXT: global_store_short v0, v1, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x bfloat> asm "; def $0", "=v"()
@@ -1713,9 +1704,8 @@ define void @v_shuffle_v3bf16_v2bf16__u_3_3(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v1
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v0, v1, s[16:17]
-; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1726,9 +1716,8 @@ define void @v_shuffle_v3bf16_v2bf16__u_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v1
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v0, v1, s[16:17]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1739,9 +1728,8 @@ define void @v_shuffle_v3bf16_v2bf16__u_3_3(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v1
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX942-NEXT: global_store_short v0, v1, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x bfloat> asm "; def $0", "=v"()
@@ -1882,9 +1870,8 @@ define void @v_shuffle_v3bf16_v2bf16__2_3_3(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v1
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v0, v1, s[16:17]
-; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1895,9 +1882,8 @@ define void @v_shuffle_v3bf16_v2bf16__2_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v1
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v0, v1, s[16:17]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1908,9 +1894,8 @@ define void @v_shuffle_v3bf16_v2bf16__2_3_3(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v1
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX942-NEXT: global_store_short v0, v1, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x bfloat> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v3bf16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v3bf16.ll
index cd4dbe93e8a11..771c233df116a 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v3bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v3bf16.ll
@@ -1562,9 +1562,8 @@ define void @v_shuffle_v3bf16_v3bf16__u_1_1(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v2, v0, s[16:17]
-; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1575,9 +1574,8 @@ define void @v_shuffle_v3bf16_v3bf16__u_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v2, v0, s[16:17]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1588,9 +1586,8 @@ define void @v_shuffle_v3bf16_v3bf16__u_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v2, v0, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX942-NEXT: global_store_short v2, v0, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x bfloat> asm "; def $0", "=v"()
@@ -1608,9 +1605,8 @@ define void @v_shuffle_v3bf16_v3bf16__0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v2, v0, s[16:17]
-; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1621,9 +1617,8 @@ define void @v_shuffle_v3bf16_v3bf16__0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v2, v0, s[16:17]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1634,9 +1629,8 @@ define void @v_shuffle_v3bf16_v3bf16__0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v2, v0, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX942-NEXT: global_store_short v2, v0, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x bfloat> asm "; def $0", "=v"()
@@ -1758,9 +1752,8 @@ define void @v_shuffle_v3bf16_v3bf16__3_1_1(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v2, v0, s[16:17]
-; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1771,9 +1764,8 @@ define void @v_shuffle_v3bf16_v3bf16__3_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v2, v0, s[16:17]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1784,9 +1776,8 @@ define void @v_shuffle_v3bf16_v3bf16__3_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v2, v0, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX942-NEXT: global_store_short v2, v0, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x bfloat> asm "; def $0", "=v"()
@@ -3418,9 +3409,8 @@ define void @v_shuffle_v3bf16_v3bf16__u_4_4(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v2, v0, s[16:17]
-; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -3431,9 +3421,8 @@ define void @v_shuffle_v3bf16_v3bf16__u_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v2, v0, s[16:17]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3444,9 +3433,8 @@ define void @v_shuffle_v3bf16_v3bf16__u_4_4(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v2, v0, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX942-NEXT: global_store_short v2, v0, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x bfloat> asm "; def $0", "=v"()
@@ -3658,9 +3646,8 @@ define void @v_shuffle_v3bf16_v3bf16__3_4_4(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v2, v0, s[16:17]
-; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -3671,9 +3658,8 @@ define void @v_shuffle_v3bf16_v3bf16__3_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v2, v0, s[16:17]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3684,9 +3670,8 @@ define void @v_shuffle_v3bf16_v3bf16__3_4_4(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v2, v0, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX942-NEXT: global_store_short v2, v0, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x bfloat> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v4bf16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v4bf16.ll
index 311ca98227da3..e9859ec1e5119 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v4bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v4bf16.ll
@@ -2008,9 +2008,8 @@ define void @v_shuffle_v3bf16_v4bf16__u_1_1(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v2, v0, s[16:17]
-; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2021,9 +2020,8 @@ define void @v_shuffle_v3bf16_v4bf16__u_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v2, v0, s[16:17]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2034,9 +2032,8 @@ define void @v_shuffle_v3bf16_v4bf16__u_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v2, v0, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX942-NEXT: global_store_short v2, v0, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x bfloat> asm "; def $0", "=v"()
@@ -2053,9 +2050,8 @@ define void @v_shuffle_v3bf16_v4bf16__0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v2, v0, s[16:17]
-; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2066,9 +2062,8 @@ define void @v_shuffle_v3bf16_v4bf16__0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v2, v0, s[16:17]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2079,9 +2074,8 @@ define void @v_shuffle_v3bf16_v4bf16__0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v2, v0, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX942-NEXT: global_store_short v2, v0, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x bfloat> asm "; def $0", "=v"()
@@ -2251,9 +2245,8 @@ define void @v_shuffle_v3bf16_v4bf16__4_1_1(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v2, v0, s[16:17]
-; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2264,9 +2257,8 @@ define void @v_shuffle_v3bf16_v4bf16__4_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v2, v0, s[16:17]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2277,9 +2269,8 @@ define void @v_shuffle_v3bf16_v4bf16__4_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v2, v0, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX942-NEXT: global_store_short v2, v0, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x bfloat> asm "; def $0", "=v"()
@@ -5314,9 +5305,8 @@ define void @v_shuffle_v3bf16_v4bf16__u_5_5(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v2, v0, s[16:17]
-; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -5327,9 +5317,8 @@ define void @v_shuffle_v3bf16_v4bf16__u_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v2, v0, s[16:17]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5340,9 +5329,8 @@ define void @v_shuffle_v3bf16_v4bf16__u_5_5(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v2, v0, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX942-NEXT: global_store_short v2, v0, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x bfloat> asm "; def $0", "=v"()
@@ -5608,9 +5596,8 @@ define void @v_shuffle_v3bf16_v4bf16__4_5_5(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v2, v0, s[16:17]
-; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -5621,9 +5608,8 @@ define void @v_shuffle_v3bf16_v4bf16__4_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v2, v0, s[16:17]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5634,9 +5620,8 @@ define void @v_shuffle_v3bf16_v4bf16__4_5_5(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v2, v0, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX942-NEXT: global_store_short v2, v0, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x bfloat> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v2f16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v2f16.ll
index 99cb8a38f57c3..3398894e478dd 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v2f16.ll
@@ -985,9 +985,8 @@ define void @v_shuffle_v3f16_v2f16__u_1_1(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v1
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v0, v1, s[16:17]
-; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -998,9 +997,8 @@ define void @v_shuffle_v3f16_v2f16__u_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v1
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v0, v1, s[16:17]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1011,9 +1009,8 @@ define void @v_shuffle_v3f16_v2f16__u_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v1
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX942-NEXT: global_store_short v0, v1, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x half> asm "; def $0", "=v"()
@@ -1030,9 +1027,8 @@ define void @v_shuffle_v3f16_v2f16__0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v1
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v0, v1, s[16:17]
-; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1043,9 +1039,8 @@ define void @v_shuffle_v3f16_v2f16__0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v1
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v0, v1, s[16:17]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1056,9 +1051,8 @@ define void @v_shuffle_v3f16_v2f16__0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v1
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX942-NEXT: global_store_short v0, v1, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x half> asm "; def $0", "=v"()
@@ -1126,9 +1120,8 @@ define void @v_shuffle_v3f16_v2f16__2_1_1(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v1
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v0, v1, s[16:17]
-; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1139,9 +1132,8 @@ define void @v_shuffle_v3f16_v2f16__2_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v1
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v0, v1, s[16:17]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1152,9 +1144,8 @@ define void @v_shuffle_v3f16_v2f16__2_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v1
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX942-NEXT: global_store_short v0, v1, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x half> asm "; def $0", "=v"()
@@ -1713,9 +1704,8 @@ define void @v_shuffle_v3f16_v2f16__u_3_3(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v1
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v0, v1, s[16:17]
-; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1726,9 +1716,8 @@ define void @v_shuffle_v3f16_v2f16__u_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v1
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v0, v1, s[16:17]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1739,9 +1728,8 @@ define void @v_shuffle_v3f16_v2f16__u_3_3(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v1
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX942-NEXT: global_store_short v0, v1, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x half> asm "; def $0", "=v"()
@@ -1882,9 +1870,8 @@ define void @v_shuffle_v3f16_v2f16__2_3_3(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v1
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v0, v1, s[16:17]
-; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1895,9 +1882,8 @@ define void @v_shuffle_v3f16_v2f16__2_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v1
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v0, v1, s[16:17]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1908,9 +1894,8 @@ define void @v_shuffle_v3f16_v2f16__2_3_3(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v1
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX942-NEXT: global_store_short v0, v1, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x half> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v3f16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v3f16.ll
index 0854ff2ebfc5d..b8115490be536 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v3f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v3f16.ll
@@ -1562,9 +1562,8 @@ define void @v_shuffle_v3f16_v3f16__u_1_1(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v2, v0, s[16:17]
-; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1575,9 +1574,8 @@ define void @v_shuffle_v3f16_v3f16__u_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v2, v0, s[16:17]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1588,9 +1586,8 @@ define void @v_shuffle_v3f16_v3f16__u_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v2, v0, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX942-NEXT: global_store_short v2, v0, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x half> asm "; def $0", "=v"()
@@ -1608,9 +1605,8 @@ define void @v_shuffle_v3f16_v3f16__0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v2, v0, s[16:17]
-; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1621,9 +1617,8 @@ define void @v_shuffle_v3f16_v3f16__0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v2, v0, s[16:17]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1634,9 +1629,8 @@ define void @v_shuffle_v3f16_v3f16__0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v2, v0, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX942-NEXT: global_store_short v2, v0, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x half> asm "; def $0", "=v"()
@@ -1758,9 +1752,8 @@ define void @v_shuffle_v3f16_v3f16__3_1_1(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v2, v0, s[16:17]
-; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1771,9 +1764,8 @@ define void @v_shuffle_v3f16_v3f16__3_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v2, v0, s[16:17]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1784,9 +1776,8 @@ define void @v_shuffle_v3f16_v3f16__3_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v2, v0, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX942-NEXT: global_store_short v2, v0, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x half> asm "; def $0", "=v"()
@@ -3418,9 +3409,8 @@ define void @v_shuffle_v3f16_v3f16__u_4_4(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v2, v0, s[16:17]
-; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -3431,9 +3421,8 @@ define void @v_shuffle_v3f16_v3f16__u_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v2, v0, s[16:17]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3444,9 +3433,8 @@ define void @v_shuffle_v3f16_v3f16__u_4_4(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v2, v0, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX942-NEXT: global_store_short v2, v0, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x half> asm "; def $0", "=v"()
@@ -3658,9 +3646,8 @@ define void @v_shuffle_v3f16_v3f16__3_4_4(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v2, v0, s[16:17]
-; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -3671,9 +3658,8 @@ define void @v_shuffle_v3f16_v3f16__3_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v2, v0, s[16:17]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3684,9 +3670,8 @@ define void @v_shuffle_v3f16_v3f16__3_4_4(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v2, v0, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX942-NEXT: global_store_short v2, v0, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x half> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v4f16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v4f16.ll
index ecc7ff618932b..3b262a65b9f82 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v4f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v4f16.ll
@@ -2008,9 +2008,8 @@ define void @v_shuffle_v3f16_v4f16__u_1_1(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v2, v0, s[16:17]
-; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2021,9 +2020,8 @@ define void @v_shuffle_v3f16_v4f16__u_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v2, v0, s[16:17]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2034,9 +2032,8 @@ define void @v_shuffle_v3f16_v4f16__u_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v2, v0, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX942-NEXT: global_store_short v2, v0, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x half> asm "; def $0", "=v"()
@@ -2053,9 +2050,8 @@ define void @v_shuffle_v3f16_v4f16__0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v2, v0, s[16:17]
-; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2066,9 +2062,8 @@ define void @v_shuffle_v3f16_v4f16__0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v2, v0, s[16:17]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2079,9 +2074,8 @@ define void @v_shuffle_v3f16_v4f16__0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v2, v0, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX942-NEXT: global_store_short v2, v0, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x half> asm "; def $0", "=v"()
@@ -2251,9 +2245,8 @@ define void @v_shuffle_v3f16_v4f16__4_1_1(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v2, v0, s[16:17]
-; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2264,9 +2257,8 @@ define void @v_shuffle_v3f16_v4f16__4_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v2, v0, s[16:17]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2277,9 +2269,8 @@ define void @v_shuffle_v3f16_v4f16__4_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v2, v0, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX942-NEXT: global_store_short v2, v0, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x half> asm "; def $0", "=v"()
@@ -5314,9 +5305,8 @@ define void @v_shuffle_v3f16_v4f16__u_5_5(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v2, v0, s[16:17]
-; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -5327,9 +5317,8 @@ define void @v_shuffle_v3f16_v4f16__u_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v2, v0, s[16:17]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5340,9 +5329,8 @@ define void @v_shuffle_v3f16_v4f16__u_5_5(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v2, v0, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX942-NEXT: global_store_short v2, v0, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x half> asm "; def $0", "=v"()
@@ -5608,9 +5596,8 @@ define void @v_shuffle_v3f16_v4f16__4_5_5(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v2, v0, s[16:17]
-; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -5621,9 +5608,8 @@ define void @v_shuffle_v3f16_v4f16__4_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v2, v0, s[16:17]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5634,9 +5620,8 @@ define void @v_shuffle_v3f16_v4f16__4_5_5(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v2, v0, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX942-NEXT: global_store_short v2, v0, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x half> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
index 4fc1e06a14983..9f805c19d680a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
@@ -263,21 +263,21 @@ define void @vst3_v2i16(ptr %src, ptr %dst) {
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: ldrh r2, [r0, #10]
; CHECK-NEXT: ldrh r4, [r0, #8]
+; CHECK-NEXT: ldrh r3, [r0, #6]
; CHECK-NEXT: ldrh.w r12, [r0, #2]
-; CHECK-NEXT: ldrh.w lr, [r0]
; CHECK-NEXT: vmov q0[2], q0[0], r4, r2
-; CHECK-NEXT: ldrh r3, [r0, #6]
+; CHECK-NEXT: ldrh.w lr, [r0]
; CHECK-NEXT: ldrh r0, [r0, #4]
; CHECK-NEXT: vmov q1[2], q1[0], r0, r3
-; CHECK-NEXT: vmov q2, q1
-; CHECK-NEXT: vmovnt.i32 q2, q0
+; CHECK-NEXT: vmovnt.i32 q1, q0
; CHECK-NEXT: vmov q0[2], q0[0], lr, r12
-; CHECK-NEXT: vmov r0, s10
+; CHECK-NEXT: vmov r2, s6
+; CHECK-NEXT: vmov.32 q1[0], r0
; CHECK-NEXT: vmov.f32 s1, s4
; CHECK-NEXT: vmov.f32 s3, s2
; CHECK-NEXT: vmov.32 q0[2], r4
; CHECK-NEXT: vstrh.32 q0, [r1]
-; CHECK-NEXT: str r0, [r1, #8]
+; CHECK-NEXT: str r2, [r1, #8]
; CHECK-NEXT: pop {r4, pc}
entry:
%l1 = load <2 x i16>, ptr %src, align 4
@@ -1177,20 +1177,20 @@ define void @vst3_v2f16(ptr %src, ptr %dst) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldrd r2, r3, [r0]
; CHECK-NEXT: ldr r0, [r0, #8]
-; CHECK-NEXT: vmov.32 q1[0], r2
-; CHECK-NEXT: vmov q0, q1
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vmov.32 q2[0], r0
-; CHECK-NEXT: vmov.32 q0[1], r3
-; CHECK-NEXT: vmovx.f16 s2, s4
-; CHECK-NEXT: vmov.f32 s0, s4
-; CHECK-NEXT: vmovx.f16 s4, s8
-; CHECK-NEXT: vins.f16 s8, s2
-; CHECK-NEXT: vmovx.f16 s2, s1
-; CHECK-NEXT: vins.f16 s0, s1
-; CHECK-NEXT: vmov.f32 s1, s8
-; CHECK-NEXT: vins.f16 s2, s4
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r0, r2, d0
+; CHECK-NEXT: vmov.32 q1[1], r3
+; CHECK-NEXT: vmovx.f16 s2, s8
+; CHECK-NEXT: vmov.f32 s4, s0
+; CHECK-NEXT: vmovx.f16 s0, s0
+; CHECK-NEXT: vins.f16 s8, s0
+; CHECK-NEXT: vmovx.f16 s6, s5
+; CHECK-NEXT: vins.f16 s4, s5
+; CHECK-NEXT: vmov.f32 s5, s8
+; CHECK-NEXT: vins.f16 s6, s2
+; CHECK-NEXT: vmov r3, s6
+; CHECK-NEXT: vmov r0, r2, d2
; CHECK-NEXT: stm r1!, {r0, r2, r3}
; CHECK-NEXT: bx lr
entry:
diff --git a/llvm/test/CodeGen/X86/avx512fp16-mov.ll b/llvm/test/CodeGen/X86/avx512fp16-mov.ll
index 5e6c11e7fa100..df59a922fa40e 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-mov.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-mov.ll
@@ -1487,27 +1487,27 @@ define <8 x half> @movsh(<8 x half> %a, <8 x half> %b) {
; X64-NOVL-NEXT: vaddsh %xmm4, %xmm5, %xmm4
; X64-NOVL-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; X64-NOVL-NEXT: vpsrldq {{.*#+}} xmm4 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-NOVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; X64-NOVL-NEXT: vaddsh %xmm4, %xmm0, %xmm0
-; X64-NOVL-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,0]
-; X64-NOVL-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
+; X64-NOVL-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[3,3,3,3,4,5,6,7]
; X64-NOVL-NEXT: vaddsh %xmm4, %xmm5, %xmm4
-; X64-NOVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; X64-NOVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; X64-NOVL-NEXT: vpsrlq $48, %xmm1, %xmm3
-; X64-NOVL-NEXT: vpsrlq $48, %xmm2, %xmm4
-; X64-NOVL-NEXT: vaddsh %xmm3, %xmm4, %xmm3
-; X64-NOVL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; X64-NOVL-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; X64-NOVL-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0]
+; X64-NOVL-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,0,1]
+; X64-NOVL-NEXT: vaddsh %xmm5, %xmm6, %xmm5
+; X64-NOVL-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; X64-NOVL-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; X64-NOVL-NEXT: vpsrlq $48, %xmm1, %xmm4
+; X64-NOVL-NEXT: vpsrlq $48, %xmm2, %xmm5
; X64-NOVL-NEXT: vaddsh %xmm4, %xmm5, %xmm4
-; X64-NOVL-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; X64-NOVL-NEXT: vaddsh %xmm1, %xmm2, %xmm4
+; X64-NOVL-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; X64-NOVL-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
+; X64-NOVL-NEXT: vaddsh %xmm5, %xmm6, %xmm5
+; X64-NOVL-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; X64-NOVL-NEXT: vaddsh %xmm1, %xmm0, %xmm0
; X64-NOVL-NEXT: vpsrld $16, %xmm1, %xmm1
; X64-NOVL-NEXT: vpsrld $16, %xmm2, %xmm2
; X64-NOVL-NEXT: vaddsh %xmm1, %xmm2, %xmm1
-; X64-NOVL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
-; X64-NOVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; X64-NOVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X64-NOVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NOVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; X64-NOVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; X64-NOVL-NEXT: retq
%res1 = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 7, i32 0, i32 1, i32 2, i32 3, i32 7, i32 5>
%res2 = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll
index ce68eebd5b752..7a1535bd6b3f1 100644
--- a/llvm/test/CodeGen/X86/extractelement-load.ll
+++ b/llvm/test/CodeGen/X86/extractelement-load.ll
@@ -558,13 +558,14 @@ define dso_local <2 x float> @multiuse_of_single_value_from_vbroadcast_load(ptr
; X64-SSSE3-NEXT: pushq %rbx
; X64-SSSE3-NEXT: subq $16, %rsp
; X64-SSSE3-NEXT: movq %rsi, %rbx
-; X64-SSSE3-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
-; X64-SSSE3-NEXT: movapd %xmm0, (%rsp) # 16-byte Spill
-; X64-SSSE3-NEXT: movlpd %xmm0, (%rdi)
+; X64-SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X64-SSSE3-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; X64-SSSE3-NEXT: movsd %xmm0, (%rdi)
; X64-SSSE3-NEXT: movaps 32(%rsi), %xmm0
; X64-SSSE3-NEXT: callq ccosf at PLT
; X64-SSSE3-NEXT: movlps %xmm0, 32(%rbx)
-; X64-SSSE3-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; X64-SSSE3-NEXT: movddup (%rsp), %xmm0 # 16-byte Folded Reload
+; X64-SSSE3-NEXT: # xmm0 = mem[0,0]
; X64-SSSE3-NEXT: callq ccosf at PLT
; X64-SSSE3-NEXT: addq $16, %rsp
; X64-SSSE3-NEXT: popq %rbx
diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll
index b6a4a12eb0fac..a66e584380d26 100644
--- a/llvm/test/CodeGen/X86/half.ll
+++ b/llvm/test/CodeGen/X86/half.ll
@@ -1114,8 +1114,8 @@ define void @main.45() #0 {
; CHECK-LIBCALL-NEXT: pushq %rbx
; CHECK-LIBCALL-NEXT: subq $16, %rsp
; CHECK-LIBCALL-NEXT: pinsrw $0, (%rax), %xmm0
+; CHECK-LIBCALL-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
; CHECK-LIBCALL-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
-; CHECK-LIBCALL-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
; CHECK-LIBCALL-NEXT: movq %xmm1, %rbx
; CHECK-LIBCALL-NEXT: movq %rbx, %r14
; CHECK-LIBCALL-NEXT: shrq $48, %r14
@@ -1127,14 +1127,14 @@ define void @main.45() #0 {
; CHECK-LIBCALL-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %eax
; CHECK-LIBCALL-NEXT: movl $32256, %ecx # imm = 0x7E00
-; CHECK-LIBCALL-NEXT: cmovpl %ecx, %eax
; CHECK-LIBCALL-NEXT: cmovpl %ecx, %ebx
; CHECK-LIBCALL-NEXT: cmovpl %ecx, %r15d
; CHECK-LIBCALL-NEXT: cmovpl %ecx, %r14d
+; CHECK-LIBCALL-NEXT: cmovpl %ecx, %eax
+; CHECK-LIBCALL-NEXT: movw %ax, (%rax)
; CHECK-LIBCALL-NEXT: movw %r14w, (%rax)
; CHECK-LIBCALL-NEXT: movw %r15w, (%rax)
; CHECK-LIBCALL-NEXT: movw %bx, (%rax)
-; CHECK-LIBCALL-NEXT: movw %ax, (%rax)
; CHECK-LIBCALL-NEXT: addq $16, %rsp
; CHECK-LIBCALL-NEXT: popq %rbx
; CHECK-LIBCALL-NEXT: popq %r14
diff --git a/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll b/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
index 4639650480494..3ece4beb9c22e 100644
--- a/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
+++ b/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
@@ -312,32 +312,30 @@ define <1 x i64> @test_scalar_no_ifma(<1 x i64> %x, <1 x i64> %y, <1 x i64> %z)
define <8 x i64> @test_mixed_width_too_wide(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
; AVXIFMA-LABEL: test_mixed_width_too_wide:
; AVXIFMA: # %bb.0:
-; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm6 = [255,255,255,255,255,0,0,0,255,255,255,255,255,0,0,0,255,255,255,255,255,0,0,0,255,255,255,255,255,0,0,0]
-; AVXIFMA-NEXT: vpand %ymm6, %ymm0, %ymm7
-; AVXIFMA-NEXT: vpand %ymm6, %ymm1, %ymm6
-; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm8 = [8191,8191,8191,8191]
-; AVXIFMA-NEXT: vpand %ymm2, %ymm8, %ymm2
-; AVXIFMA-NEXT: vpand %ymm3, %ymm8, %ymm3
-; AVXIFMA-NEXT: vpsrlq $32, %ymm6, %ymm6
-; AVXIFMA-NEXT: vpmuludq %ymm3, %ymm6, %ymm6
-; AVXIFMA-NEXT: vpsllq $32, %ymm6, %ymm6
+; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8191,8191,8191,8191]
+; AVXIFMA-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVXIFMA-NEXT: vpand %ymm6, %ymm3, %ymm3
+; AVXIFMA-NEXT: vpmovzxdq {{.*#+}} ymm6 = [2155905028,2155905036,2155905044,2155905052]
+; AVXIFMA-NEXT: vpshufb %ymm6, %ymm1, %ymm7
+; AVXIFMA-NEXT: vpmuludq %ymm3, %ymm7, %ymm7
+; AVXIFMA-NEXT: vpsllq $32, %ymm7, %ymm7
; AVXIFMA-NEXT: vpmuludq %ymm3, %ymm1, %ymm1
-; AVXIFMA-NEXT: vpsrlq $32, %ymm7, %ymm3
+; AVXIFMA-NEXT: vpshufb %ymm6, %ymm0, %ymm3
; AVXIFMA-NEXT: vpmuludq %ymm2, %ymm3, %ymm3
; AVXIFMA-NEXT: vpsllq $32, %ymm3, %ymm3
; AVXIFMA-NEXT: vpmuludq %ymm2, %ymm0, %ymm0
; AVXIFMA-NEXT: vpaddq %ymm0, %ymm4, %ymm0
; AVXIFMA-NEXT: vpaddq %ymm3, %ymm0, %ymm0
; AVXIFMA-NEXT: vpaddq %ymm1, %ymm5, %ymm1
-; AVXIFMA-NEXT: vpaddq %ymm6, %ymm1, %ymm1
+; AVXIFMA-NEXT: vpaddq %ymm7, %ymm1, %ymm1
; AVXIFMA-NEXT: retq
;
; AVX512-LABEL: test_mixed_width_too_wide:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
; AVX512-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm3
; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0
+; AVX512-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpsllq $32, %zmm0, %zmm0
; AVX512-NEXT: vpaddq %zmm3, %zmm2, %zmm1
@@ -346,10 +344,10 @@ define <8 x i64> @test_mixed_width_too_wide(<8 x i64> %x, <8 x i64> %y, <8 x i64
;
; AVX512-NOIFMA-LABEL: test_mixed_width_too_wide:
; AVX512-NOIFMA: # %bb.0:
-; AVX512-NOIFMA-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
; AVX512-NOIFMA-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
; AVX512-NOIFMA-NEXT: vpmuludq %zmm1, %zmm0, %zmm3
; AVX512-NOIFMA-NEXT: vpsrlq $32, %zmm0, %zmm0
+; AVX512-NOIFMA-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
; AVX512-NOIFMA-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512-NOIFMA-NEXT: vpsllq $32, %zmm0, %zmm0
; AVX512-NOIFMA-NEXT: vpaddq %zmm3, %zmm2, %zmm1
diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll
index 0b9029d47b24d..5e35d9c204b1a 100644
--- a/llvm/test/CodeGen/X86/movmsk-cmp.ll
+++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll
@@ -4533,7 +4533,6 @@ define i32 @pr67287(<2 x i64> %broadcast.splatinsert25) {
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2]
-; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: movmskpd %xmm0, %eax
; SSE2-NEXT: testl %eax, %eax
; SSE2-NEXT: setne %al
diff --git a/llvm/test/CodeGen/X86/pr107423.ll b/llvm/test/CodeGen/X86/pr107423.ll
index 5c43276f0dc38..d5119d45f97c0 100644
--- a/llvm/test/CodeGen/X86/pr107423.ll
+++ b/llvm/test/CodeGen/X86/pr107423.ll
@@ -7,21 +7,21 @@ define void @PR107423(<64 x i8> %arg, ptr %p0) {
; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
; CHECK-NEXT: vpsllw $8, %xmm2, %xmm2
; CHECK-NEXT: vpsllw $8, %xmm1, %xmm3
-; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm4
-; CHECK-NEXT: vpaddb %xmm2, %xmm3, %xmm2
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
-; CHECK-NEXT: vpaddb %xmm3, %xmm0, %xmm0
-; CHECK-NEXT: vpaddb %xmm1, %xmm3, %xmm1
+; CHECK-NEXT: vpaddb %xmm2, %xmm3, %xmm3
+; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm2
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm4
+; CHECK-NEXT: vpaddb %xmm1, %xmm4, %xmm1
+; CHECK-NEXT: vpaddb %xmm4, %xmm0, %xmm4
+; CHECK-NEXT: vpsllw $8, %xmm4, %xmm4
+; CHECK-NEXT: vpaddb %xmm3, %xmm4, %xmm3
; CHECK-NEXT: vpsllw $8, %xmm1, %xmm1
-; CHECK-NEXT: vpaddb %xmm4, %xmm1, %xmm1
-; CHECK-NEXT: vpsllw $8, %xmm0, %xmm0
-; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: vpsllw $8, %xmm1, %xmm2
-; CHECK-NEXT: vpaddb %xmm1, %xmm2, %xmm1
-; CHECK-NEXT: vmovdqu %xmm0, (%rdi)
-; CHECK-NEXT: vmovdqu %xmm1, 16(%rdi)
+; CHECK-NEXT: vpor %xmm3, %xmm2, %xmm2
+; CHECK-NEXT: vpsllw $8, %xmm0, %xmm0
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vmovdqu %xmm0, 16(%rdi)
+; CHECK-NEXT: vmovdqu %xmm2, (%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%i3 = bitcast <64 x i8> %arg to <32 x i16>
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
index 038c73bd9fed2..c82e2b322e55a 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
@@ -2803,17 +2803,16 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm12 = ymm11[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1,2,3,4,5,6,7],ymm12[8],ymm10[9,10,11,12,13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm13, %xmm14
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[3,1,2,3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,3,4,5,6,7]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
-; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6],ymm12[7,8,9,10,11,12,13],ymm11[14],ymm12[15]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4,5,6,7],ymm11[8],ymm10[9,10,11,12,13,14,15]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],mem[6],xmm11[7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm12, %xmm13
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,2,3,4,5,6,7]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7]
; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm12
@@ -2971,9 +2970,8 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1]
; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm15 = ymm14[0,1,2,3,4,5],ymm15[6],ymm14[7,8,9,10,11,12,13],ymm15[14],ymm14[15]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm15 = xmm14[0,1,2,3,4,5],mem[6],xmm14[7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm15, %xmm4
@@ -5563,112 +5561,108 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-LABEL: load_i16_stride7_vf32:
; AVX2: # %bb.0:
; AVX2-NEXT: subq $520, %rsp # imm = 0x208
-; AVX2-NEXT: vmovdqa 288(%rdi), %ymm14
-; AVX2-NEXT: vmovdqa 320(%rdi), %ymm9
+; AVX2-NEXT: vmovdqa 288(%rdi), %ymm10
+; AVX2-NEXT: vmovdqa 320(%rdi), %ymm14
; AVX2-NEXT: vmovdqa 256(%rdi), %ymm5
-; AVX2-NEXT: vmovdqa 224(%rdi), %ymm13
-; AVX2-NEXT: vmovdqa (%rdi), %ymm7
+; AVX2-NEXT: vmovdqa 224(%rdi), %ymm11
+; AVX2-NEXT: vmovdqa (%rdi), %ymm15
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm6
-; AVX2-NEXT: vmovdqa 64(%rdi), %ymm11
+; AVX2-NEXT: vmovdqa 64(%rdi), %ymm9
; AVX2-NEXT: vmovdqa 96(%rdi), %ymm8
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm11[1],ymm8[2,3,4],ymm11[5],ymm8[6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19]
; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm6[2],ymm7[3,4,5],ymm6[6],ymm7[7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm6[2],ymm15[3,4,5],ymm6[6],ymm15[7]
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9]
; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm3
-; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,0,0,0]
-; AVX2-NEXT: vpblendvb %ymm12, %ymm3, %ymm2, %ymm1
-; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,0,0]
+; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqa %ymm5, %ymm3
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm5[2],ymm13[3,4,5],ymm5[6],ymm13[7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm5[2],ymm11[3,4,5],ymm5[6],ymm11[7]
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm5
; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7]
; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm14[1],ymm9[2,3,4],ymm14[5],ymm9[6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0],ymm10[1],ymm14[2,3,4],ymm10[5],ymm14[6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; AVX2-NEXT: vpblendvb %ymm12, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm14[2],ymm9[3,4],ymm14[5],ymm9[6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm10[2],ymm14[3,4],ymm10[5],ymm14[6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7,8,9,10],ymm2[11],ymm0[12,13,14,15]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21]
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7]
-; AVX2-NEXT: vmovdqa %ymm13, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7]
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11]
; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
-; AVX2-NEXT: vpblendvb %ymm12, %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm11[2],ymm8[3,4],ymm11[5],ymm8[6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7,8,9,10],ymm4[11],ymm0[12,13,14,15]
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2],ymm6[3],ymm15[4,5],ymm6[6],ymm15[7]
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3,4,5],xmm2[6],xmm4[7]
; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
-; AVX2-NEXT: vpblendvb %ymm12, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm11[2,3,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm11[2,3],ymm8[4,5],ymm11[6,7]
-; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm9[2,3,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7]
+; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm15[1],ymm6[2,3],ymm15[4],ymm6[5,6,7]
; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4,5],xmm4[6],xmm2[7]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23]
-; AVX2-NEXT: vpshufb %ymm15, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23]
+; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13]
; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
-; AVX2-NEXT: vpblendvb %ymm12, %ymm2, %ymm0, %ymm13
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7]
-; AVX2-NEXT: vmovdqa %ymm1, %ymm4
-; AVX2-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
-; AVX2-NEXT: vmovdqa %ymm3, %ymm1
+; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm13
+; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm11[1],ymm3[2,3],ymm11[4],ymm3[5,6,7]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7]
; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa %ymm9, %ymm3
-; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm14[2,3],ymm9[4,5],ymm14[6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm14[2,3,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm10[2,3],ymm14[4,5],ymm10[6,7]
; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm10[2,3,0,1]
+; AVX2-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill
; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7,8,9,10,11],ymm5[12],ymm2[13,14,15]
-; AVX2-NEXT: vpshufb %ymm15, %ymm2, %ymm2
-; AVX2-NEXT: vpblendvb %ymm12, %ymm0, %ymm2, %ymm10
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm7[1],ymm6[2,3,4],ymm7[5],ymm6[6,7]
-; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm12
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm15[1],ymm6[2,3,4],ymm15[5],ymm6[6,7]
+; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm11[3],ymm8[4,5],ymm11[6],ymm8[7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7]
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17]
-; AVX2-NEXT: vpshufb %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17]
+; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
-; AVX2-NEXT: vpblendvb %ymm12, %ymm0, %ymm2, %ymm9
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7]
+; AVX2-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm9
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm11[1],ymm3[2,3,4],ymm11[5],ymm3[6,7]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2],ymm10[3],ymm14[4,5],ymm10[6],ymm14[7]
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3]
-; AVX2-NEXT: vpshufb %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
-; AVX2-NEXT: vpblendvb %ymm12, %ymm0, %ymm2, %ymm8
-; AVX2-NEXT: vmovdqa 128(%rdi), %ymm12
-; AVX2-NEXT: vmovdqa 160(%rdi), %ymm15
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm12[3],ymm15[4,5],ymm12[6],ymm15[7]
+; AVX2-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm8
+; AVX2-NEXT: vmovdqa 128(%rdi), %ymm7
+; AVX2-NEXT: vmovdqa 160(%rdi), %ymm11
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm7[3],ymm11[4,5],ymm7[6],ymm11[7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,4,7]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
@@ -5713,7 +5707,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm15[2],ymm12[3,4,5],ymm15[6],ymm12[7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm11[2],ymm7[3,4,5],ymm11[6],ymm7[7]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
; AVX2-NEXT: vpshufb %xmm14, %xmm0, %xmm0
@@ -5725,7 +5719,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm15[3],ymm12[4,5],ymm15[6],ymm12[7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm11[3],ymm7[4,5],ymm11[6],ymm7[7]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
@@ -5747,64 +5741,76 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,3,4,5,4,7]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1,2,3,4,5,6,7],ymm10[8],ymm0[9,10,11,12,13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm0[1,2,3,4,5,6,7],ymm12[8],ymm0[9,10,11,12,13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm12[1],ymm15[2,3],ymm12[4],ymm15[5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6,7]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,1,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,1,2,0,4,5,6,4]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm5[0,1,1,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4,5,6,7],ymm9[8],ymm0[9,10,11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm3[0,1,1,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,0,4,5,6,4]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2,3,4,5,6,7],ymm8[8],ymm1[9,10,11,12,13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0],ymm12[1],ymm15[2,3,4],ymm12[5],ymm15[6,7]
-; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
-; AVX2-NEXT: vpshufb %xmm1, %xmm3, %xmm3
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,1,4,5,6,5]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm3 = mem[0,1],ymm7[2],mem[3,4],ymm7[5],mem[6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm2[1,2,3,4,5,6,7],ymm8[8],ymm2[9,10,11,12,13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm7[1],ymm11[2,3,4],ymm7[5],ymm11[6,7]
+; AVX2-NEXT: vmovdqa %ymm7, %ymm9
+; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
+; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,1,4,5,6,5]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX2-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm2 = mem[0,1],ymm15[2],mem[3,4],ymm15[5],mem[6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm5
; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm5 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm5 = mem[0,1],ymm14[2],mem[3,4,5],ymm14[6],mem[7]
+; AVX2-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm5 = ymm14[0,1],mem[2],ymm14[3,4,5],mem[6],ymm14[7]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
-; AVX2-NEXT: vpshufb %ymm7, %ymm5, %ymm8
-; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm2[1,2,3,4,5,6,7],ymm8[8],ymm2[9,10,11,12,13,14,15]
-; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6],ymm8[7,8,9,10,11,12,13],ymm5[14],ymm8[15]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload
+; AVX2-NEXT: vpshufb %ymm7, %ymm5, %ymm5
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1,2,3,4,5,6,7],ymm5[8],ymm1[9,10,11,12,13,14,15]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],mem[6],xmm5[7]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7]
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,1,4,5,6,5]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm10[2],ymm5[3,4,5],ymm10[6],ymm5[7]
+; AVX2-NEXT: vpshufb %ymm7, %ymm1, %ymm1
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm13[2],ymm8[3,4],ymm13[5],ymm8[6,7]
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
@@ -5812,25 +5818,12 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm10[2],ymm9[3,4,5],ymm10[6],ymm9[7]
-; AVX2-NEXT: vpshufb %ymm7, %ymm3, %ymm5
-; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6],ymm5[7,8,9,10,11,12,13],ymm3[14],ymm5[15]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm7
-; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2],xmm3[3],xmm7[4,5,6,7]
-; AVX2-NEXT: vpshufb %xmm1, %xmm3, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,1,4,5,6,5]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],mem[6],xmm1[7]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm10[3],ymm5[4,5],ymm10[6],ymm5[7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7]
@@ -5859,7 +5852,8 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm9[2],ymm11[3,4],ymm9[5],ymm11[6,7]
+; AVX2-NEXT: vmovdqa %ymm11, %ymm15
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm7
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm1[2],xmm7[3],xmm1[4],xmm7[5,6,7]
; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1
@@ -5872,14 +5866,14 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm12[5,6,7],ymm5[8,9,10,11,12],ymm12[13,14,15]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0,1,2],ymm14[3],ymm7[4,5],ymm14[6],ymm7[7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2],ymm7[3],ymm14[4,5],ymm7[6],ymm14[7]
; AVX2-NEXT: vmovdqa %ymm14, %ymm13
; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm12[2,3,0,1]
; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1,2,3,4,5,6],ymm14[7,8],ymm12[9,10,11,12,13,14],ymm14[15]
; AVX2-NEXT: vpshufb %ymm8, %ymm12, %ymm8
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm9[2,3],ymm2[4,5],ymm9[6,7]
; AVX2-NEXT: vextracti128 $1, %ymm12, %xmm14
; AVX2-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,3,4,5,6,7]
@@ -5889,7 +5883,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm5[1,2,3,4,5,6,7],ymm8[8],ymm5[9,10,11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm8 # 32-byte Folded Reload
+; AVX2-NEXT: vpblendd $18, (%rsp), %ymm10, %ymm8 # 32-byte Folded Reload
; AVX2-NEXT: # ymm8 = ymm10[0],mem[1],ymm10[2,3],mem[4],ymm10[5,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm8[2,3,0,1]
; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm10[1],ymm8[2,3,4,5,6,7,8],ymm10[9],ymm8[10,11,12,13,14,15]
@@ -5904,7 +5898,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7],ymm4[8,9,10,11,12],ymm0[13,14,15]
-; AVX2-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
; AVX2-NEXT: # ymm4 = mem[0,1,2],ymm4[3],mem[4,5],ymm4[6],mem[7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,6,4,6,7]
@@ -5928,11 +5922,11 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7],ymm3[8,9,10,11,12],ymm1[13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm7[1],ymm13[2,3],ymm7[4],ymm13[5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm13[1],ymm7[2,3],ymm13[4],ymm7[5,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6,7,8],ymm4[9],ymm3[10,11,12,13,14,15]
; AVX2-NEXT: vpshufb %ymm6, %ymm3, %ymm3
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2],ymm2[3],ymm9[4,5],ymm2[6],ymm9[7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2],ymm9[3],ymm2[4,5],ymm9[6],ymm2[7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,6,4,6,7]
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm4
; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
@@ -5975,10 +5969,10 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-LABEL: load_i16_stride7_vf32:
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: subq $552, %rsp # imm = 0x228
-; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm11
-; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm13
+; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm10
+; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm11
; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm5
-; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm12
+; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm15
; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm8
; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm7
; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm6
@@ -5992,30 +5986,30 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9]
; AVX2-FP-NEXT: vpshufb %xmm4, %xmm1, %xmm3
-; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,0,0,0]
-; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm3, %ymm2, %ymm1
-; AVX2-FP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqa %ymm5, %ymm1
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm5[2],ymm12[3,4,5],ymm5[6],ymm12[7]
+; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,0,0]
+; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm3, %ymm2, %ymm14
+; AVX2-FP-NEXT: vmovdqa %ymm5, %ymm3
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm5[2],ymm15[3,4,5],ymm5[6],ymm15[7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm5
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7]
; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm11[1],ymm13[2,3,4],ymm11[5],ymm13[6,7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
; AVX2-FP-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm2, %ymm0, %ymm0
+; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm11[2],ymm13[3,4],ymm11[5],ymm13[6,7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7,8,9,10],ymm2[11],ymm0[12,13,14,15]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21]
; AVX2-FP-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2],ymm1[3],ymm12[4,5],ymm1[6],ymm12[7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2],ymm3[3],ymm15[4,5],ymm3[6],ymm15[7]
+; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm13
; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11]
; AVX2-FP-NEXT: vpshufb %xmm5, %xmm4, %xmm4
-; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm4, %ymm0, %ymm0
+; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm6[2],ymm9[3,4],ymm6[5],ymm9[6,7]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
@@ -6025,7 +6019,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm4
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3,4,5],xmm2[6],xmm4[7]
; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm2
-; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm2, %ymm0, %ymm0
+; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm6[2,3,0,1]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7]
@@ -6035,30 +6029,29 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa %ymm7, %ymm3
+; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm4
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4,5],xmm4[6],xmm2[7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23]
; AVX2-FP-NEXT: vpshufb %ymm4, %ymm0, %ymm0
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13]
; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm2
-; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm2, %ymm0, %ymm0
+; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm12[1],ymm1[2,3],ymm12[4],ymm1[5,6,7]
-; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm15[1],ymm13[2,3],ymm15[4],ymm13[5,6,7]
+; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7]
; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm0
-; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm11[2,3],ymm13[4,5],ymm11[6,7]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm11[2,3,0,1]
-; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vmovdqu %ymm11, (%rsp) # 32-byte Spill
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm10[2,3,0,1]
+; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7,8,9,10,11],ymm5[12],ymm2[13,14,15]
; AVX2-FP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
-; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm0, %ymm2, %ymm7
+; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm7
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm8[1],ymm3[2,3,4],ymm8[5],ymm3[6,7]
-; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm15
-; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm6[3],ymm9[4,5],ymm6[6],ymm9[7]
@@ -6067,18 +6060,18 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15]
; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm0
-; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm0, %ymm2, %ymm10
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm12[1],ymm1[2,3,4],ymm12[5],ymm1[6,7]
+; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm12
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm15[1],ymm13[2,3,4],ymm15[5],ymm13[6,7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7]
; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm0
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm11[3],ymm13[4,5],ymm11[6],ymm13[7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3]
; AVX2-FP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
-; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm0, %ymm2, %ymm8
-; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm6
-; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm11
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm6[3],ymm11[4,5],ymm6[6],ymm11[7]
+; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm8
+; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm11
+; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm13
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm11[3],ymm13[4,5],ymm11[6],ymm13[7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,6,7,6,7]
; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
@@ -6090,12 +6083,11 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
; AVX2-FP-NEXT: vpshufb %ymm2, %ymm0, %ymm4
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7]
-; AVX2-FP-NEXT: vpblendd $31, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm9
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm6
; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm4
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm9[3],ymm4[4,5],ymm9[6],ymm4[7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm14
; AVX2-FP-NEXT: vpshufb %xmm3, %xmm14, %xmm3
; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7]
@@ -6103,28 +6095,28 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
; AVX2-FP-NEXT: vmovdqa 416(%rdi), %ymm1
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm1[0,1,0,2]
-; AVX2-FP-NEXT: vpshufb %ymm2, %ymm14, %ymm13
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm13[7]
+; AVX2-FP-NEXT: vpshufb %ymm2, %ymm14, %ymm15
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm15[7]
; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm2 = mem[0,1,2,3,4],ymm3[5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm4[2],ymm9[3,4,5],ymm4[6],ymm9[7]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm13
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm13[4],xmm3[5],xmm13[6],xmm3[7]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
-; AVX2-FP-NEXT: vpshufb %xmm13, %xmm3, %xmm3
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm4[2],ymm6[3,4,5],ymm4[6],ymm6[7]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm15
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm15[4],xmm3[5],xmm15[6],xmm3[7]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
+; AVX2-FP-NEXT: vpshufb %xmm15, %xmm3, %xmm3
; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29]
; AVX2-FP-NEXT: vpshufb %ymm2, %ymm14, %ymm14
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm14[7]
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0],ymm3[1,2,3,4,5,6,7],ymm12[8],ymm3[9,10,11,12,13,14,15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0],ymm3[1,2,3,4,5,6,7],ymm10[8],ymm3[9,10,11,12,13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm11[2],ymm6[3,4,5],ymm11[6],ymm6[7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm13[2],ymm11[3,4,5],ymm13[6],ymm11[7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm14
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm14[4],xmm3[5],xmm14[6],xmm3[7]
-; AVX2-FP-NEXT: vpshufb %xmm13, %xmm3, %xmm3
+; AVX2-FP-NEXT: vpshufb %xmm15, %xmm3, %xmm3
; AVX2-FP-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm2
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
@@ -6132,32 +6124,33 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm11[3],ymm6[4,5],ymm11[6],ymm6[7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm5[0,1,1,2]
-; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31]
-; AVX2-FP-NEXT: vpshufb %ymm13, %ymm3, %ymm3
+; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31]
+; AVX2-FP-NEXT: vpshufb %ymm14, %ymm3, %ymm3
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm4[3],ymm9[4,5],ymm4[6],ymm9[7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7]
; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,1,2]
-; AVX2-FP-NEXT: vpshufb %ymm13, %ymm2, %ymm2
+; AVX2-FP-NEXT: vpshufb %ymm14, %ymm2, %ymm2
; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1,2,3,4,5,6,7],ymm7[8],ymm0[9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm6[1],ymm11[2,3],ymm6[4],ymm11[5,6,7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm11[1],ymm13[2,3],ymm11[4],ymm13[5,6,7]
+; AVX2-FP-NEXT: vmovdqa %ymm11, %ymm14
; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15]
@@ -6167,10 +6160,10 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25]
; AVX2-FP-NEXT: vpshufb %ymm3, %ymm7, %ymm5
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1,2,3,4,5,6,7],ymm10[8],ymm0[9,10,11,12,13,14,15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm0[1,2,3,4,5,6,7],ymm12[8],ymm0[9,10,11,12,13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm9[1],ymm4[2,3],ymm9[4],ymm4[5,6,7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6,7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm5
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3,4,5],xmm5[6],xmm0[7]
; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm0
@@ -6181,120 +6174,118 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2,3,4,5,6,7],ymm8[8],ymm1[9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm2 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7]
-; AVX2-FP-NEXT: vmovd {{.*#+}} xmm14 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-FP-NEXT: vpshufb %xmm14, %xmm2, %xmm3
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm2 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7]
+; AVX2-FP-NEXT: vmovd {{.*#+}} xmm5 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm3
; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm3 = mem[0,1],ymm1[2],mem[3,4,5],ymm1[6],mem[7]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
-; AVX2-FP-NEXT: vpshufb %ymm5, %ymm3, %ymm8
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6],ymm8[7,8,9,10,11,12,13],ymm3[14],ymm8[15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0],ymm6[1],ymm11[2,3,4],ymm6[5],ymm11[6,7]
-; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm10
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0],xmm3[1],xmm10[2],xmm3[3],xmm10[4,5,6,7]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
-; AVX2-FP-NEXT: vpshufb %xmm10, %xmm3, %xmm3
-; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
-; AVX2-FP-NEXT: vpshufb %ymm13, %ymm7, %ymm7
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0],ymm3[1,2,3,4,5,6,7],ymm8[8],ymm3[9,10,11,12,13,14,15]
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm9[2],ymm11[3,4,5],ymm9[6],ymm11[7]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0,1,2,3,4,5],mem[6],xmm3[7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7]
+; AVX2-FP-NEXT: vmovdqa %ymm14, %ymm15
+; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm10
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2],xmm8[3],xmm10[4,5,6,7]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
+; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
+; AVX2-FP-NEXT: vpshufb %ymm10, %ymm7, %ymm7
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5,6],ymm7[7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm7[1,2,3,4,5,6,7],ymm3[8],ymm7[9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7]
-; AVX2-FP-NEXT: vpshufb %xmm14, %xmm2, %xmm3
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm12[2],ymm1[3,4],ymm12[5],ymm1[6,7]
+; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm3
; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm1[2],ymm8[3,4,5],ymm1[6],ymm8[7]
-; AVX2-FP-NEXT: vpshufb %ymm5, %ymm3, %ymm5
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6],ymm5[7,8,9,10,11,12,13],ymm3[14],ymm5[15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm9[1],ymm4[2,3,4],ymm9[5],ymm4[6,7]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm7
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2],xmm3[3],xmm7[4,5,6,7]
-; AVX2-FP-NEXT: vpshufb %xmm10, %xmm3, %xmm3
-; AVX2-FP-NEXT: vpshufb %ymm13, %ymm0, %ymm0
-; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15]
+; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm8[2],ymm14[3,4,5],ymm8[6],ymm14[7]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,5],mem[6],xmm3[7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm7
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2],xmm5[3],xmm7[4,5,6,7]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
+; AVX2-FP-NEXT: vpshufb %ymm10, %ymm0, %ymm0
+; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm1[3],ymm8[4,5],ymm1[6],ymm8[7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm8[3],ymm14[4,5],ymm8[6],ymm14[7]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6],ymm2[7,8],ymm0[9,10,11,12,13,14],ymm2[15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4],xmm3[5,6,7]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3]
-; AVX2-FP-NEXT: vpshufb %xmm7, %xmm0, %xmm0
-; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3]
+; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm0
+; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7
; AVX2-FP-NEXT: vmovdqa 432(%rdi), %xmm3
; AVX2-FP-NEXT: vmovdqa 416(%rdi), %xmm0
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,12,13]
-; AVX2-FP-NEXT: vpshufb %xmm13, %xmm10, %xmm10
-; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5,6,7],ymm8[8,9,10,11,12],ymm10[13,14,15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm14
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,12,13]
+; AVX2-FP-NEXT: vpshufb %xmm10, %xmm8, %xmm8
+; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm12[2,3],ymm1[4,5],ymm12[6,7]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm14
; AVX2-FP-NEXT: vmovd {{.*#+}} xmm1 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FP-NEXT: vpshufb %xmm1, %xmm14, %xmm14
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3]
+; AVX2-FP-NEXT: vmovd {{.*#+}} xmm12 = [10,11,6,7,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FP-NEXT: vpshufb %xmm12, %xmm8, %xmm8
+; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
; AVX2-FP-NEXT: vpshufb %ymm14, %ymm2, %ymm2
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0],ymm8[1,2,3,4,5,6,7],ymm2[8],ymm8[9,10,11,12,13,14,15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm2[0],ymm7[1,2,3,4,5,6,7],ymm2[8],ymm7[9,10,11,12,13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1],xmm2[2,3]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm6[2],ymm11[3,4],ymm6[5],ymm11[6,7]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm10
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3],xmm8[4],xmm10[5,6,7]
-; AVX2-FP-NEXT: vpshufb %xmm7, %xmm8, %xmm10
-; AVX2-FP-NEXT: vmovdqa 208(%rdi), %xmm15
-; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm12
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0,1,2,3,4,5],xmm15[6],xmm12[7]
-; AVX2-FP-NEXT: vpshufb %xmm13, %xmm5, %xmm5
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm8
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2],xmm8[3],xmm7[4],xmm8[5,6,7]
+; AVX2-FP-NEXT: vpshufb %xmm5, %xmm7, %xmm8
+; AVX2-FP-NEXT: vmovdqa 208(%rdi), %xmm5
+; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm7
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm15 = xmm7[0,1,2,3,4,5],xmm5[6],xmm7[7]
+; AVX2-FP-NEXT: vpshufb %xmm10, %xmm15, %xmm10
+; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1,2,3,4],ymm5[5,6,7],ymm10[8,9,10,11,12],ymm5[13,14,15]
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2],ymm7[3],ymm11[4,5],ymm7[6],ymm11[7]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm13[0],ymm10[1,2,3,4,5,6],ymm13[7,8],ymm10[9,10,11,12,13,14],ymm13[15]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5,6,7],ymm8[8,9,10,11,12],ymm10[13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2],ymm9[3],ymm11[4,5],ymm9[6],ymm11[7]
+; AVX2-FP-NEXT: vmovdqa %ymm11, %ymm9
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm10[2,3,0,1]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm15[0],ymm10[1,2,3,4,5,6],ymm15[7,8],ymm10[9,10,11,12,13,14],ymm15[15]
; AVX2-FP-NEXT: vpshufb %ymm14, %ymm10, %ymm10
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm8[2,3],ymm2[4,5],ymm8[6,7]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm13, %xmm14
-; AVX2-FP-NEXT: vpshufb %xmm1, %xmm14, %xmm14
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0],ymm5[1,2,3,4,5,6,7],ymm10[8],ymm5[9,10,11,12,13,14,15]
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm14, %xmm15
+; AVX2-FP-NEXT: vpshufb %xmm1, %xmm15, %xmm13
+; AVX2-FP-NEXT: vpshufb %xmm12, %xmm14, %xmm1
+; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm8[1,2,3,4,5,6,7],ymm10[8],ymm8[9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5,6,7]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm5[2,3,0,1]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm10[1],ymm5[2,3,4,5,6,7,8],ymm10[9],ymm5[10,11,12,13,14,15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm9[2,3],ymm4[4,5],ymm9[6,7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5,6,7]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm8[2,3,0,1]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm10[1],ymm8[2,3,4,5,6,7,8],ymm10[9],ymm8[10,11,12,13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm6
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4],xmm4[5],xmm6[6,7]
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5,6,7]
@@ -6309,33 +6300,34 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm4 = mem[0,1,2],ymm4[3],mem[4,5],ymm4[6],mem[7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm10
-; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
-; AVX2-FP-NEXT: vpshufb %xmm9, %xmm10, %xmm10
+; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
+; AVX2-FP-NEXT: vpshufb %xmm12, %xmm10, %xmm10
; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7]
; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
-; AVX2-FP-NEXT: vpshufb %ymm10, %ymm5, %ymm5
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
+; AVX2-FP-NEXT: vpshufb %ymm10, %ymm8, %ymm8
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm4 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4],xmm4[5],xmm5[6,7]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm8
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3],xmm8[4],xmm4[5],xmm8[6,7]
; AVX2-FP-NEXT: vpshufb %xmm3, %xmm4, %xmm3
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0],xmm12[1],xmm15[2,3,4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm7[1],xmm5[2,3,4,5,6,7]
; AVX2-FP-NEXT: vpshufb %xmm6, %xmm4, %xmm4
; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7],ymm3[8,9,10,11,12],ymm4[13,14,15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm11[1],ymm7[2,3],ymm11[4],ymm7[5,6,7]
+; AVX2-FP-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm4 = mem[0],ymm9[1],mem[2,3],ymm9[4],mem[5,6,7]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6,7,8],ymm5[9],ymm4[10,11,12,13,14,15]
; AVX2-FP-NEXT: vpshufb %ymm10, %ymm4, %ymm4
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2],ymm8[3],ymm2[4,5],ymm8[6],ymm2[7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2],ymm2[3],ymm11[4,5],ymm2[6],ymm11[7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX2-FP-NEXT: vpshufb %xmm9, %xmm6, %xmm6
+; AVX2-FP-NEXT: vpshufb %xmm12, %xmm6, %xmm6
; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,4,6,7]
; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15]
@@ -6343,7 +6335,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rsi)
-; AVX2-FP-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm2, (%rsi)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rdx)
@@ -11461,13 +11453,14 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX2-LABEL: load_i16_stride7_vf64:
; AVX2: # %bb.0:
-; AVX2-NEXT: subq $1448, %rsp # imm = 0x5A8
+; AVX2-NEXT: subq $1464, %rsp # imm = 0x5B8
; AVX2-NEXT: vmovdqa 256(%rdi), %ymm13
-; AVX2-NEXT: vmovdqa 224(%rdi), %ymm14
-; AVX2-NEXT: vmovdqa 512(%rdi), %ymm15
-; AVX2-NEXT: vmovdqa 544(%rdi), %ymm12
-; AVX2-NEXT: vmovdqa 480(%rdi), %ymm9
-; AVX2-NEXT: vmovdqa 448(%rdi), %ymm10
+; AVX2-NEXT: vmovdqa 224(%rdi), %ymm9
+; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqa 512(%rdi), %ymm8
+; AVX2-NEXT: vmovdqa 544(%rdi), %ymm7
+; AVX2-NEXT: vmovdqa 480(%rdi), %ymm11
+; AVX2-NEXT: vmovdqa 448(%rdi), %ymm12
; AVX2-NEXT: vmovdqa (%rdi), %ymm3
; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm4
@@ -11487,207 +11480,207 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm4
; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,0,0]
; AVX2-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa %ymm1, %ymm10
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1],ymm9[2],ymm10[3,4,5],ymm9[6],ymm10[7]
-; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7]
+; AVX2-NEXT: vmovdqu %ymm11, (%rsp) # 32-byte Spill
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7]
; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4
-; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0],ymm15[1],ymm12[2,3,4],ymm15[5],ymm12[6,7]
-; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm8[1],ymm7[2,3,4],ymm8[5],ymm7[6,7]
+; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqa %ymm7, %ymm0
+; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm5
-; AVX2-NEXT: vpblendvb %ymm1, %ymm4, %ymm5, %ymm0
-; AVX2-NEXT: vmovdqa %ymm1, %ymm8
-; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm13[2],ymm14[3,4,5],ymm13[6],ymm14[7]
-; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpblendvb %ymm1, %ymm4, %ymm5, %ymm1
+; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm13[2],ymm9[3,4,5],ymm13[6],ymm9[7]
; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7]
+; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6
+; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6],xmm4[7]
; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4
-; AVX2-NEXT: vmovdqa 288(%rdi), %ymm7
-; AVX2-NEXT: vmovdqa 320(%rdi), %ymm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm7[1],ymm1[2,3,4],ymm7[5],ymm1[6,7]
-; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqa 288(%rdi), %ymm9
+; AVX2-NEXT: vmovdqa 320(%rdi), %ymm5
+; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0],ymm9[1],ymm5[2,3,4],ymm9[5],ymm5[6,7]
+; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
+; AVX2-NEXT: vpshufb %ymm2, %ymm6, %ymm6
+; AVX2-NEXT: vpblendvb %ymm10, %ymm4, %ymm6, %ymm1
; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
-; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm5
-; AVX2-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm0
-; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqa 704(%rdi), %ymm0
-; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqa 672(%rdi), %ymm8
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm0[2],ymm8[3,4,5],ymm0[6],ymm8[7]
-; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7]
+; AVX2-NEXT: vmovdqa 704(%rdi), %ymm1
+; AVX2-NEXT: vmovdqa 672(%rdi), %ymm10
+; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1],ymm1[2],ymm10[3,4,5],ymm1[6],ymm10[7]
+; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6
+; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6],xmm4[7]
; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm3
-; AVX2-NEXT: vmovdqa 736(%rdi), %ymm11
-; AVX2-NEXT: vmovdqa 768(%rdi), %ymm0
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7]
+; AVX2-NEXT: vmovdqa 736(%rdi), %ymm15
+; AVX2-NEXT: vmovdqa 768(%rdi), %ymm14
+; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm2
-; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,0,0,0]
-; AVX2-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm7 = [65535,65535,65535,65535,65535,0,0,0]
+; AVX2-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7,8,9,10],ymm3[11],ymm2[12,13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7]
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21]
-; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm5
+; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm6
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11]
; AVX2-NEXT: vpshufb %xmm2, %xmm4, %xmm4
-; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
-; AVX2-NEXT: vmovdqa %ymm6, %ymm9
-; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm7[2],ymm1[3,4],ymm7[5],ymm1[6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7,8,9,10],ymm5[11],ymm4[12,13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7]
-; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7]
+; AVX2-NEXT: vpblendvb %ymm7, %ymm4, %ymm6, %ymm0
+; AVX2-NEXT: vmovdqa %ymm7, %ymm8
+; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm9[2],ymm5[3,4],ymm9[5],ymm5[6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3],ymm4[4,5,6,7,8,9,10],ymm6[11],ymm4[12,13,14,15]
+; AVX2-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm6 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm6 = mem[0,1,2],ymm13[3],mem[4,5],ymm13[6],mem[7]
+; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7
+; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3,4,5],xmm6[6],xmm7[7]
; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm4
-; AVX2-NEXT: vpshufb %xmm2, %xmm5, %xmm5
-; AVX2-NEXT: vpblendvb %ymm9, %ymm5, %ymm4, %ymm4
-; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqa %ymm11, %ymm15
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7]
-; AVX2-NEXT: vmovdqa %ymm0, %ymm14
+; AVX2-NEXT: vpshufb %xmm2, %xmm6, %xmm6
+; AVX2-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7,8,9,10],ymm5[11],ymm4[12,13,14,15]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2],ymm12[3],ymm8[4,5],ymm12[6],ymm8[7]
-; AVX2-NEXT: vmovdqa %ymm8, %ymm13
-; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3],ymm4[4,5,6,7,8,9,10],ymm6[11],ymm4[12,13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2],ymm1[3],ymm10[4,5],ymm1[6],ymm10[7]
+; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqa %ymm10, %ymm5
+; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7
+; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3,4,5],xmm6[6],xmm7[7]
; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm4
-; AVX2-NEXT: vpshufb %xmm2, %xmm5, %xmm5
-; AVX2-NEXT: vmovdqa %ymm9, %ymm7
-; AVX2-NEXT: vpblendvb %ymm9, %ymm5, %ymm4, %ymm0
+; AVX2-NEXT: vpshufb %xmm2, %xmm6, %xmm6
+; AVX2-NEXT: vmovdqa %ymm8, %ymm13
+; AVX2-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm1[2],ymm9[3,4],ymm1[5],ymm9[6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7,8,9,10],ymm5[11],ymm4[12,13,14,15]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3],ymm4[4,5,6,7,8,9,10],ymm6[11],ymm4[12,13,14,15]
; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7]
-; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7]
+; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6
+; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3,4,5],xmm4[6],xmm6[7]
; AVX2-NEXT: vpshufb %xmm2, %xmm4, %xmm2
-; AVX2-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm0
+; AVX2-NEXT: vpblendvb %ymm13, %ymm2, %ymm3, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm1[2,3],ymm9[4,5],ymm1[6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm8[2,3,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm8[2,3],ymm11[4,5],ymm8[6,7]
; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4],ymm3[5,6,7,8,9,10,11],ymm2[12],ymm3[13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6,7]
+; AVX2-NEXT: vmovdqa %ymm9, %ymm11
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1],xmm2[2,3,4,5],xmm4[6],xmm2[7]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23]
-; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm5
+; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm6
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13]
; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4
-; AVX2-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm0
+; AVX2-NEXT: vpblendvb %ymm13, %ymm4, %ymm6, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7]
+; AVX2-NEXT: vmovdqa %ymm12, %ymm0
+; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload
+; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm12[1],ymm8[2,3],ymm12[4],ymm8[5,6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6
+; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3,4,5],xmm6[6],xmm4[7]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm9[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7,8,9,10,11],ymm6[12],ymm5[13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm12[2,3],ymm9[4,5],ymm12[6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm12[2,3,0,1]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4],ymm6[5,6,7,8,9,10,11],ymm7[12],ymm6[13,14,15]
; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4
-; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm5
-; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0]
-; AVX2-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4
+; AVX2-NEXT: vpshufb %ymm2, %ymm6, %ymm6
+; AVX2-NEXT: vpblendvb %ymm13, %ymm4, %ymm6, %ymm4
; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5,6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6
+; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3,4,5],xmm6[6],xmm4[7]
+; AVX2-NEXT: vmovdqa %ymm14, %ymm12
+; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm15[2,3,0,1]
+; AVX2-NEXT: vmovdqa %ymm15, %ymm5
; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm15[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7,8,9,10,11],ymm6[12],ymm5[13,14,15]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4],ymm6[5,6,7,8,9,10,11],ymm7[12],ymm6[13,14,15]
; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4
-; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm5
-; AVX2-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4
-; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpshufb %ymm2, %ymm6, %ymm6
+; AVX2-NEXT: vmovdqa %ymm13, %ymm7
+; AVX2-NEXT: vpblendvb %ymm13, %ymm4, %ymm6, %ymm1
+; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7]
+; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6
+; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3,4,5],xmm6[6],xmm4[7]
; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm3
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm15[2,3],ymm6[4,5],ymm15[6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm15[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7,8,9,10,11],ymm5[12],ymm4[13,14,15]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm15[2,3],ymm1[4,5],ymm15[6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm15[2,3,0,1]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4],ymm4[5,6,7,8,9,10,11],ymm6[12],ymm4[13,14,15]
; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm2
-; AVX2-NEXT: vmovdqa %ymm0, %ymm5
-; AVX2-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7]
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm2 = ymm0[0,1,2],mem[3],ymm0[4,5],mem[6],ymm0[7]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm2 = mem[0,1,2],ymm2[3],mem[4,5],ymm2[6],mem[7]
; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm2[1,3,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17]
; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm4
; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7]
-; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpblendvb %ymm7, %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm1[1],ymm7[2,3,4],ymm1[5],ymm7[6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0],ymm0[1],ymm8[2,3,4],ymm0[5],ymm8[6,7]
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7]
+; AVX2-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm4 = ymm9[0,1,2],mem[3],ymm9[4,5],mem[6],ymm9[7]
; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm4
-; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm0
+; AVX2-NEXT: vpblendvb %ymm7, %ymm3, %ymm4, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7]
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm15[3],ymm6[4,5],ymm15[6],ymm6[7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2],ymm15[3],ymm1[4,5],ymm15[6],ymm1[7]
; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm4
-; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm0
+; AVX2-NEXT: vpblendvb %ymm7, %ymm3, %ymm4, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm3 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm3 = mem[0],ymm0[1],mem[2,3,4],ymm0[5],mem[6,7]
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm4 = mem[0,1,2],ymm0[3],mem[4,5],ymm0[6],mem[7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7]
; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm2
; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7]
-; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm0
+; AVX2-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqa 128(%rdi), %ymm12
-; AVX2-NEXT: vmovdqa 160(%rdi), %ymm10
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7]
+; AVX2-NEXT: vmovdqa 128(%rdi), %ymm11
+; AVX2-NEXT: vmovdqa 160(%rdi), %ymm9
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,6,4,7]
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
@@ -11696,16 +11689,16 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vmovdqa 192(%rdi), %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm0[0,1,0,2]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,1,2,1,4,5,6,5]
+; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm0[0,1,0,2]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[0,1,2,1,4,5,6,5]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4],ymm1[5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqa 576(%rdi), %ymm13
-; AVX2-NEXT: vmovdqa 608(%rdi), %ymm11
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7]
+; AVX2-NEXT: vmovdqa 576(%rdi), %ymm14
+; AVX2-NEXT: vmovdqa 608(%rdi), %ymm13
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,4,7]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
@@ -11714,67 +11707,53 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa 640(%rdi), %ymm1
; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,1,0,2]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,1,2,1,4,5,6,5]
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,0,2]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,1,2,1,4,5,6,5]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-NEXT: vpblendd $31, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqa 352(%rdi), %ymm5
-; AVX2-NEXT: vmovdqa 384(%rdi), %ymm6
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7]
-; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqa 352(%rdi), %ymm8
+; AVX2-NEXT: vmovdqa 384(%rdi), %ymm12
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm8[3],ymm12[4,5],ymm8[6],ymm12[7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,4,7]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa 416(%rdi), %ymm14
-; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm14[0,1,0,2]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[0,1,2,1,4,5,6,5]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm9[7]
-; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm5
+; AVX2-NEXT: vmovdqa 416(%rdi), %ymm7
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm7[0,1,0,2]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,1,4,5,6,5]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm10[7]
+; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4],ymm5[5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqa 800(%rdi), %ymm7
-; AVX2-NEXT: vmovdqa 832(%rdi), %ymm9
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,6,4,7]
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX2-NEXT: vmovdqa 800(%rdi), %ymm4
+; AVX2-NEXT: vmovdqa 832(%rdi), %ymm10
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2],ymm4[3],ymm10[4,5],ymm4[6],ymm10[7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,6,4,7]
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm5
+; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,7]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa 864(%rdi), %ymm2
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm2[0,1,0,2]
+; AVX2-NEXT: vmovdqa 864(%rdi), %ymm5
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm5[0,1,0,2]
; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[0,1,2,1,4,5,6,5]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm15[7]
; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm11[2],ymm13[3,4,5],ymm11[6],ymm13[7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm13[2],ymm14[3,4,5],ymm13[6],ymm14[7]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm15
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm15[4],xmm0[5],xmm15[6],xmm0[7]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
; AVX2-NEXT: vpshufb %xmm15, %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,5,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7]
-; AVX2-NEXT: vpshufb %xmm15, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,5,7]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
@@ -11782,11 +11761,23 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm9[2],ymm7[3,4,5],ymm9[6],ymm7[7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm12[2],ymm8[3,4,5],ymm12[6],ymm8[7]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6],xmm0[7]
; AVX2-NEXT: vpshufb %xmm15, %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,5,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm10[2],ymm4[3,4,5],ymm10[6],ymm4[7]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
+; AVX2-NEXT: vpshufb %xmm15, %xmm0, %xmm0
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
@@ -11794,73 +11785,68 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm10[2],ymm12[3,4,5],ymm10[6],ymm12[7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm9[2],ymm11[3,4,5],ymm9[6],ymm11[7]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
; AVX2-NEXT: vpshufb %xmm15, %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[0,1,1,3,4,5,5,7]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[0,1,1,3,4,5,5,7]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm9[3],ymm11[4,5],ymm9[6],ymm11[7]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm8[0,1,1,2]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,0,3,4,5,4,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm3[0,1,1,2]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,3,4,5,4,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqa %ymm11, %ymm6
-; AVX2-NEXT: vmovdqa %ymm13, %ymm4
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2],ymm11[3],ymm13[4,5],ymm11[6],ymm13[7]
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7]
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7]
; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm13[0,1,1,2]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,0,3,4,5,4,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm15[0,1,1,2]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,3,4,5,4,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm9[3],ymm7[4,5],ymm9[6],ymm7[7]
-; AVX2-NEXT: vmovdqa %ymm7, %ymm5
-; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqu %ymm9, (%rsp) # 32-byte Spill
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm10[3],ymm4[4,5],ymm10[6],ymm4[7]
+; AVX2-NEXT: vmovdqa %ymm4, %ymm6
+; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7]
; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,1,2]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,0,3,4,5,4,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,1,2]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,3,4,5,4,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm11[3],ymm7[4,5],ymm11[6],ymm7[7]
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm12[3],ymm8[4,5],ymm12[6],ymm8[7]
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7]
; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm14[0,1,1,2]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm7[0,1,1,2]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,3,4,5,4,7]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
@@ -11868,105 +11854,104 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm12[1],ymm10[2,3],ymm12[4],ymm10[5,6,7]
-; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm11[1],ymm9[2,3],ymm11[4],ymm9[5,6,7]
+; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm8[0,1,1,3]
-; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,1,2,0,4,5,6,4]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7]
-; AVX2-NEXT: vmovdqa %ymm4, %ymm8
-; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm13[0,1,1,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[0,1,2,0,4,5,6,4]
+; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm15[0,1,1,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[0,1,2,0,4,5,6,4]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm8[1],ymm12[2,3],ymm8[4],ymm12[5,6,7]
+; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm14[0,1,1,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,2,0,4,5,6,4]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm7[0,1,1,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,1,2,0,4,5,6,4]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm5[1],ymm9[2,3],ymm5[4],ymm9[5,6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3,4,5],xmm3[6],xmm0[7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm6[1],ymm10[2,3],ymm6[4],ymm10[5,6,7]
+; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,1,1,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,1,2,0,4,5,6,4]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm5[0,1,1,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,1,2,0,4,5,6,4]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1,2,3,4,5,6,7],ymm5[8],ymm1[9,10,11,12,13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
+; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm1[1],xmm5[2],xmm1[3],xmm5[4,5,6,7]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
+; AVX2-NEXT: vpshufb %xmm1, %xmm5, %xmm5
+; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,2,1,4,5,6,5]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm4[7]
+; AVX2-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload
+; AVX2-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm4 = ymm4[0,1],mem[2],ymm4[3,4],mem[5],ymm4[6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm13[2],ymm14[3,4,5],ymm13[6],ymm14[7]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
-; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm5
-; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7,8,9,10,11,12,13],ymm4[14],ymm5[15]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1],xmm4[2,3]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm8[1],ymm6[2,3,4],ymm8[5],ymm6[6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm6
-; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm3[1],xmm6[2],xmm3[3],xmm6[4,5,6,7]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
-; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm6
-; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[0,1,2,1,4,5,6,5]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3,4,5,6,7],ymm5[8],ymm6[9,10,11,12,13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7]
-; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4
-; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-NEXT: vpshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm5 = mem[0,1,2,1,4,5,6,5]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1],ymm15[2],ymm14[3,4,5],ymm15[6],ymm14[7]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
+; AVX2-NEXT: vpshufb %ymm4, %ymm7, %ymm7
+; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1,2,3,4,5,6,7],ymm7[8],ymm5[9,10,11,12,13,14,15]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],mem[6],xmm7[7]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm11[1],ymm9[2,3,4],ymm11[5],ymm9[6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7]
+; AVX2-NEXT: vpshufb %xmm1, %xmm5, %xmm5
+; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,2,1,4,5,6,5]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
; AVX2-NEXT: # ymm5 = mem[0,1],ymm5[2],mem[3,4],ymm5[5],mem[6,7]
@@ -11978,88 +11963,83 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX2-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
; AVX2-NEXT: # ymm6 = mem[0,1],ymm6[2],mem[3,4,5],ymm6[6],mem[7]
-; AVX2-NEXT: vpshufb %ymm2, %ymm6, %ymm7
-; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm4[1,2,3,4,5,6,7],ymm7[8],ymm4[9,10,11,12,13,14,15]
-; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7,8,9,10,11,12,13],ymm6[14],ymm7[15]
+; AVX2-NEXT: vpshufb %ymm4, %ymm6, %ymm6
+; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm3[1,2,3,4,5,6,7],ymm6[8],ymm3[9,10,11,12,13,14,15]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],mem[6],xmm6[7]
; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm4 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1],ymm10[2],ymm12[3,4,5],ymm10[6],ymm12[7]
-; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm6
-; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6],ymm6[7,8,9,10,11,12,13],ymm5[14],ymm6[15]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT: vmovdqa %ymm11, %ymm8
-; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm9[1],ymm11[2,3,4],ymm9[5],ymm11[6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm7
-; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2],xmm5[3],xmm7[4,5,6,7]
-; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm5
-; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,1,4,5,6,5]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm1[1,2,3,4,5,6,7],ymm6[8],ymm1[9,10,11,12,13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm4 = mem[0,1],ymm11[2],mem[3,4,5],ymm11[6],mem[7]
-; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm2
-; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5],ymm4[6],ymm2[7,8,9,10,11,12,13],ymm4[14],ymm2[15]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-NEXT: vpblendd $221, (%rsp), %ymm7, %ymm4 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm4 = mem[0],ymm7[1],mem[2,3,4],ymm7[5],mem[6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7]
-; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm8[1],ymm12[2,3,4],ymm8[5],ymm12[6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5
+; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2],xmm3[3],xmm5[4,5,6,7]
+; AVX2-NEXT: vpshufb %xmm1, %xmm3, %xmm3
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,1,4,5,6,5]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm6[2],ymm13[3,4,5],ymm6[6],ymm13[7]
+; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm5
+; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3,4,5,6,7],ymm5[8],ymm2[9,10,11,12,13,14,15]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],mem[6],xmm5[7]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm2 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7]
+; AVX2-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,1,4,5,6,5]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm7[2],ymm11[3,4,5],ymm7[6],ymm11[7]
+; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],mem[6],xmm1[7]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
+; AVX2-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7]
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4],xmm2[5,6,7]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3]
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT: vmovdqa 656(%rdi), %xmm13
-; AVX2-NEXT: vmovdqa 640(%rdi), %xmm14
-; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1,2,3,4,5],xmm13[6],xmm14[7]
+; AVX2-NEXT: vmovdqa 656(%rdi), %xmm2
+; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vmovdqa 640(%rdi), %xmm15
+; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm15[0,1,2,3,4,5],xmm2[6],xmm15[7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload
; AVX2-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7]
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
@@ -12074,96 +12054,95 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm6[3],ymm13[4,5],ymm6[6],ymm13[7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2],xmm4[3],xmm1[4],xmm4[5,6,7]
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT: vmovdqa 432(%rdi), %xmm4
-; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovdqa 416(%rdi), %xmm5
+; AVX2-NEXT: vmovdqa 432(%rdi), %xmm5
; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm4[6],xmm5[7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,6]
-; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm5[5,6,7],ymm1[8,9,10,11,12],ymm5[13,14,15]
-; AVX2-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm5 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7]
+; AVX2-NEXT: vmovdqa 416(%rdi), %xmm4
+; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm5[6],xmm4[7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,7,6]
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7],ymm1[8,9,10,11,12],ymm4[13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm7[3],ymm11[4,5],ymm7[6],ymm11[7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15]
-; AVX2-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm11[2],ymm7[3,4],ymm11[5],ymm7[6,7]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7]
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm5
; AVX2-NEXT: vmovdqa 880(%rdi), %xmm1
; AVX2-NEXT: vmovdqa 864(%rdi), %xmm0
-; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,7,6]
-; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7],ymm6[8,9,10,11,12],ymm7[13,14,15]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,7,6]
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1],ymm12[2,3],ymm10[4,5],ymm12[6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm8
+; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3]
+; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm4
+; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0],ymm5[1,2,3,4,5,6,7],ymm4[8],ymm5[9,10,11,12,13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm7 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm15
-; AVX2-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,2,2,3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3]
-; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm5
-; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0],ymm6[1,2,3,4,5,6,7],ymm5[8],ymm6[9,10,11,12,13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm5 = ymm5[0,1],mem[2],ymm5[3,4],mem[5],ymm5[6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3],xmm5[4],xmm6[5,6,7]
-; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm3
+; AVX2-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm4 = ymm4[0,1],mem[2],ymm4[3,4],mem[5],ymm4[6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
+; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3],xmm4[4],xmm5[5,6,7]
+; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm3
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm5
-; AVX2-NEXT: vmovdqa 208(%rdi), %xmm15
+; AVX2-NEXT: vmovdqa 208(%rdi), %xmm4
; AVX2-NEXT: vmovdqa 192(%rdi), %xmm3
-; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5],xmm15[6],xmm3[7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5],xmm4[6],xmm3[7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,7,6]
; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5],mem[6],ymm6[7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7,8],ymm6[9,10,11,12,13,14],ymm7[15]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm6 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm6 = mem[0,1,2],ymm13[3],mem[4,5],ymm13[6],mem[7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0],ymm6[1,2,3,4,5,6],ymm8[7,8],ymm6[9,10,11,12,13,14],ymm8[15]
; AVX2-NEXT: vpshufb %ymm2, %ymm6, %ymm2
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm8
+; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3]
; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm5[1,2,3,4,5,6,7],ymm2[8],ymm5[9,10,11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
@@ -12172,39 +12151,40 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0],ymm5[1],ymm2[2,3,4,5,6,7,8],ymm5[9],ymm2[10,11,12,13,14,15]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm5
-; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3],xmm5[4],xmm2[5],xmm5[6,7]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0],xmm14[1],xmm13[2,3,4,5,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4,5,6,7,8],ymm5[9],ymm2[10,11,12,13,14,15]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm5 = ymm5[0,1],mem[2,3],ymm5[4,5],mem[6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm5[3],xmm6[4],xmm5[5],xmm6[6,7]
+; AVX2-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm8 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm8 = mem[0],xmm15[1],mem[2,3,4,5,6,7]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5]
-; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7]
-; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm2[0,1,2,3,4],ymm7[5,6,7],ymm2[8,9,10,11,12],ymm7[13,14,15]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5],mem[6],ymm2[7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm2[0,1,2,3,6,4,6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
-; AVX2-NEXT: vpshufb %ymm2, %ymm6, %ymm6
-; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0],ymm7[1,2,3,4,5,6,7],ymm6[8],ymm7[9,10,11,12,13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm7 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm7 = mem[0],ymm9[1],mem[2,3],ymm9[4],mem[5,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm7[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2,3,4,5,6,7,8],ymm10[9],ymm7[10,11,12,13,14,15]
-; AVX2-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm9 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm9 = ymm8[0,1],mem[2,3],ymm8[4,5],mem[6,7]
+; AVX2-NEXT: vpshufb %xmm5, %xmm6, %xmm6
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,3]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7]
+; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0,1,2,3,4],ymm8[5,6,7],ymm6[8,9,10,11,12],ymm8[13,14,15]
+; AVX2-NEXT: vmovdqu (%rsp), %ymm6 # 32-byte Reload
+; AVX2-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5],mem[6],ymm6[7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm6[0,1,2,3,6,4,6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm6
+; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
+; AVX2-NEXT: vpshufb %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0],ymm8[1,2,3,4,5,6,7],ymm2[8],ymm8[9,10,11,12,13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm8 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4,5,6,7,8],ymm9[9],ymm8[10,11,12,13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm11[2,3],ymm7[4,5],ymm11[6,7]
; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm10
; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3],xmm10[4],xmm9[5],xmm10[6,7]
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
@@ -12214,130 +12194,130 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
-; AVX2-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm1 = ymm4[0,1,2],mem[3],ymm4[4,5],mem[6],ymm4[7]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm1 = mem[0,1,2],ymm1[3],mem[4,5],ymm1[6],mem[7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,6,4,6,7]
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7]
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7]
-; AVX2-NEXT: vpshufb %ymm2, %ymm7, %ymm7
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1,2,3,4,5,6,7],ymm7[8],ymm0[9,10,11,12,13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3]
+; AVX2-NEXT: vpshufb %ymm6, %ymm8, %ymm8
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm1[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3,4,5,6,7,8],ymm7[9],ymm1[10,11,12,13,14,15]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm7 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm9
-; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3],xmm9[4],xmm7[5],xmm9[6,7]
-; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
-; AVX2-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3,4,5,6,7]
-; AVX2-NEXT: vpshufb %xmm5, %xmm7, %xmm7
+; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3,4,5,6,7,8],ymm8[9],ymm1[10,11,12,13,14,15]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm8 = mem[0,1],ymm7[2,3],mem[4,5],ymm7[6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm9
+; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7]
+; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX2-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm7 = xmm7[0],mem[1],xmm7[2,3,4,5,6,7]
+; AVX2-NEXT: vpshufb %xmm5, %xmm8, %xmm8
+; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7]
; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
-; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5,6,7],ymm7[8,9,10,11,12],ymm4[13,14,15]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7],ymm8[8,9,10,11,12],ymm7[13,14,15]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm8 = mem[0,1,2],ymm8[3],mem[4,5],ymm8[6],mem[7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm8[0,1,2,3,6,4,6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm8
+; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,1]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
+; AVX2-NEXT: vpshufb %ymm6, %ymm1, %ymm1
+; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm7[1,2,3,4,5,6,7],ymm1[8],ymm7[9,10,11,12,13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm7 = mem[0,1,2],ymm7[3],mem[4,5],ymm7[6],mem[7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,6,4,6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm7
-; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,7,6,7]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
-; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm4[1,2,3,4,5,6,7],ymm1[8],ymm4[9,10,11,12,13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm4 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm7
-; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3],xmm7[4],xmm4[5],xmm7[6,7]
-; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
-; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0],xmm3[1],xmm15[2,3,4,5,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX2-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm7 = ymm7[0,1],mem[2,3],ymm7[4,5],mem[6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm8
+; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3],xmm8[4],xmm7[5],xmm8[6,7]
+; AVX2-NEXT: vpshufb %xmm5, %xmm7, %xmm5
+; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5,6,7]
+; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm4
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7],ymm4[8,9,10,11,12],ymm3[13,14,15]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm4 = mem[0],ymm4[1],mem[2,3],ymm4[4],mem[5,6,7]
+; AVX2-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm4 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm4 = ymm13[0],mem[1],ymm13[2,3],mem[4],ymm13[5,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6,7,8],ymm5[9],ymm4[10,11,12,13,14,15]
-; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm2
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,4,6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm4
-; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm3, 96(%rsi)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm3, 32(%rsi)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm3, 64(%rsi)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm3, (%rsi)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm3, 96(%rdx)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm3, 32(%rdx)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm3, 64(%rdx)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm3, (%rdx)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm3, 32(%rcx)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm3, 96(%rcx)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm3, 64(%rcx)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm3, (%rcx)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm3, 96(%r8)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm3, 32(%r8)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm3, 64(%r8)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm3, (%r8)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm3, 96(%r9)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm3, 32(%r9)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm3, (%r9)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm3, 64(%r9)
+; AVX2-NEXT: vpshufb %ymm6, %ymm4, %ymm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,6,4,6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm5
+; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm4, 96(%rsi)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm4, 32(%rsi)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm4, 64(%rsi)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm4, (%rsi)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm4, 96(%rdx)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm4, 32(%rdx)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm4, 64(%rdx)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm4, (%rdx)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm4, 32(%rcx)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm4, 96(%rcx)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm4, 64(%rcx)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm4, (%rcx)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm4, 96(%r8)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm4, 32(%r8)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm4, 64(%r8)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm4, (%r8)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm4, 96(%r9)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm4, 32(%r9)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm4, (%r9)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm4, 64(%r9)
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm3, 96(%rax)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm3, 32(%rax)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm3, 64(%rax)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm3, (%rax)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm4, 96(%rax)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm4, 32(%rax)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm4, 64(%rax)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm4, (%rax)
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: vmovdqa %ymm1, 32(%rax)
-; AVX2-NEXT: vmovdqa %ymm2, (%rax)
+; AVX2-NEXT: vmovdqa %ymm3, (%rax)
; AVX2-NEXT: vmovdqa %ymm0, 96(%rax)
-; AVX2-NEXT: vmovdqa %ymm6, 64(%rax)
-; AVX2-NEXT: addq $1448, %rsp # imm = 0x5A8
+; AVX2-NEXT: vmovdqa %ymm2, 64(%rax)
+; AVX2-NEXT: addq $1464, %rsp # imm = 0x5B8
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -12806,10 +12786,9 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm6 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm6 = mem[0,1],ymm13[2],mem[3,4,5],ymm13[6],mem[7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
-; AVX2-FP-NEXT: vpshufb %ymm2, %ymm6, %ymm7
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,0,1]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7,8,9,10,11,12,13],ymm6[14],ymm7[15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0,1],xmm6[2,3]
+; AVX2-FP-NEXT: vpshufb %ymm2, %ymm6, %ymm6
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5],mem[6],xmm6[7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm4[0,1],xmm7[2,3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm8[1],ymm11[2,3,4],ymm8[5],ymm11[6,7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm8
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2],xmm4[3],xmm8[4,5,6,7]
@@ -12819,8 +12798,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
; AVX2-FP-NEXT: vpshufb %ymm9, %ymm5, %ymm5
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5,6],ymm5[7]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1,2,3,4,5,6,7],ymm7[8],ymm5[9,10,11,12,13,14,15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6,7],ymm6[8],ymm5[9,10,11,12,13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
@@ -12832,20 +12811,19 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm6 # 32-byte Reload
; AVX2-FP-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm6 = ymm6[0,1],mem[2],ymm6[3,4,5],mem[6],ymm6[7]
-; AVX2-FP-NEXT: vpshufb %ymm2, %ymm6, %ymm7
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,0,1]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7,8,9,10,11,12,13],ymm6[14],ymm7[15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0],ymm14[1],ymm10[2,3,4],ymm14[5],ymm10[6,7]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm8
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2],xmm6[3],xmm8[4,5,6,7]
-; AVX2-FP-NEXT: vpshufb %xmm4, %xmm6, %xmm6
-; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX2-FP-NEXT: vpshufb %ymm2, %ymm6, %ymm6
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5],mem[6],xmm6[7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0],ymm14[1],ymm10[2,3,4],ymm14[5],ymm10[6,7]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm8
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5,6,7]
+; AVX2-FP-NEXT: vpshufb %xmm4, %xmm7, %xmm7
+; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
; AVX2-FP-NEXT: vmovdqa %ymm9, %ymm8
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1,2,3,4,5,6,7],ymm7[8],ymm3[9,10,11,12,13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5,6],ymm3[7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm3[1,2,3,4,5,6,7],ymm6[8],ymm3[9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
@@ -12859,20 +12837,19 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7]
-; AVX2-FP-NEXT: vpshufb %ymm2, %ymm5, %ymm6
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6],ymm6[7,8,9,10,11,12,13],ymm5[14],ymm6[15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3]
+; AVX2-FP-NEXT: vpshufb %ymm2, %ymm5, %ymm5
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,5],mem[6],xmm5[7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm7
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2],xmm5[3],xmm7[4,5,6,7]
-; AVX2-FP-NEXT: vpshufb %xmm4, %xmm5, %xmm5
-; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm7
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7]
+; AVX2-FP-NEXT: vpshufb %xmm4, %xmm6, %xmm6
+; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
; AVX2-FP-NEXT: vpshufb %ymm8, %ymm1, %ymm1
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm1[1,2,3,4,5,6,7],ymm6[8],ymm1[9,10,11,12,13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1,2,3,4,5,6,7],ymm5[8],ymm1[9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
@@ -12886,8 +12863,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm3 = ymm14[0,1],mem[2],ymm14[3,4,5],mem[6],ymm14[7]
; AVX2-FP-NEXT: vpshufb %ymm2, %ymm3, %ymm2
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7,8,9,10,11,12,13],ymm3[14],ymm2[15]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5],mem[6],xmm2[7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
>From d1deed9f729099c4d938c77fc4463b2e3c6d8453 Mon Sep 17 00:00:00 2001
From: Bjorn Pettersson <bjorn.a.pettersson at ericsson.com>
Date: Thu, 26 Jun 2025 12:13:28 +0200
Subject: [PATCH 4/4] [SelectionDAG] Add DoNotPoisonEltMask to
SimplifyMultipleUseDemandedBitsForTargetNode
---
llvm/include/llvm/CodeGen/TargetLowering.h | 1 +
.../CodeGen/SelectionDAG/TargetLowering.cpp | 6 +++--
llvm/lib/Target/X86/X86ISelLowering.cpp | 22 +++++++++++--------
llvm/lib/Target/X86/X86ISelLowering.h | 1 +
4 files changed, 19 insertions(+), 11 deletions(-)
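(Editorial note, not part of the patch: for downstream targets that override this hook, the update implied by this change is mechanical. A minimal sketch, assuming a hypothetical MyTargetLowering subclass that does not exist in this series:)

  // Minimal sketch only: a hypothetical out-of-tree override adapting to the
  // added DoNotPoisonEltMask parameter. "MyTargetLowering" is an assumed name.
  SDValue MyTargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
      SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
      const APInt &DoNotPoisonEltMask, SelectionDAG &DAG,
      unsigned Depth) const {
    // DoNotPoisonEltMask marks elements that are not demanded but must not be
    // made more poisonous by any replacement value returned here.
    // With no target-specific folds, forward everything to the base class,
    // as the X86 override does at the end of its implementation.
    return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
        Op, DemandedBits, DemandedElts, DoNotPoisonEltMask, DAG, Depth);
  }
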
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 043cdec038f4b..d6a24e58847f5 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -4388,6 +4388,7 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase {
/// bitwise ops etc.
virtual SDValue SimplifyMultipleUseDemandedBitsForTargetNode(
SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
+ const APInt &DoNotPoisonEltMask,
SelectionDAG &DAG, unsigned Depth) const;
/// Return true if this function can prove that \p Op is never poison
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index b787d13686c58..b51181feaa04d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1000,7 +1000,7 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
if (Op.getOpcode() >= ISD::BUILTIN_OP_END)
if (SDValue V = SimplifyMultipleUseDemandedBitsForTargetNode(
- Op, DemandedBits, DemandedElts | DoNotPoisonEltMask, DAG, Depth))
+ Op, DemandedBits, DemandedElts, DoNotPoisonEltMask, DAG, Depth))
return V;
break;
}
@@ -4058,12 +4058,14 @@ bool TargetLowering::SimplifyDemandedBitsForTargetNode(
Op.getOpcode() == ISD::INTRINSIC_VOID) &&
"Should use SimplifyDemandedBits if you don't know whether Op"
" is a target node!");
- computeKnownBitsForTargetNode(Op, Known, DemandedElts, TLO.DAG, Depth);
+ computeKnownBitsForTargetNode(Op, Known, DemandedElts,
+ TLO.DAG, Depth);
return false;
}
SDValue TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
+ const APInt &DoNotPoisonEltMask,
SelectionDAG &DAG, unsigned Depth) const {
assert(
(Op.getOpcode() >= ISD::BUILTIN_OP_END ||
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 219c1b672417e..c73dccc229e31 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -43672,9 +43672,9 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
return true;
if (!DemandedElts.isAllOnes()) {
- SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS | DoNotPoisonEltMask,
+ SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS, DoNotPoisonEltMask,
TLO.DAG, Depth + 1);
- SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS | DoNotPoisonEltMask,
+ SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS, DoNotPoisonEltMask,
TLO.DAG, Depth + 1);
if (NewLHS || NewRHS) {
NewLHS = NewLHS ? NewLHS : LHS;
@@ -43796,6 +43796,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
case X86ISD::BLENDI: {
SmallVector<int, 16> BlendMask;
DecodeBLENDMask(NumElts, Op.getConstantOperandVal(2), BlendMask);
+ // TODO: Do we really need to consider the DoNotPoisonEltMask here?
if (SDValue R = combineBlendOfPermutes(
VT.getSimpleVT(), Op.getOperand(0), Op.getOperand(1), BlendMask,
DemandedElts | DoNotPoisonEltMask, TLO.DAG, Subtarget, SDLoc(Op)))
@@ -44906,6 +44907,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
+ const APInt &DoNotPoisonEltMask,
SelectionDAG &DAG, unsigned Depth) const {
int NumElts = DemandedElts.getBitWidth();
unsigned Opc = Op.getOpcode();
@@ -44919,7 +44921,8 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
MVT VecVT = Vec.getSimpleValueType();
if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
- !DemandedElts[CIdx->getZExtValue()])
+ !DemandedElts[CIdx->getZExtValue()] &&
+ !DoNotPoisonEltMask[CIdx->getZExtValue()])
return Vec;
break;
}
@@ -44954,7 +44957,7 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
- KnownBits CondKnown = DAG.computeKnownBits(Cond, DemandedElts, Depth + 1);
+ KnownBits CondKnown = DAG.computeKnownBits(Cond, DemandedElts | DoNotPoisonEltMask, Depth + 1);
if (CondKnown.isNegative())
return LHS;
if (CondKnown.isNonNegative())
@@ -44966,8 +44969,8 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
- KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1);
- KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1);
+ KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts | DoNotPoisonEltMask, Depth + 1);
+ KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts | DoNotPoisonEltMask, Depth + 1);
// If all of the demanded bits are known 0 on LHS and known 0 on RHS, then
// the (inverted) LHS bits cannot contribute to the result of the 'andn' in
@@ -44981,7 +44984,8 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
APInt ShuffleUndef, ShuffleZero;
SmallVector<int, 16> ShuffleMask;
SmallVector<SDValue, 2> ShuffleOps;
- if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
+ if (getTargetShuffleInputs(Op, DemandedElts | DoNotPoisonEltMask,
+ ShuffleOps, ShuffleMask,
ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
// If all the demanded elts are from one operand and are inline,
// then we can use the operand directly.
@@ -45000,7 +45004,7 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
APInt IdentityOp = APInt::getAllOnes(NumOps);
for (int i = 0; i != NumElts; ++i) {
int M = ShuffleMask[i];
- if (!DemandedElts[i] || ShuffleUndef[i])
+ if (!(DemandedElts[i] || DoNotPoisonEltMask[i]) || ShuffleUndef[i])
continue;
int OpIdx = M / NumElts;
int EltIdx = M % NumElts;
@@ -45021,7 +45025,7 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
}
return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
- Op, DemandedBits, DemandedElts, DAG, Depth);
+ Op, DemandedBits, DemandedElts, DoNotPoisonEltMask, DAG, Depth);
}
bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 20e0b123c0539..5cd120b47c967 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1325,6 +1325,7 @@ namespace llvm {
SDValue SimplifyMultipleUseDemandedBitsForTargetNode(
SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
+ const APInt &DoNotPoisonEltMask,
SelectionDAG &DAG, unsigned Depth) const override;
bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(