[llvm] r325232 - [SelectionDAG] Add initial implementation of TargetLowering::SimplifyDemandedVectorElts
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 15 04:14:15 PST 2018
Author: rksimon
Date: Thu Feb 15 04:14:15 2018
New Revision: 325232
URL: http://llvm.org/viewvc/llvm-project?rev=325232&view=rev
Log:
[SelectionDAG] Add initial implementation of TargetLowering::SimplifyDemandedVectorElts
This is mainly a move of simplifyShuffleOperands from DAGCombiner::visitVECTOR_SHUFFLE to create a more general purpose TargetLowering::SimplifyDemandedVectorElts implementation.
Further features can be moved/added in future patches.
Differential Revision: https://reviews.llvm.org/D42896
Modified:
llvm/trunk/include/llvm/CodeGen/TargetLowering.h
llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp
llvm/trunk/test/CodeGen/Mips/cconv/vector.ll
llvm/trunk/test/CodeGen/X86/combine-sra.ll
llvm/trunk/test/CodeGen/X86/split-extend-vector-inreg.ll
llvm/trunk/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
llvm/trunk/test/CodeGen/X86/sse3.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-avx512.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-sse1.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll
llvm/trunk/test/CodeGen/X86/vector-trunc.ll
Modified: llvm/trunk/include/llvm/CodeGen/TargetLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/TargetLowering.h?rev=325232&r1=325231&r2=325232&view=diff
==============================================================================
--- llvm/trunk/include/llvm/CodeGen/TargetLowering.h (original)
+++ llvm/trunk/include/llvm/CodeGen/TargetLowering.h Thu Feb 15 04:14:15 2018
@@ -2707,6 +2707,30 @@ public:
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedMask,
DAGCombinerInfo &DCI) const;
+ /// Look at Vector Op. At this point, we know that only the DemandedElts
+ /// elements of the result of Op are ever used downstream. If we can use
+ /// this information to simplify Op, create a new simplified DAG node and
+ /// return true, storing the original and new nodes in TLO.
+ /// Otherwise, analyze the expression and return a mask of KnownUndef and
+ /// KnownZero elements for the expression (used to simplify the caller).
+ /// The KnownUndef/Zero elements may only be accurate for those bits
+ /// in the DemandedMask.
+ /// \p AssumeSingleUse When this parameter is true, this function will
+ /// attempt to simplify \p Op even if there are multiple uses.
+ /// Callers are responsible for correctly updating the DAG based on the
+ /// results of this function, because simply replacing TLO.Old
+ /// with TLO.New will be incorrect when this parameter is true and TLO.Old
+ /// has multiple uses.
+ bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedElts,
+ APInt &KnownUndef, APInt &KnownZero,
+ TargetLoweringOpt &TLO, unsigned Depth = 0,
+ bool AssumeSingleUse = false) const;
+
+ /// Helper wrapper around SimplifyDemandedVectorElts
+ bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedElts,
+ APInt &KnownUndef, APInt &KnownZero,
+ DAGCombinerInfo &DCI) const;
+
/// Determine which of the bits specified in Mask are known to be either zero
/// or one and return them in the KnownZero/KnownOne bitsets. The DemandedElts
/// argument allows us to only collect the known bits that are shared by the
@@ -2735,6 +2759,15 @@ public:
const SelectionDAG &DAG,
unsigned Depth = 0) const;
+ /// Attempt to simplify any target nodes based on the demanded vector
+ /// elements, returning true on success. Otherwise, analyze the expression and
+ /// return a mask of KnownUndef and KnownZero elements for the expression
+ /// (used to simplify the caller). The KnownUndef/Zero elements may only be
+ /// accurate for those bits in the DemandedMask.
+ virtual bool SimplifyDemandedVectorEltsForTargetNode(
+ SDValue Op, const APInt &DemandedElts, APInt &KnownUndef,
+ APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth = 0) const;
+
struct DAGCombinerInfo {
void *DC; // The DAG Combiner object.
CombineLevel Level;
Modified: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp?rev=325232&r1=325231&r2=325232&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp Thu Feb 15 04:14:15 2018
@@ -232,7 +232,17 @@ namespace {
return SimplifyDemandedBits(Op, Demanded);
}
+ /// Check the specified vector node value to see if it can be simplified or
+ /// if things it uses can be simplified as it only uses some of the
+ /// elements. If so, return true.
+ bool SimplifyDemandedVectorElts(SDValue Op) {
+ unsigned NumElts = Op.getValueType().getVectorNumElements();
+ APInt Demanded = APInt::getAllOnesValue(NumElts);
+ return SimplifyDemandedVectorElts(Op, Demanded);
+ }
+
bool SimplifyDemandedBits(SDValue Op, const APInt &Demanded);
+ bool SimplifyDemandedVectorElts(SDValue Op, const APInt &Demanded);
bool CombineToPreIndexedLoadStore(SDNode *N);
bool CombineToPostIndexedLoadStore(SDNode *N);
@@ -1085,6 +1095,28 @@ bool DAGCombiner::SimplifyDemandedBits(S
return true;
}
+/// Check the specified vector node value to see if it can be simplified or
+/// if things it uses can be simplified as it only uses some of the elements.
+/// If so, return true.
+bool DAGCombiner::SimplifyDemandedVectorElts(SDValue Op,
+ const APInt &Demanded) {
+ TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
+ APInt KnownUndef, KnownZero;
+ if (!TLI.SimplifyDemandedVectorElts(Op, Demanded, KnownUndef, KnownZero, TLO))
+ return false;
+
+ // Revisit the node.
+ AddToWorklist(Op.getNode());
+
+ // Replace the old value with the new one.
+ ++NodesCombined;
+ DEBUG(dbgs() << "\nReplacing.2 "; TLO.Old.getNode()->dump(&DAG);
+ dbgs() << "\nWith: "; TLO.New.getNode()->dump(&DAG); dbgs() << '\n');
+
+ CommitTargetLoweringOpt(TLO);
+ return true;
+}
+
void DAGCombiner::ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad) {
SDLoc DL(Load);
EVT VT = Load->getValueType(0);
@@ -15558,92 +15590,6 @@ SDValue DAGCombiner::visitEXTRACT_SUBVEC
return SDValue();
}
-static SDValue simplifyShuffleOperandRecursively(SmallBitVector &UsedElements,
- SDValue V, SelectionDAG &DAG) {
- SDLoc DL(V);
- EVT VT = V.getValueType();
-
- switch (V.getOpcode()) {
- default:
- return V;
-
- case ISD::CONCAT_VECTORS: {
- EVT OpVT = V->getOperand(0).getValueType();
- int OpSize = OpVT.getVectorNumElements();
- SmallBitVector OpUsedElements(OpSize, false);
- bool FoundSimplification = false;
- SmallVector<SDValue, 4> NewOps;
- NewOps.reserve(V->getNumOperands());
- for (int i = 0, NumOps = V->getNumOperands(); i < NumOps; ++i) {
- SDValue Op = V->getOperand(i);
- bool OpUsed = false;
- for (int j = 0; j < OpSize; ++j)
- if (UsedElements[i * OpSize + j]) {
- OpUsedElements[j] = true;
- OpUsed = true;
- }
- NewOps.push_back(
- OpUsed ? simplifyShuffleOperandRecursively(OpUsedElements, Op, DAG)
- : DAG.getUNDEF(OpVT));
- FoundSimplification |= Op == NewOps.back();
- OpUsedElements.reset();
- }
- if (FoundSimplification)
- V = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, NewOps);
- return V;
- }
-
- case ISD::INSERT_SUBVECTOR: {
- SDValue BaseV = V->getOperand(0);
- SDValue SubV = V->getOperand(1);
- auto *IdxN = dyn_cast<ConstantSDNode>(V->getOperand(2));
- if (!IdxN)
- return V;
-
- int SubSize = SubV.getValueType().getVectorNumElements();
- int Idx = IdxN->getZExtValue();
- bool SubVectorUsed = false;
- SmallBitVector SubUsedElements(SubSize, false);
- for (int i = 0; i < SubSize; ++i)
- if (UsedElements[i + Idx]) {
- SubVectorUsed = true;
- SubUsedElements[i] = true;
- UsedElements[i + Idx] = false;
- }
-
- // Now recurse on both the base and sub vectors.
- SDValue SimplifiedSubV =
- SubVectorUsed
- ? simplifyShuffleOperandRecursively(SubUsedElements, SubV, DAG)
- : DAG.getUNDEF(SubV.getValueType());
- SDValue SimplifiedBaseV = simplifyShuffleOperandRecursively(UsedElements, BaseV, DAG);
- if (SimplifiedSubV != SubV || SimplifiedBaseV != BaseV)
- V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
- SimplifiedBaseV, SimplifiedSubV, V->getOperand(2));
- return V;
- }
- }
-}
-
-static SDValue simplifyShuffleOperands(ShuffleVectorSDNode *SVN, SDValue N0,
- SDValue N1, SelectionDAG &DAG) {
- EVT VT = SVN->getValueType(0);
- int NumElts = VT.getVectorNumElements();
- SmallBitVector N0UsedElements(NumElts, false), N1UsedElements(NumElts, false);
- for (int M : SVN->getMask())
- if (M >= 0 && M < NumElts)
- N0UsedElements[M] = true;
- else if (M >= NumElts)
- N1UsedElements[M - NumElts] = true;
-
- SDValue S0 = simplifyShuffleOperandRecursively(N0UsedElements, N0, DAG);
- SDValue S1 = simplifyShuffleOperandRecursively(N1UsedElements, N1, DAG);
- if (S0 == N0 && S1 == N1)
- return SDValue();
-
- return DAG.getVectorShuffle(VT, SDLoc(SVN), S0, S1, SVN->getMask());
-}
-
static SDValue simplifyShuffleMask(ShuffleVectorSDNode *SVN, SDValue N0,
SDValue N1, SelectionDAG &DAG) {
auto isUndefElt = [](SDValue V, int Idx) {
@@ -16181,11 +16127,9 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE
}
}
- // There are various patterns used to build up a vector from smaller vectors,
- // subvectors, or elements. Scan chains of these and replace unused insertions
- // or components with undef.
- if (SDValue S = simplifyShuffleOperands(SVN, N0, N1, DAG))
- return S;
+ // Simplify source operands based on shuffle mask.
+ if (SimplifyDemandedVectorElts(SDValue(N, 0)))
+ return SDValue(N, 0);
// Match shuffles that can be converted to any_vector_extend_in_reg.
if (SDValue V = combineShuffleToVectorExtend(SVN, DAG, TLI, LegalOperations, LegalTypes))
Modified: llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp?rev=325232&r1=325231&r2=325232&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp Thu Feb 15 04:14:15 2018
@@ -1279,6 +1279,197 @@ bool TargetLowering::SimplifyDemandedBit
return false;
}
+bool TargetLowering::SimplifyDemandedVectorElts(SDValue Op,
+ const APInt &DemandedElts,
+ APInt &KnownUndef,
+ APInt &KnownZero,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+ !DCI.isBeforeLegalizeOps());
+
+ bool Simplified =
+ SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero, TLO);
+ if (Simplified)
+ DCI.CommitTargetLoweringOpt(TLO);
+ return Simplified;
+}
+
+bool TargetLowering::SimplifyDemandedVectorElts(
+ SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef,
+ APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth,
+ bool AssumeSingleUse) const {
+ EVT VT = Op.getValueType();
+ APInt DemandedElts = DemandedEltMask;
+ unsigned NumElts = DemandedElts.getBitWidth();
+ assert(VT.isVector() && "Expected vector op");
+ assert(VT.getVectorNumElements() == NumElts &&
+ "Mask size mismatches value type element count!");
+
+ KnownUndef = KnownZero = APInt::getNullValue(NumElts);
+
+ // Undef operand.
+ if (Op.isUndef()) {
+ KnownUndef.setAllBits();
+ return false;
+ }
+
+ // If Op has other users, assume that all elements are needed.
+ if (!Op.getNode()->hasOneUse() && !AssumeSingleUse)
+ DemandedElts.setAllBits();
+
+ // Not demanding any elements from Op.
+ if (DemandedElts == 0) {
+ KnownUndef.setAllBits();
+ return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
+ }
+
+ // Limit search depth.
+ if (Depth >= 6)
+ return false;
+
+ SDLoc DL(Op);
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+
+ switch (Op.getOpcode()) {
+ case ISD::SCALAR_TO_VECTOR: {
+ if (!DemandedElts[0]) {
+ KnownUndef.setAllBits();
+ return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
+ }
+ KnownUndef.setHighBits(NumElts - 1);
+ break;
+ }
+ case ISD::BUILD_VECTOR: {
+ // Check all elements and simplify any unused elements with UNDEF.
+ if (!DemandedElts.isAllOnesValue()) {
+ // Don't simplify BROADCASTS.
+ if (llvm::any_of(Op->op_values(),
+ [&](SDValue Elt) { return Op.getOperand(0) != Elt; })) {
+ SmallVector<SDValue, 32> Ops(Op->op_begin(), Op->op_end());
+ bool Updated = false;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (!DemandedElts[i] && !Ops[i].isUndef()) {
+ Ops[i] = TLO.DAG.getUNDEF(Ops[0].getValueType());
+ KnownUndef.setBit(i);
+ Updated = true;
+ }
+ }
+ if (Updated)
+ return TLO.CombineTo(Op, TLO.DAG.getBuildVector(VT, DL, Ops));
+ }
+ }
+ for (unsigned i = 0; i != NumElts; ++i) {
+ SDValue SrcOp = Op.getOperand(i);
+ if (SrcOp.isUndef()) {
+ KnownUndef.setBit(i);
+ } else if (EltSizeInBits == SrcOp.getScalarValueSizeInBits() &&
+ (isNullConstant(SrcOp) || isNullFPConstant(SrcOp))) {
+ KnownZero.setBit(i);
+ }
+ }
+ break;
+ }
+ case ISD::CONCAT_VECTORS: {
+ EVT SubVT = Op.getOperand(0).getValueType();
+ unsigned NumSubVecs = Op.getNumOperands();
+ unsigned NumSubElts = SubVT.getVectorNumElements();
+ for (unsigned i = 0; i != NumSubVecs; ++i) {
+ SDValue SubOp = Op.getOperand(i);
+ APInt SubElts = DemandedElts.extractBits(NumSubElts, i * NumSubElts);
+ APInt SubUndef, SubZero;
+ if (SimplifyDemandedVectorElts(SubOp, SubElts, SubUndef, SubZero, TLO,
+ Depth + 1))
+ return true;
+ KnownUndef.insertBits(SubUndef, i * NumSubElts);
+ KnownZero.insertBits(SubZero, i * NumSubElts);
+ }
+ break;
+ }
+ case ISD::INSERT_SUBVECTOR: {
+ if (!isa<ConstantSDNode>(Op.getOperand(2)))
+ break;
+ SDValue Base = Op.getOperand(0);
+ SDValue Sub = Op.getOperand(1);
+ EVT SubVT = Sub.getValueType();
+ unsigned NumSubElts = SubVT.getVectorNumElements();
+ APInt Idx = cast<ConstantSDNode>(Op.getOperand(2))->getAPIntValue();
+ if (Idx.uge(NumElts - NumSubElts))
+ break;
+ unsigned SubIdx = Idx.getZExtValue();
+ APInt SubElts = DemandedElts.extractBits(NumSubElts, SubIdx);
+ APInt SubUndef, SubZero;
+ if (SimplifyDemandedVectorElts(Sub, SubElts, SubUndef, SubZero, TLO,
+ Depth + 1))
+ return true;
+ APInt BaseElts = DemandedElts;
+ BaseElts.insertBits(APInt::getNullValue(NumSubElts), SubIdx);
+ if (SimplifyDemandedVectorElts(Base, BaseElts, KnownUndef, KnownZero, TLO,
+ Depth + 1))
+ return true;
+ KnownUndef.insertBits(SubUndef, SubIdx);
+ KnownZero.insertBits(SubZero, SubIdx);
+ break;
+ }
+ case ISD::VECTOR_SHUFFLE: {
+ ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(Op)->getMask();
+
+ // Collect demanded elements from shuffle operands.
+ APInt DemandedLHS(NumElts, 0);
+ APInt DemandedRHS(NumElts, 0);
+ for (unsigned i = 0; i != NumElts; ++i) {
+ int M = ShuffleMask[i];
+ if (M < 0 || !DemandedElts[i])
+ continue;
+ assert(0 <= M && M < (int)(2 * NumElts) && "Shuffle index out of range");
+ if (M < (int)NumElts)
+ DemandedLHS.setBit(M);
+ else
+ DemandedRHS.setBit(M - NumElts);
+ }
+
+ // See if we can simplify either shuffle operand.
+ APInt UndefLHS, ZeroLHS;
+ APInt UndefRHS, ZeroRHS;
+ if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, UndefLHS,
+ ZeroLHS, TLO, Depth + 1))
+ return true;
+ if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, UndefRHS,
+ ZeroRHS, TLO, Depth + 1))
+ return true;
+
+ // Propagate undef/zero elements from LHS/RHS.
+ for (unsigned i = 0; i != NumElts; ++i) {
+ int M = ShuffleMask[i];
+ if (M < 0) {
+ KnownUndef.setBit(i);
+ } else if (M < (int)NumElts) {
+ if (UndefLHS[M])
+ KnownUndef.setBit(i);
+ if (ZeroLHS[M])
+ KnownZero.setBit(i);
+ } else {
+ if (UndefRHS[M - NumElts])
+ KnownUndef.setBit(i);
+ if (ZeroRHS[M - NumElts])
+ KnownZero.setBit(i);
+ }
+ }
+ break;
+ }
+ default: {
+ if (Op.getOpcode() >= ISD::BUILTIN_OP_END)
+ if (SimplifyDemandedVectorEltsForTargetNode(Op, DemandedElts, KnownUndef,
+ KnownZero, TLO, Depth))
+ return true;
+ break;
+ }
+ }
+
+ assert((KnownUndef & KnownZero) == 0 && "Elements flagged as undef AND zero");
+ return false;
+}
+
/// Determine which of the bits specified in Mask are known to be either zero or
/// one and return them in the Known.
void TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
@@ -1323,6 +1514,18 @@ unsigned TargetLowering::ComputeNumSignB
return 1;
}
+bool TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
+ SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
+ TargetLoweringOpt &TLO, unsigned Depth) const {
+ assert((Op.getOpcode() >= ISD::BUILTIN_OP_END ||
+ Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
+ Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
+ Op.getOpcode() == ISD::INTRINSIC_VOID) &&
+ "Should use SimplifyDemandedVectorElts if you don't know whether Op"
+ " is a target node!");
+ return false;
+}
+
// FIXME: Ideally, this would use ISD::isConstantSplatVector(), but that must
// work with truncating build vectors and vectors with elements of less than
// 8 bits.
Modified: llvm/trunk/test/CodeGen/Mips/cconv/vector.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Mips/cconv/vector.ll?rev=325232&r1=325231&r2=325232&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/Mips/cconv/vector.ll (original)
+++ llvm/trunk/test/CodeGen/Mips/cconv/vector.ll Thu Feb 15 04:14:15 2018
@@ -50,40 +50,40 @@ define <2 x i8> @i8_2(<2 x i8> %a, <2 x
;
; MIPS32R5EB-LABEL: i8_2:
; MIPS32R5EB: # %bb.0:
-; MIPS32R5EB-NEXT: addiu $sp, $sp, -16
-; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 16
-; MIPS32R5EB-NEXT: sw $5, 8($sp)
-; MIPS32R5EB-NEXT: sw $4, 12($sp)
-; MIPS32R5EB-NEXT: ldi.b $w0, 0
-; MIPS32R5EB-NEXT: lbu $1, 9($sp)
-; MIPS32R5EB-NEXT: lbu $2, 8($sp)
-; MIPS32R5EB-NEXT: move.v $w1, $w0
-; MIPS32R5EB-NEXT: insert.w $w1[0], $2
-; MIPS32R5EB-NEXT: insert.w $w1[1], $1
-; MIPS32R5EB-NEXT: lbu $1, 12($sp)
-; MIPS32R5EB-NEXT: insert.w $w0[0], $1
-; MIPS32R5EB-NEXT: lbu $1, 10($sp)
-; MIPS32R5EB-NEXT: lbu $2, 13($sp)
-; MIPS32R5EB-NEXT: insert.w $w0[1], $2
-; MIPS32R5EB-NEXT: insert.w $w1[2], $1
-; MIPS32R5EB-NEXT: lbu $1, 11($sp)
-; MIPS32R5EB-NEXT: insert.w $w1[3], $1
-; MIPS32R5EB-NEXT: ilvr.w $w1, $w1, $w1
-; MIPS32R5EB-NEXT: lbu $1, 14($sp)
-; MIPS32R5EB-NEXT: shf.w $w1, $w1, 177
-; MIPS32R5EB-NEXT: insert.w $w0[2], $1
-; MIPS32R5EB-NEXT: lbu $1, 15($sp)
-; MIPS32R5EB-NEXT: insert.w $w0[3], $1
+; MIPS32R5EB-NEXT: addiu $sp, $sp, -48
+; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 48
+; MIPS32R5EB-NEXT: sw $fp, 44($sp) # 4-byte Folded Spill
+; MIPS32R5EB-NEXT: .cfi_offset 30, -4
+; MIPS32R5EB-NEXT: move $fp, $sp
+; MIPS32R5EB-NEXT: .cfi_def_cfa_register 30
+; MIPS32R5EB-NEXT: addiu $1, $zero, -16
+; MIPS32R5EB-NEXT: and $sp, $sp, $1
+; MIPS32R5EB-NEXT: sw $5, 36($sp)
+; MIPS32R5EB-NEXT: sw $4, 40($sp)
+; MIPS32R5EB-NEXT: lbu $1, 37($sp)
+; MIPS32R5EB-NEXT: sw $1, 20($sp)
+; MIPS32R5EB-NEXT: lbu $1, 36($sp)
+; MIPS32R5EB-NEXT: sw $1, 16($sp)
+; MIPS32R5EB-NEXT: lbu $1, 40($sp)
+; MIPS32R5EB-NEXT: lbu $2, 41($sp)
+; MIPS32R5EB-NEXT: sw $2, 4($sp)
+; MIPS32R5EB-NEXT: sw $1, 0($sp)
+; MIPS32R5EB-NEXT: ld.w $w0, 16($sp)
; MIPS32R5EB-NEXT: ilvr.w $w0, $w0, $w0
; MIPS32R5EB-NEXT: shf.w $w0, $w0, 177
-; MIPS32R5EB-NEXT: addv.d $w0, $w0, $w1
+; MIPS32R5EB-NEXT: ld.w $w1, 0($sp)
+; MIPS32R5EB-NEXT: ilvr.w $w1, $w1, $w1
+; MIPS32R5EB-NEXT: shf.w $w1, $w1, 177
+; MIPS32R5EB-NEXT: addv.d $w0, $w1, $w0
; MIPS32R5EB-NEXT: shf.w $w0, $w0, 177
; MIPS32R5EB-NEXT: copy_s.w $1, $w0[1]
; MIPS32R5EB-NEXT: copy_s.w $2, $w0[3]
-; MIPS32R5EB-NEXT: sb $2, 5($sp)
-; MIPS32R5EB-NEXT: sb $1, 4($sp)
-; MIPS32R5EB-NEXT: lhu $2, 4($sp)
-; MIPS32R5EB-NEXT: addiu $sp, $sp, 16
+; MIPS32R5EB-NEXT: sb $2, 33($sp)
+; MIPS32R5EB-NEXT: sb $1, 32($sp)
+; MIPS32R5EB-NEXT: lhu $2, 32($sp)
+; MIPS32R5EB-NEXT: move $sp, $fp
+; MIPS32R5EB-NEXT: lw $fp, 44($sp) # 4-byte Folded Reload
+; MIPS32R5EB-NEXT: addiu $sp, $sp, 48
; MIPS32R5EB-NEXT: jr $ra
; MIPS32R5EB-NEXT: nop
;
@@ -179,37 +179,37 @@ define <2 x i8> @i8_2(<2 x i8> %a, <2 x
;
; MIPS32R5EL-LABEL: i8_2:
; MIPS32R5EL: # %bb.0:
-; MIPS32R5EL-NEXT: addiu $sp, $sp, -16
-; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 16
-; MIPS32R5EL-NEXT: sw $5, 8($sp)
-; MIPS32R5EL-NEXT: sw $4, 12($sp)
-; MIPS32R5EL-NEXT: ldi.b $w0, 0
-; MIPS32R5EL-NEXT: lbu $1, 9($sp)
-; MIPS32R5EL-NEXT: lbu $2, 12($sp)
-; MIPS32R5EL-NEXT: lbu $3, 8($sp)
-; MIPS32R5EL-NEXT: move.v $w1, $w0
-; MIPS32R5EL-NEXT: insert.w $w1[0], $3
-; MIPS32R5EL-NEXT: insert.w $w0[0], $2
-; MIPS32R5EL-NEXT: insert.w $w1[1], $1
-; MIPS32R5EL-NEXT: lbu $1, 10($sp)
-; MIPS32R5EL-NEXT: insert.w $w1[2], $1
-; MIPS32R5EL-NEXT: lbu $1, 11($sp)
-; MIPS32R5EL-NEXT: insert.w $w1[3], $1
-; MIPS32R5EL-NEXT: ilvr.w $w1, $w1, $w1
-; MIPS32R5EL-NEXT: lbu $1, 13($sp)
-; MIPS32R5EL-NEXT: insert.w $w0[1], $1
-; MIPS32R5EL-NEXT: lbu $1, 14($sp)
-; MIPS32R5EL-NEXT: insert.w $w0[2], $1
-; MIPS32R5EL-NEXT: lbu $1, 15($sp)
-; MIPS32R5EL-NEXT: insert.w $w0[3], $1
+; MIPS32R5EL-NEXT: addiu $sp, $sp, -48
+; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 48
+; MIPS32R5EL-NEXT: sw $fp, 44($sp) # 4-byte Folded Spill
+; MIPS32R5EL-NEXT: .cfi_offset 30, -4
+; MIPS32R5EL-NEXT: move $fp, $sp
+; MIPS32R5EL-NEXT: .cfi_def_cfa_register 30
+; MIPS32R5EL-NEXT: addiu $1, $zero, -16
+; MIPS32R5EL-NEXT: and $sp, $sp, $1
+; MIPS32R5EL-NEXT: sw $5, 36($sp)
+; MIPS32R5EL-NEXT: sw $4, 40($sp)
+; MIPS32R5EL-NEXT: lbu $1, 37($sp)
+; MIPS32R5EL-NEXT: sw $1, 20($sp)
+; MIPS32R5EL-NEXT: lbu $1, 36($sp)
+; MIPS32R5EL-NEXT: sw $1, 16($sp)
+; MIPS32R5EL-NEXT: lbu $1, 41($sp)
+; MIPS32R5EL-NEXT: sw $1, 4($sp)
+; MIPS32R5EL-NEXT: lbu $1, 40($sp)
+; MIPS32R5EL-NEXT: sw $1, 0($sp)
+; MIPS32R5EL-NEXT: ld.w $w0, 16($sp)
; MIPS32R5EL-NEXT: ilvr.w $w0, $w0, $w0
-; MIPS32R5EL-NEXT: addv.d $w0, $w0, $w1
+; MIPS32R5EL-NEXT: ld.w $w1, 0($sp)
+; MIPS32R5EL-NEXT: ilvr.w $w1, $w1, $w1
+; MIPS32R5EL-NEXT: addv.d $w0, $w1, $w0
; MIPS32R5EL-NEXT: copy_s.w $1, $w0[0]
; MIPS32R5EL-NEXT: copy_s.w $2, $w0[2]
-; MIPS32R5EL-NEXT: sb $2, 5($sp)
-; MIPS32R5EL-NEXT: sb $1, 4($sp)
-; MIPS32R5EL-NEXT: lhu $2, 4($sp)
-; MIPS32R5EL-NEXT: addiu $sp, $sp, 16
+; MIPS32R5EL-NEXT: sb $2, 33($sp)
+; MIPS32R5EL-NEXT: sb $1, 32($sp)
+; MIPS32R5EL-NEXT: lhu $2, 32($sp)
+; MIPS32R5EL-NEXT: move $sp, $fp
+; MIPS32R5EL-NEXT: lw $fp, 44($sp) # 4-byte Folded Reload
+; MIPS32R5EL-NEXT: addiu $sp, $sp, 48
; MIPS32R5EL-NEXT: jr $ra
; MIPS32R5EL-NEXT: nop
;
@@ -364,102 +364,82 @@ define <2 x i8> @i8x2_7(<2 x i8> %a, <2
;
; MIPS32R5EB-LABEL: i8x2_7:
; MIPS32R5EB: # %bb.0: # %entry
-; MIPS32R5EB-NEXT: addiu $sp, $sp, -24
-; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 24
-; MIPS32R5EB-NEXT: sw $5, 16($sp)
-; MIPS32R5EB-NEXT: sw $4, 20($sp)
-; MIPS32R5EB-NEXT: ldi.b $w0, 0
-; MIPS32R5EB-NEXT: lbu $1, 17($sp)
-; MIPS32R5EB-NEXT: lbu $2, 16($sp)
-; MIPS32R5EB-NEXT: move.v $w1, $w0
-; MIPS32R5EB-NEXT: insert.w $w1[0], $2
-; MIPS32R5EB-NEXT: insert.w $w1[1], $1
-; MIPS32R5EB-NEXT: lbu $1, 18($sp)
-; MIPS32R5EB-NEXT: lbu $2, 21($sp)
-; MIPS32R5EB-NEXT: lbu $3, 20($sp)
-; MIPS32R5EB-NEXT: move.v $w2, $w0
-; MIPS32R5EB-NEXT: insert.w $w2[0], $3
-; MIPS32R5EB-NEXT: insert.w $w2[1], $2
-; MIPS32R5EB-NEXT: insert.w $w1[2], $1
-; MIPS32R5EB-NEXT: lbu $1, 19($sp)
-; MIPS32R5EB-NEXT: insert.w $w1[3], $1
-; MIPS32R5EB-NEXT: ilvr.w $w1, $w1, $w1
-; MIPS32R5EB-NEXT: lbu $1, 22($sp)
-; MIPS32R5EB-NEXT: shf.w $w1, $w1, 177
-; MIPS32R5EB-NEXT: insert.w $w2[2], $1
-; MIPS32R5EB-NEXT: lbu $1, 23($sp)
-; MIPS32R5EB-NEXT: insert.w $w2[3], $1
-; MIPS32R5EB-NEXT: ilvr.w $w2, $w2, $w2
-; MIPS32R5EB-NEXT: shf.w $w2, $w2, 177
-; MIPS32R5EB-NEXT: addv.d $w1, $w2, $w1
-; MIPS32R5EB-NEXT: sw $6, 12($sp)
-; MIPS32R5EB-NEXT: lbu $1, 13($sp)
-; MIPS32R5EB-NEXT: lbu $2, 12($sp)
-; MIPS32R5EB-NEXT: move.v $w2, $w0
-; MIPS32R5EB-NEXT: insert.w $w2[0], $2
-; MIPS32R5EB-NEXT: insert.w $w2[1], $1
-; MIPS32R5EB-NEXT: lbu $1, 14($sp)
-; MIPS32R5EB-NEXT: insert.w $w2[2], $1
-; MIPS32R5EB-NEXT: lbu $1, 15($sp)
-; MIPS32R5EB-NEXT: insert.w $w2[3], $1
-; MIPS32R5EB-NEXT: ilvr.w $w2, $w2, $w2
-; MIPS32R5EB-NEXT: shf.w $w2, $w2, 177
-; MIPS32R5EB-NEXT: addv.d $w1, $w1, $w2
-; MIPS32R5EB-NEXT: sw $7, 8($sp)
-; MIPS32R5EB-NEXT: lbu $1, 9($sp)
-; MIPS32R5EB-NEXT: lbu $2, 8($sp)
-; MIPS32R5EB-NEXT: move.v $w2, $w0
-; MIPS32R5EB-NEXT: insert.w $w2[0], $2
-; MIPS32R5EB-NEXT: insert.w $w2[1], $1
-; MIPS32R5EB-NEXT: lbu $1, 10($sp)
-; MIPS32R5EB-NEXT: insert.w $w2[2], $1
-; MIPS32R5EB-NEXT: lbu $1, 11($sp)
-; MIPS32R5EB-NEXT: insert.w $w2[3], $1
-; MIPS32R5EB-NEXT: ilvr.w $w2, $w2, $w2
-; MIPS32R5EB-NEXT: shf.w $w2, $w2, 177
-; MIPS32R5EB-NEXT: addv.d $w1, $w1, $w2
-; MIPS32R5EB-NEXT: lbu $1, 41($sp)
-; MIPS32R5EB-NEXT: lbu $2, 40($sp)
-; MIPS32R5EB-NEXT: move.v $w2, $w0
-; MIPS32R5EB-NEXT: insert.w $w2[0], $2
-; MIPS32R5EB-NEXT: insert.w $w2[1], $1
-; MIPS32R5EB-NEXT: lbu $1, 42($sp)
-; MIPS32R5EB-NEXT: insert.w $w2[2], $1
-; MIPS32R5EB-NEXT: lbu $1, 43($sp)
-; MIPS32R5EB-NEXT: insert.w $w2[3], $1
-; MIPS32R5EB-NEXT: ilvr.w $w2, $w2, $w2
-; MIPS32R5EB-NEXT: shf.w $w2, $w2, 177
-; MIPS32R5EB-NEXT: addv.d $w1, $w1, $w2
-; MIPS32R5EB-NEXT: lbu $1, 45($sp)
-; MIPS32R5EB-NEXT: lbu $2, 44($sp)
-; MIPS32R5EB-NEXT: move.v $w2, $w0
-; MIPS32R5EB-NEXT: insert.w $w2[0], $2
-; MIPS32R5EB-NEXT: insert.w $w2[1], $1
-; MIPS32R5EB-NEXT: lbu $1, 46($sp)
-; MIPS32R5EB-NEXT: insert.w $w2[2], $1
-; MIPS32R5EB-NEXT: lbu $1, 47($sp)
-; MIPS32R5EB-NEXT: insert.w $w2[3], $1
-; MIPS32R5EB-NEXT: ilvr.w $w2, $w2, $w2
-; MIPS32R5EB-NEXT: shf.w $w2, $w2, 177
-; MIPS32R5EB-NEXT: addv.d $w1, $w1, $w2
-; MIPS32R5EB-NEXT: lbu $1, 48($sp)
-; MIPS32R5EB-NEXT: insert.w $w0[0], $1
-; MIPS32R5EB-NEXT: lbu $1, 49($sp)
-; MIPS32R5EB-NEXT: insert.w $w0[1], $1
-; MIPS32R5EB-NEXT: lbu $1, 50($sp)
-; MIPS32R5EB-NEXT: insert.w $w0[2], $1
-; MIPS32R5EB-NEXT: lbu $1, 51($sp)
-; MIPS32R5EB-NEXT: insert.w $w0[3], $1
+; MIPS32R5EB-NEXT: addiu $sp, $sp, -144
+; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 144
+; MIPS32R5EB-NEXT: sw $fp, 140($sp) # 4-byte Folded Spill
+; MIPS32R5EB-NEXT: .cfi_offset 30, -4
+; MIPS32R5EB-NEXT: move $fp, $sp
+; MIPS32R5EB-NEXT: .cfi_def_cfa_register 30
+; MIPS32R5EB-NEXT: addiu $1, $zero, -16
+; MIPS32R5EB-NEXT: and $sp, $sp, $1
+; MIPS32R5EB-NEXT: sw $5, 132($sp)
+; MIPS32R5EB-NEXT: sw $4, 136($sp)
+; MIPS32R5EB-NEXT: lbu $1, 133($sp)
+; MIPS32R5EB-NEXT: sw $1, 68($sp)
+; MIPS32R5EB-NEXT: lbu $1, 132($sp)
+; MIPS32R5EB-NEXT: sw $1, 64($sp)
+; MIPS32R5EB-NEXT: lbu $1, 136($sp)
+; MIPS32R5EB-NEXT: lbu $2, 137($sp)
+; MIPS32R5EB-NEXT: sw $2, 52($sp)
+; MIPS32R5EB-NEXT: sw $1, 48($sp)
+; MIPS32R5EB-NEXT: ld.w $w0, 64($sp)
; MIPS32R5EB-NEXT: ilvr.w $w0, $w0, $w0
; MIPS32R5EB-NEXT: shf.w $w0, $w0, 177
+; MIPS32R5EB-NEXT: ld.w $w1, 48($sp)
+; MIPS32R5EB-NEXT: ilvr.w $w1, $w1, $w1
+; MIPS32R5EB-NEXT: shf.w $w1, $w1, 177
; MIPS32R5EB-NEXT: addv.d $w0, $w1, $w0
+; MIPS32R5EB-NEXT: sw $6, 128($sp)
+; MIPS32R5EB-NEXT: lbu $1, 129($sp)
+; MIPS32R5EB-NEXT: sw $1, 84($sp)
+; MIPS32R5EB-NEXT: lbu $1, 128($sp)
+; MIPS32R5EB-NEXT: sw $1, 80($sp)
+; MIPS32R5EB-NEXT: ld.w $w1, 80($sp)
+; MIPS32R5EB-NEXT: ilvr.w $w1, $w1, $w1
+; MIPS32R5EB-NEXT: shf.w $w1, $w1, 177
+; MIPS32R5EB-NEXT: addv.d $w0, $w0, $w1
+; MIPS32R5EB-NEXT: sw $7, 124($sp)
+; MIPS32R5EB-NEXT: lbu $1, 125($sp)
+; MIPS32R5EB-NEXT: sw $1, 100($sp)
+; MIPS32R5EB-NEXT: lbu $1, 124($sp)
+; MIPS32R5EB-NEXT: sw $1, 96($sp)
+; MIPS32R5EB-NEXT: ld.w $w1, 96($sp)
+; MIPS32R5EB-NEXT: ilvr.w $w1, $w1, $w1
+; MIPS32R5EB-NEXT: shf.w $w1, $w1, 177
+; MIPS32R5EB-NEXT: addv.d $w0, $w0, $w1
+; MIPS32R5EB-NEXT: lbu $1, 161($fp)
+; MIPS32R5EB-NEXT: sw $1, 4($sp)
+; MIPS32R5EB-NEXT: lbu $1, 160($fp)
+; MIPS32R5EB-NEXT: sw $1, 0($sp)
+; MIPS32R5EB-NEXT: ld.w $w1, 0($sp)
+; MIPS32R5EB-NEXT: ilvr.w $w1, $w1, $w1
+; MIPS32R5EB-NEXT: shf.w $w1, $w1, 177
+; MIPS32R5EB-NEXT: addv.d $w0, $w0, $w1
+; MIPS32R5EB-NEXT: lbu $1, 165($fp)
+; MIPS32R5EB-NEXT: sw $1, 20($sp)
+; MIPS32R5EB-NEXT: lbu $1, 164($fp)
+; MIPS32R5EB-NEXT: sw $1, 16($sp)
+; MIPS32R5EB-NEXT: ld.w $w1, 16($sp)
+; MIPS32R5EB-NEXT: ilvr.w $w1, $w1, $w1
+; MIPS32R5EB-NEXT: shf.w $w1, $w1, 177
+; MIPS32R5EB-NEXT: addv.d $w0, $w0, $w1
+; MIPS32R5EB-NEXT: lbu $1, 169($fp)
+; MIPS32R5EB-NEXT: sw $1, 36($sp)
+; MIPS32R5EB-NEXT: lbu $1, 168($fp)
+; MIPS32R5EB-NEXT: sw $1, 32($sp)
+; MIPS32R5EB-NEXT: ld.w $w1, 32($sp)
+; MIPS32R5EB-NEXT: ilvr.w $w1, $w1, $w1
+; MIPS32R5EB-NEXT: shf.w $w1, $w1, 177
+; MIPS32R5EB-NEXT: addv.d $w0, $w0, $w1
; MIPS32R5EB-NEXT: shf.w $w0, $w0, 177
; MIPS32R5EB-NEXT: copy_s.w $1, $w0[1]
; MIPS32R5EB-NEXT: copy_s.w $2, $w0[3]
-; MIPS32R5EB-NEXT: sb $2, 5($sp)
-; MIPS32R5EB-NEXT: sb $1, 4($sp)
-; MIPS32R5EB-NEXT: lhu $2, 4($sp)
-; MIPS32R5EB-NEXT: addiu $sp, $sp, 24
+; MIPS32R5EB-NEXT: sb $2, 121($sp)
+; MIPS32R5EB-NEXT: sb $1, 120($sp)
+; MIPS32R5EB-NEXT: lhu $2, 120($sp)
+; MIPS32R5EB-NEXT: move $sp, $fp
+; MIPS32R5EB-NEXT: lw $fp, 140($sp) # 4-byte Folded Reload
+; MIPS32R5EB-NEXT: addiu $sp, $sp, 144
; MIPS32R5EB-NEXT: jr $ra
; MIPS32R5EB-NEXT: nop
;
@@ -720,94 +700,74 @@ define <2 x i8> @i8x2_7(<2 x i8> %a, <2
;
; MIPS32R5EL-LABEL: i8x2_7:
; MIPS32R5EL: # %bb.0: # %entry
-; MIPS32R5EL-NEXT: addiu $sp, $sp, -24
-; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 24
-; MIPS32R5EL-NEXT: sw $5, 16($sp)
-; MIPS32R5EL-NEXT: ldi.b $w0, 0
-; MIPS32R5EL-NEXT: sw $4, 20($sp)
-; MIPS32R5EL-NEXT: lbu $1, 17($sp)
-; MIPS32R5EL-NEXT: lbu $2, 16($sp)
-; MIPS32R5EL-NEXT: move.v $w1, $w0
-; MIPS32R5EL-NEXT: insert.w $w1[0], $2
-; MIPS32R5EL-NEXT: insert.w $w1[1], $1
-; MIPS32R5EL-NEXT: lbu $1, 18($sp)
-; MIPS32R5EL-NEXT: insert.w $w1[2], $1
-; MIPS32R5EL-NEXT: lbu $1, 19($sp)
-; MIPS32R5EL-NEXT: insert.w $w1[3], $1
-; MIPS32R5EL-NEXT: ilvr.w $w1, $w1, $w1
-; MIPS32R5EL-NEXT: lbu $1, 21($sp)
-; MIPS32R5EL-NEXT: lbu $2, 20($sp)
-; MIPS32R5EL-NEXT: move.v $w2, $w0
-; MIPS32R5EL-NEXT: insert.w $w2[0], $2
-; MIPS32R5EL-NEXT: insert.w $w2[1], $1
-; MIPS32R5EL-NEXT: lbu $1, 22($sp)
-; MIPS32R5EL-NEXT: insert.w $w2[2], $1
-; MIPS32R5EL-NEXT: lbu $1, 23($sp)
-; MIPS32R5EL-NEXT: insert.w $w2[3], $1
-; MIPS32R5EL-NEXT: ilvr.w $w2, $w2, $w2
-; MIPS32R5EL-NEXT: addv.d $w1, $w2, $w1
-; MIPS32R5EL-NEXT: sw $6, 12($sp)
-; MIPS32R5EL-NEXT: lbu $1, 13($sp)
-; MIPS32R5EL-NEXT: lbu $2, 12($sp)
-; MIPS32R5EL-NEXT: move.v $w2, $w0
-; MIPS32R5EL-NEXT: insert.w $w2[0], $2
-; MIPS32R5EL-NEXT: insert.w $w2[1], $1
-; MIPS32R5EL-NEXT: lbu $1, 14($sp)
-; MIPS32R5EL-NEXT: insert.w $w2[2], $1
-; MIPS32R5EL-NEXT: lbu $1, 15($sp)
-; MIPS32R5EL-NEXT: insert.w $w2[3], $1
-; MIPS32R5EL-NEXT: ilvr.w $w2, $w2, $w2
-; MIPS32R5EL-NEXT: addv.d $w1, $w1, $w2
-; MIPS32R5EL-NEXT: sw $7, 8($sp)
-; MIPS32R5EL-NEXT: lbu $1, 9($sp)
-; MIPS32R5EL-NEXT: lbu $2, 8($sp)
-; MIPS32R5EL-NEXT: move.v $w2, $w0
-; MIPS32R5EL-NEXT: insert.w $w2[0], $2
-; MIPS32R5EL-NEXT: insert.w $w2[1], $1
-; MIPS32R5EL-NEXT: lbu $1, 10($sp)
-; MIPS32R5EL-NEXT: insert.w $w2[2], $1
-; MIPS32R5EL-NEXT: lbu $1, 11($sp)
-; MIPS32R5EL-NEXT: insert.w $w2[3], $1
-; MIPS32R5EL-NEXT: ilvr.w $w2, $w2, $w2
-; MIPS32R5EL-NEXT: addv.d $w1, $w1, $w2
-; MIPS32R5EL-NEXT: lbu $1, 41($sp)
-; MIPS32R5EL-NEXT: lbu $2, 40($sp)
-; MIPS32R5EL-NEXT: move.v $w2, $w0
-; MIPS32R5EL-NEXT: insert.w $w2[0], $2
-; MIPS32R5EL-NEXT: insert.w $w2[1], $1
-; MIPS32R5EL-NEXT: lbu $1, 42($sp)
-; MIPS32R5EL-NEXT: insert.w $w2[2], $1
-; MIPS32R5EL-NEXT: lbu $1, 43($sp)
-; MIPS32R5EL-NEXT: insert.w $w2[3], $1
-; MIPS32R5EL-NEXT: ilvr.w $w2, $w2, $w2
-; MIPS32R5EL-NEXT: addv.d $w1, $w1, $w2
-; MIPS32R5EL-NEXT: lbu $1, 45($sp)
-; MIPS32R5EL-NEXT: lbu $2, 44($sp)
-; MIPS32R5EL-NEXT: move.v $w2, $w0
-; MIPS32R5EL-NEXT: insert.w $w2[0], $2
-; MIPS32R5EL-NEXT: insert.w $w2[1], $1
-; MIPS32R5EL-NEXT: lbu $1, 46($sp)
-; MIPS32R5EL-NEXT: insert.w $w2[2], $1
-; MIPS32R5EL-NEXT: lbu $1, 47($sp)
-; MIPS32R5EL-NEXT: insert.w $w2[3], $1
-; MIPS32R5EL-NEXT: ilvr.w $w2, $w2, $w2
-; MIPS32R5EL-NEXT: addv.d $w1, $w1, $w2
-; MIPS32R5EL-NEXT: lbu $1, 48($sp)
-; MIPS32R5EL-NEXT: insert.w $w0[0], $1
-; MIPS32R5EL-NEXT: lbu $1, 49($sp)
-; MIPS32R5EL-NEXT: insert.w $w0[1], $1
-; MIPS32R5EL-NEXT: lbu $1, 50($sp)
-; MIPS32R5EL-NEXT: insert.w $w0[2], $1
-; MIPS32R5EL-NEXT: lbu $1, 51($sp)
-; MIPS32R5EL-NEXT: insert.w $w0[3], $1
+; MIPS32R5EL-NEXT: addiu $sp, $sp, -144
+; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 144
+; MIPS32R5EL-NEXT: sw $fp, 140($sp) # 4-byte Folded Spill
+; MIPS32R5EL-NEXT: .cfi_offset 30, -4
+; MIPS32R5EL-NEXT: move $fp, $sp
+; MIPS32R5EL-NEXT: .cfi_def_cfa_register 30
+; MIPS32R5EL-NEXT: addiu $1, $zero, -16
+; MIPS32R5EL-NEXT: and $sp, $sp, $1
+; MIPS32R5EL-NEXT: sw $5, 132($sp)
+; MIPS32R5EL-NEXT: sw $4, 136($sp)
+; MIPS32R5EL-NEXT: lbu $1, 133($sp)
+; MIPS32R5EL-NEXT: sw $1, 68($sp)
+; MIPS32R5EL-NEXT: lbu $1, 132($sp)
+; MIPS32R5EL-NEXT: sw $1, 64($sp)
+; MIPS32R5EL-NEXT: lbu $1, 137($sp)
+; MIPS32R5EL-NEXT: sw $1, 52($sp)
+; MIPS32R5EL-NEXT: lbu $1, 136($sp)
+; MIPS32R5EL-NEXT: sw $1, 48($sp)
+; MIPS32R5EL-NEXT: ld.w $w0, 64($sp)
; MIPS32R5EL-NEXT: ilvr.w $w0, $w0, $w0
+; MIPS32R5EL-NEXT: ld.w $w1, 48($sp)
+; MIPS32R5EL-NEXT: ilvr.w $w1, $w1, $w1
; MIPS32R5EL-NEXT: addv.d $w0, $w1, $w0
+; MIPS32R5EL-NEXT: sw $6, 128($sp)
+; MIPS32R5EL-NEXT: lbu $1, 129($sp)
+; MIPS32R5EL-NEXT: sw $1, 84($sp)
+; MIPS32R5EL-NEXT: lbu $1, 128($sp)
+; MIPS32R5EL-NEXT: sw $1, 80($sp)
+; MIPS32R5EL-NEXT: ld.w $w1, 80($sp)
+; MIPS32R5EL-NEXT: ilvr.w $w1, $w1, $w1
+; MIPS32R5EL-NEXT: addv.d $w0, $w0, $w1
+; MIPS32R5EL-NEXT: sw $7, 124($sp)
+; MIPS32R5EL-NEXT: lbu $1, 125($sp)
+; MIPS32R5EL-NEXT: sw $1, 100($sp)
+; MIPS32R5EL-NEXT: lbu $1, 124($sp)
+; MIPS32R5EL-NEXT: sw $1, 96($sp)
+; MIPS32R5EL-NEXT: ld.w $w1, 96($sp)
+; MIPS32R5EL-NEXT: ilvr.w $w1, $w1, $w1
+; MIPS32R5EL-NEXT: addv.d $w0, $w0, $w1
+; MIPS32R5EL-NEXT: lbu $1, 161($fp)
+; MIPS32R5EL-NEXT: sw $1, 4($sp)
+; MIPS32R5EL-NEXT: lbu $1, 160($fp)
+; MIPS32R5EL-NEXT: sw $1, 0($sp)
+; MIPS32R5EL-NEXT: ld.w $w1, 0($sp)
+; MIPS32R5EL-NEXT: ilvr.w $w1, $w1, $w1
+; MIPS32R5EL-NEXT: addv.d $w0, $w0, $w1
+; MIPS32R5EL-NEXT: lbu $1, 165($fp)
+; MIPS32R5EL-NEXT: sw $1, 20($sp)
+; MIPS32R5EL-NEXT: lbu $1, 164($fp)
+; MIPS32R5EL-NEXT: sw $1, 16($sp)
+; MIPS32R5EL-NEXT: ld.w $w1, 16($sp)
+; MIPS32R5EL-NEXT: ilvr.w $w1, $w1, $w1
+; MIPS32R5EL-NEXT: addv.d $w0, $w0, $w1
+; MIPS32R5EL-NEXT: lbu $1, 169($fp)
+; MIPS32R5EL-NEXT: sw $1, 36($sp)
+; MIPS32R5EL-NEXT: lbu $1, 168($fp)
+; MIPS32R5EL-NEXT: sw $1, 32($sp)
+; MIPS32R5EL-NEXT: ld.w $w1, 32($sp)
+; MIPS32R5EL-NEXT: ilvr.w $w1, $w1, $w1
+; MIPS32R5EL-NEXT: addv.d $w0, $w0, $w1
; MIPS32R5EL-NEXT: copy_s.w $1, $w0[0]
; MIPS32R5EL-NEXT: copy_s.w $2, $w0[2]
-; MIPS32R5EL-NEXT: sb $2, 5($sp)
-; MIPS32R5EL-NEXT: sb $1, 4($sp)
-; MIPS32R5EL-NEXT: lhu $2, 4($sp)
-; MIPS32R5EL-NEXT: addiu $sp, $sp, 24
+; MIPS32R5EL-NEXT: sb $2, 121($sp)
+; MIPS32R5EL-NEXT: sb $1, 120($sp)
+; MIPS32R5EL-NEXT: lhu $2, 120($sp)
+; MIPS32R5EL-NEXT: move $sp, $fp
+; MIPS32R5EL-NEXT: lw $fp, 140($sp) # 4-byte Folded Reload
+; MIPS32R5EL-NEXT: addiu $sp, $sp, 144
; MIPS32R5EL-NEXT: jr $ra
; MIPS32R5EL-NEXT: nop
;
Modified: llvm/trunk/test/CodeGen/X86/combine-sra.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/combine-sra.ll?rev=325232&r1=325231&r2=325232&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/combine-sra.ll (original)
+++ llvm/trunk/test/CodeGen/X86/combine-sra.ll Thu Feb 15 04:14:15 2018
@@ -239,10 +239,7 @@ define <4 x i32> @combine_vec_ashr_trunc
define <4 x i32> @combine_vec_ashr_trunc_ashr(<4 x i64> %x) {
; SSE-LABEL: combine_vec_ashr_trunc_ashr:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE-NEXT: psrad $31, %xmm1
-; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE-NEXT: movaps %xmm0, %xmm2
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: psrad $2, %xmm1
Modified: llvm/trunk/test/CodeGen/X86/split-extend-vector-inreg.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/split-extend-vector-inreg.ll?rev=325232&r1=325231&r2=325232&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/split-extend-vector-inreg.ll (original)
+++ llvm/trunk/test/CodeGen/X86/split-extend-vector-inreg.ll Thu Feb 15 04:14:15 2018
@@ -20,10 +20,7 @@ define <4 x i64> @autogen_SD88863() {
;
; X64-LABEL: autogen_SD88863:
; X64: # %bb.0: # %BB
-; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
+; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT: movb $1, %al
; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: .LBB0_1: # %CF
@@ -31,6 +28,9 @@ define <4 x i64> @autogen_SD88863() {
; X64-NEXT: testb %al, %al
; X64-NEXT: jne .LBB0_1
; X64-NEXT: # %bb.2: # %CF240
+; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; X64-NEXT: retq
BB:
%I26 = insertelement <4 x i64> undef, i64 undef, i32 2
Modified: llvm/trunk/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse-intrinsics-fast-isel.ll?rev=325232&r1=325231&r2=325232&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse-intrinsics-fast-isel.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sse-intrinsics-fast-isel.ll Thu Feb 15 04:14:15 2018
@@ -917,8 +917,6 @@ define <4 x float> @test_mm_loadh_pi(<4
; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X64-NEXT: xorps %xmm2, %xmm2
-; X64-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: retq
%ptr = bitcast x86_mmx* %a1 to <2 x float>*
@@ -948,8 +946,6 @@ define <4 x float> @test_mm_loadl_pi(<4
; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X64-NEXT: xorps %xmm2, %xmm2
-; X64-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; X64-NEXT: movaps %xmm1, %xmm0
; X64-NEXT: retq
Modified: llvm/trunk/test/CodeGen/X86/sse3.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse3.ll?rev=325232&r1=325231&r2=325232&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse3.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sse3.ll Thu Feb 15 04:14:15 2018
@@ -379,16 +379,12 @@ entry:
define <16 x i8> @t16(<16 x i8> %T0) nounwind readnone {
; X86-LABEL: t16:
; X86: # %bb.0: # %entry
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0]
-; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X86-NEXT: movdqa %xmm1, %xmm0
+; X86-NEXT: pslld $16, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: t16:
; X64: # %bb.0: # %entry
-; X64-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0]
-; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-NEXT: movdqa %xmm1, %xmm0
+; X64-NEXT: pslld $16, %xmm0
; X64-NEXT: retq
entry:
%tmp8 = shufflevector <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> %T0, <16 x i32> < i32 0, i32 1, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-avx512.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-avx512.ll?rev=325232&r1=325231&r2=325232&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-avx512.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-avx512.ll Thu Feb 15 04:14:15 2018
@@ -511,10 +511,9 @@ define <8 x float> @expand14(<4 x float>
;
; KNL64-LABEL: expand14:
; KNL64: # %bb.0:
-; KNL64-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,3,0,0]
-; KNL64-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
; KNL64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; KNL64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
+; KNL64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; KNL64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7]
; KNL64-NEXT: retq
;
@@ -528,10 +527,9 @@ define <8 x float> @expand14(<4 x float>
;
; KNL32-LABEL: expand14:
; KNL32: # %bb.0:
-; KNL32-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,3,0,0]
-; KNL32-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
; KNL32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; KNL32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
+; KNL32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; KNL32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7]
; KNL32-NEXT: retl
%addV = fadd <4 x float> <float 0.0,float 1.0,float 2.0,float 0.0> , <float 0.0,float 1.0,float 2.0,float 0.0>
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx2.ll?rev=325232&r1=325231&r2=325232&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx2.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx2.ll Thu Feb 15 04:14:15 2018
@@ -985,9 +985,8 @@ define internal fastcc <8 x float> @PR34
;
; X32-AVX512-LABEL: PR34577:
; X32-AVX512: # %bb.0: # %entry
-; X32-AVX512-NEXT: vmovaps {{.*#+}} ymm2 = <1,u,u,u,2,u,5,0>
-; X32-AVX512-NEXT: vpermps %ymm0, %ymm2, %ymm0
; X32-AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X32-AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; X32-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; X32-AVX512-NEXT: vmovaps {{.*#+}} ymm2 = <u,u,7,2,u,u,3,2>
; X32-AVX512-NEXT: vpermps %ymm1, %ymm2, %ymm1
@@ -1006,9 +1005,8 @@ define internal fastcc <8 x float> @PR34
;
; X64-AVX512-LABEL: PR34577:
; X64-AVX512: # %bb.0: # %entry
-; X64-AVX512-NEXT: vmovaps {{.*#+}} ymm2 = <1,u,u,u,2,u,5,0>
-; X64-AVX512-NEXT: vpermps %ymm0, %ymm2, %ymm0
; X64-AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X64-AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; X64-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; X64-AVX512-NEXT: vmovaps {{.*#+}} ymm2 = <u,u,7,2,u,u,3,2>
; X64-AVX512-NEXT: vpermps %ymm1, %ymm2, %ymm1
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-sse1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-sse1.ll?rev=325232&r1=325231&r2=325232&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-sse1.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-sse1.ll Thu Feb 15 04:14:15 2018
@@ -237,8 +237,6 @@ define <4 x float> @insert_mem_lo_v4f32(
; SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE1-NEXT: xorps %xmm2, %xmm2
-; SSE1-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; SSE1-NEXT: movaps %xmm1, %xmm0
; SSE1-NEXT: retq
@@ -258,8 +256,6 @@ define <4 x float> @insert_mem_hi_v4f32(
; SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE1-NEXT: xorps %xmm2, %xmm2
-; SSE1-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE1-NEXT: retq
%a = load <2 x float>, <2 x float>* %ptr
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll?rev=325232&r1=325231&r2=325232&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll Thu Feb 15 04:14:15 2018
@@ -710,7 +710,7 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,6,1,0,3,7,7,1]
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,0,0,18446744073709551615,18446744073709551615,0,0]
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,0,0,0,0,0,0]
; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0
; AVX512F-NEXT: kmovw %k0, %eax
Modified: llvm/trunk/test/CodeGen/X86/vector-trunc.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-trunc.ll?rev=325232&r1=325231&r2=325232&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-trunc.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-trunc.ll Thu Feb 15 04:14:15 2018
@@ -54,61 +54,19 @@ entry:
}
define <8 x i32> @trunc8i64_8i32_ashr(<8 x i64> %a) {
-; SSE2-LABEL: trunc8i64_8i32_ashr:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3]
-; SSE2-NEXT: psrad $31, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[0,2]
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm4[0,2]
-; SSE2-NEXT: movaps %xmm2, %xmm1
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc8i64_8i32_ashr:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3]
-; SSSE3-NEXT: psrad $31, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
-; SSSE3-NEXT: psrad $31, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[0,2]
-; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm4[0,2]
-; SSSE3-NEXT: movaps %xmm2, %xmm1
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc8i64_8i32_ashr:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; SSE41-NEXT: psrad $31, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3],xmm4[4,5],xmm1[6,7]
-; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2]
-; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[0,2]
-; SSE41-NEXT: movaps %xmm2, %xmm1
-; SSE41-NEXT: retq
+; SSE-LABEL: trunc8i64_8i32_ashr:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
+; SSE-NEXT: movaps %xmm2, %xmm1
+; SSE-NEXT: retq
;
; AVX1-LABEL: trunc8i64_8i32_ashr:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpsrad $31, %xmm3, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
-; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[0,2]
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[0,2]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
More information about the llvm-commits
mailing list