[llvm] 943b556 - [LegalizeTypes][X86] Add a new strategy for type legalizing the f16 type that softens it to i16, but promotes to f32 around arithmetic ops.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Sat Feb 1 11:21:51 PST 2020
Author: Craig Topper
Date: 2020-02-01T11:21:04-08:00
New Revision: 943b5561d6a6fccee7fbaa8842074563f8b66927
URL: https://github.com/llvm/llvm-project/commit/943b5561d6a6fccee7fbaa8842074563f8b66927
DIFF: https://github.com/llvm/llvm-project/commit/943b5561d6a6fccee7fbaa8842074563f8b66927.diff
LOG: [LegalizeTypes][X86] Add a new strategy for type legalizing the f16 type that softens it to i16, but promotes to f32 around arithmetic ops.
This is based on this llvm-dev thread http://lists.llvm.org/pipermail/llvm-dev/2019-December/137521.html
The current strategy for f16 is to promote the type to float everywhere except where the specific width is required, such as loads, stores, and bitcasts. This results in rounding occurring in odd places instead of immediately after arithmetic operations. This interacts in weird ways with the __fp16 type in clang, which is a storage-only type where arithmetic is always promoted to float. InstCombine can remove some of the fpext/fptruncs around such arithmetic and turn it into arithmetic on half. This wouldn't be so bad if SelectionDAG were able to put those fpext/fptruncs back in when it promotes.
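For concreteness, here is a minimal C++ sketch of the __fp16 behavior described above (the function name is made up for illustration): the frontend extends each __fp16 operand to float, performs the arithmetic in float, and truncates the result back to half, which is exactly the fpext/fadd/fptrunc pattern that InstCombine may later fold into an fadd on half.

// Hypothetical example. __fp16 is a storage-only type, so for this function
// clang emits fpext half->float on each operand, an fadd on float, and an
// fptrunc float->half on the result.
__fp16 add_halves(__fp16 a, __fp16 b) {
  return a + b; // arithmetic is done in float; the result is rounded to half
}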
It is also not obvious how to make the existing strategy work with strict FP. We need to use the STRICT versions of the conversions, which require chain operands. But if the conversions are created for a bitcast, there is no place to get an appropriate chain from.
This patch implements a different strategy where conversions are emitted directly around arithmetic operations, and the value is otherwise passed around as an i16, including in arguments and return values. This can result in more conversions between arithmetic operations, but it is closer to matching the IR the frontend generates for __fp16. It will also allow us to use the chain from constrained arithmetic nodes to link the STRICT_FP_TO_FP16/STRICT_FP16_TO_FP nodes that will need to be added. I've set it up so that each target can opt into the new behavior; converting all the targets myself was more than I was able to handle.
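As a rough sketch of the opt-in (the MyTargetLowering class name is hypothetical; the softPromoteHalfType() hook and the X86 override are the ones added in this patch), a target that wants the new behavior overrides the hook, and the legalizer then keeps f16 values as i16 and wraps each floating point operation in FP16_TO_FP/FP_TO_FP16 conversions:

#include "llvm/CodeGen/TargetLowering.h"
using namespace llvm;

// Hypothetical target opting into the new legalization. The default
// implementation added to TargetLowering.h below returns false;
// X86ISelLowering.h overrides it to return true.
class MyTargetLowering : public TargetLowering {
public:
  explicit MyTargetLowering(const TargetMachine &TM) : TargetLowering(TM) {}

  // Pass f16 values around as i16 and promote to f32 only around arithmetic,
  // so e.g. an f16 FADD is legalized as FP16_TO_FP -> FADD (f32) -> FP_TO_FP16.
  bool softPromoteHalfType() const override { return true; }
};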
Differential Revision: https://reviews.llvm.org/D73749
Added:
Modified:
llvm/include/llvm/CodeGen/TargetLowering.h
llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
llvm/lib/CodeGen/TargetLoweringBase.cpp
llvm/lib/Target/X86/X86ISelLowering.h
llvm/test/CodeGen/X86/atomic-non-integer.ll
llvm/test/CodeGen/X86/avx512-insert-extract.ll
llvm/test/CodeGen/X86/avx512-masked_memop-16-8.ll
llvm/test/CodeGen/X86/avx512-vec-cmp.ll
llvm/test/CodeGen/X86/fmf-flags.ll
llvm/test/CodeGen/X86/half.ll
llvm/test/CodeGen/X86/mxcsr-reg-usage.ll
llvm/test/CodeGen/X86/pr31088.ll
llvm/test/CodeGen/X86/pr38533.ll
llvm/test/CodeGen/X86/shuffle-extract-subvector.ll
llvm/test/CodeGen/X86/vec_fp_to_int.ll
llvm/test/CodeGen/X86/vector-half-conversions.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 6542ff72b2ab..55967b32bce7 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -174,7 +174,8 @@ class TargetLoweringBase {
TypeScalarizeVector, // Replace this one-element vector with its element.
TypeSplitVector, // Split this vector into two of half the size.
TypeWidenVector, // This vector should be widened into a larger vector.
- TypePromoteFloat // Replace this float with a larger one.
+ TypePromoteFloat, // Replace this float with a larger one.
+ TypeSoftPromoteHalf, // Soften half to i16 and use float to do arithmetic.
};
/// LegalizeKind holds the legalization kind that needs to happen to EVT
@@ -374,6 +375,12 @@ class TargetLoweringBase {
return TypePromoteInteger;
}
+ // Return true if the half type should be passed around as i16, but promoted
+ // to float around arithmetic. The default behavior is to pass around as
+ // float and convert around loads/stores/bitcasts and other places where
+ // the size matters.
+ virtual bool softPromoteHalfType() const { return false; }
+
// There are two general methods for expanding a BUILD_VECTOR node:
// 1. Use SCALAR_TO_VECTOR on the defined scalar values and then shuffle
// them together.
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index f191160dee4f..12fef02267ec 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -2412,3 +2412,398 @@ SDValue DAGTypeLegalizer::BitcastToInt_ATOMIC_SWAP(SDNode *N) {
}
+//===----------------------------------------------------------------------===//
+// Half Result Soft Promotion
+//===----------------------------------------------------------------------===//
+
+void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) {
+ LLVM_DEBUG(dbgs() << "Soft promote half result " << ResNo << ": ";
+ N->dump(&DAG); dbgs() << "\n");
+ SDValue R = SDValue();
+
+ // See if the target wants to custom expand this node.
+ if (CustomLowerNode(N, N->getValueType(ResNo), true)) {
+ LLVM_DEBUG(dbgs() << "Node has been custom expanded, done\n");
+ return;
+ }
+
+ switch (N->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ dbgs() << "SoftPromoteHalfResult #" << ResNo << ": ";
+ N->dump(&DAG); dbgs() << "\n";
+#endif
+ llvm_unreachable("Do not know how to soft promote this operator's result!");
+
+ case ISD::BITCAST: R = SoftPromoteHalfRes_BITCAST(N); break;
+ case ISD::ConstantFP: R = SoftPromoteHalfRes_ConstantFP(N); break;
+ case ISD::EXTRACT_VECTOR_ELT:
+ R = SoftPromoteHalfRes_EXTRACT_VECTOR_ELT(N); break;
+ case ISD::FCOPYSIGN: R = SoftPromoteHalfRes_FCOPYSIGN(N); break;
+ case ISD::FP_ROUND: R = SoftPromoteHalfRes_FP_ROUND(N); break;
+
+ // Unary FP Operations
+ case ISD::FABS:
+ case ISD::FCBRT:
+ case ISD::FCEIL:
+ case ISD::FCOS:
+ case ISD::FEXP:
+ case ISD::FEXP2:
+ case ISD::FFLOOR:
+ case ISD::FLOG:
+ case ISD::FLOG2:
+ case ISD::FLOG10:
+ case ISD::FNEARBYINT:
+ case ISD::FNEG:
+ case ISD::FRINT:
+ case ISD::FROUND:
+ case ISD::FSIN:
+ case ISD::FSQRT:
+ case ISD::FTRUNC:
+ case ISD::FCANONICALIZE: R = SoftPromoteHalfRes_UnaryOp(N); break;
+
+ // Binary FP Operations
+ case ISD::FADD:
+ case ISD::FDIV:
+ case ISD::FMAXIMUM:
+ case ISD::FMINIMUM:
+ case ISD::FMAXNUM:
+ case ISD::FMINNUM:
+ case ISD::FMUL:
+ case ISD::FPOW:
+ case ISD::FREM:
+ case ISD::FSUB: R = SoftPromoteHalfRes_BinOp(N); break;
+
+ case ISD::FMA: // FMA is same as FMAD
+ case ISD::FMAD: R = SoftPromoteHalfRes_FMAD(N); break;
+
+ case ISD::FPOWI: R = SoftPromoteHalfRes_FPOWI(N); break;
+
+ case ISD::LOAD: R = SoftPromoteHalfRes_LOAD(N); break;
+ case ISD::SELECT: R = SoftPromoteHalfRes_SELECT(N); break;
+ case ISD::SELECT_CC: R = SoftPromoteHalfRes_SELECT_CC(N); break;
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP: R = SoftPromoteHalfRes_XINT_TO_FP(N); break;
+ case ISD::UNDEF: R = SoftPromoteHalfRes_UNDEF(N); break;
+ case ISD::ATOMIC_SWAP: R = BitcastToInt_ATOMIC_SWAP(N); break;
+ }
+
+ if (R.getNode())
+ SetSoftPromotedHalf(SDValue(N, ResNo), R);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_BITCAST(SDNode *N) {
+ return BitConvertToInteger(N->getOperand(0));
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_ConstantFP(SDNode *N) {
+ ConstantFPSDNode *CN = cast<ConstantFPSDNode>(N);
+
+ // Get the (bit-cast) APInt of the APFloat and build an integer constant
+ return DAG.getConstant(CN->getValueAPF().bitcastToAPInt(), SDLoc(CN),
+ MVT::i16);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_EXTRACT_VECTOR_ELT(SDNode *N) {
+ SDValue NewOp = BitConvertVectorToIntegerVector(N->getOperand(0));
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N),
+ NewOp.getValueType().getVectorElementType(), NewOp,
+ N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FCOPYSIGN(SDNode *N) {
+ SDValue LHS = GetSoftPromotedHalf(N->getOperand(0));
+ SDValue RHS = BitConvertToInteger(N->getOperand(1));
+ SDLoc dl(N);
+
+ EVT LVT = LHS.getValueType();
+ EVT RVT = RHS.getValueType();
+
+ unsigned LSize = LVT.getSizeInBits();
+ unsigned RSize = RVT.getSizeInBits();
+
+ // First get the sign bit of second operand.
+ SDValue SignBit = DAG.getNode(
+ ISD::SHL, dl, RVT, DAG.getConstant(1, dl, RVT),
+ DAG.getConstant(RSize - 1, dl,
+ TLI.getShiftAmountTy(RVT, DAG.getDataLayout())));
+ SignBit = DAG.getNode(ISD::AND, dl, RVT, RHS, SignBit);
+
+ // Shift right or sign-extend it if the two operands have different types.
+ int SizeDiff = RVT.getSizeInBits() - LVT.getSizeInBits();
+ if (SizeDiff > 0) {
+ SignBit =
+ DAG.getNode(ISD::SRL, dl, RVT, SignBit,
+ DAG.getConstant(SizeDiff, dl,
+ TLI.getShiftAmountTy(SignBit.getValueType(),
+ DAG.getDataLayout())));
+ SignBit = DAG.getNode(ISD::TRUNCATE, dl, LVT, SignBit);
+ } else if (SizeDiff < 0) {
+ SignBit = DAG.getNode(ISD::ANY_EXTEND, dl, LVT, SignBit);
+ SignBit =
+ DAG.getNode(ISD::SHL, dl, LVT, SignBit,
+ DAG.getConstant(-SizeDiff, dl,
+ TLI.getShiftAmountTy(SignBit.getValueType(),
+ DAG.getDataLayout())));
+ }
+
+ // Clear the sign bit of the first operand.
+ SDValue Mask = DAG.getNode(
+ ISD::SHL, dl, LVT, DAG.getConstant(1, dl, LVT),
+ DAG.getConstant(LSize - 1, dl,
+ TLI.getShiftAmountTy(LVT, DAG.getDataLayout())));
+ Mask = DAG.getNode(ISD::SUB, dl, LVT, Mask, DAG.getConstant(1, dl, LVT));
+ LHS = DAG.getNode(ISD::AND, dl, LVT, LHS, Mask);
+
+ // Or the value with the sign bit.
+ return DAG.getNode(ISD::OR, dl, LVT, LHS, SignBit);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FMAD(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Op0 = GetSoftPromotedHalf(N->getOperand(0));
+ SDValue Op1 = GetSoftPromotedHalf(N->getOperand(1));
+ SDValue Op2 = GetSoftPromotedHalf(N->getOperand(2));
+ SDLoc dl(N);
+
+ // Promote to the larger FP type.
+ Op0 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op0);
+ Op1 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op1);
+ Op2 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op2);
+
+ SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, Op0, Op1, Op2);
+
+ // Convert back to FP16 as an integer.
+ return DAG.getNode(ISD::FP_TO_FP16, dl, MVT::i16, Res);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FPOWI(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Op0 = GetSoftPromotedHalf(N->getOperand(0));
+ SDValue Op1 = N->getOperand(1);
+ SDLoc dl(N);
+
+ Op0 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op0);
+
+ SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, Op0, Op1);
+
+ // Convert back to FP16 as an integer.
+ return DAG.getNode(ISD::FP_TO_FP16, dl, MVT::i16, Res);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FP_ROUND(SDNode *N) {
+ return DAG.getNode(ISD::FP_TO_FP16, SDLoc(N), MVT::i16, N->getOperand(0));
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_LOAD(SDNode *N) {
+ LoadSDNode *L = cast<LoadSDNode>(N);
+
+ // Load the value as an integer value with the same number of bits.
+ assert(L->getExtensionType() == ISD::NON_EXTLOAD && "Unexpected extension!");
+ SDValue NewL =
+ DAG.getLoad(L->getAddressingMode(), L->getExtensionType(), MVT::i16,
+ SDLoc(N), L->getChain(), L->getBasePtr(), L->getOffset(),
+ L->getPointerInfo(), MVT::i16, L->getAlignment(),
+ L->getMemOperand()->getFlags(), L->getAAInfo());
+ // Legalize the chain result by replacing uses of the old value chain with the
+ // new one
+ ReplaceValueWith(SDValue(N, 1), NewL.getValue(1));
+ return NewL;
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_SELECT(SDNode *N) {
+ SDValue Op1 = GetSoftPromotedHalf(N->getOperand(1));
+ SDValue Op2 = GetSoftPromotedHalf(N->getOperand(2));
+ return DAG.getSelect(SDLoc(N), Op1.getValueType(), N->getOperand(0), Op1,
+ Op2);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_SELECT_CC(SDNode *N) {
+ SDValue Op2 = GetSoftPromotedHalf(N->getOperand(2));
+ SDValue Op3 = GetSoftPromotedHalf(N->getOperand(3));
+ return DAG.getNode(ISD::SELECT_CC, SDLoc(N), Op2.getValueType(),
+ N->getOperand(0), N->getOperand(1), Op2, Op3,
+ N->getOperand(4));
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_XINT_TO_FP(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDLoc dl(N);
+
+ SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0));
+
+ // Round the value to the softened type.
+ return DAG.getNode(ISD::FP_TO_FP16, dl, MVT::i16, Res);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_UNDEF(SDNode *N) {
+ return DAG.getUNDEF(MVT::i16);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_UnaryOp(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Op = GetSoftPromotedHalf(N->getOperand(0));
+ SDLoc dl(N);
+
+ // Promote to the larger FP type.
+ Op = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op);
+
+ SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, Op);
+
+ // Convert back to FP16 as an integer.
+ return DAG.getNode(ISD::FP_TO_FP16, dl, MVT::i16, Res);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_BinOp(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Op0 = GetSoftPromotedHalf(N->getOperand(0));
+ SDValue Op1 = GetSoftPromotedHalf(N->getOperand(1));
+ SDLoc dl(N);
+
+ // Promote to the larger FP type.
+ Op0 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op0);
+ Op1 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op1);
+
+ SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, Op0, Op1);
+
+ // Convert back to FP16 as an integer.
+ return DAG.getNode(ISD::FP_TO_FP16, dl, MVT::i16, Res);
+}
+
+//===----------------------------------------------------------------------===//
+// Half Operand Soft Promotion
+//===----------------------------------------------------------------------===//
+
+bool DAGTypeLegalizer::SoftPromoteHalfOperand(SDNode *N, unsigned OpNo) {
+ LLVM_DEBUG(dbgs() << "Soft promote half operand " << OpNo << ": ";
+ N->dump(&DAG); dbgs() << "\n");
+ SDValue Res = SDValue();
+
+ if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false)) {
+ LLVM_DEBUG(dbgs() << "Node has been custom lowered, done\n");
+ return false;
+ }
+
+ // Nodes that use a promotion-requiring floating point operand, but don't
+ // produce a soft promotion-requiring floating point result, need to be
+ // legalized to use the soft promoted float operand. Nodes that produce at
+ // least one soft promotion-requiring floating point result have their
+ // operands legalized as a part of PromoteFloatResult.
+ switch (N->getOpcode()) {
+ default:
+ #ifndef NDEBUG
+ dbgs() << "SoftPromoteHalfOperand Op #" << OpNo << ": ";
+ N->dump(&DAG); dbgs() << "\n";
+ #endif
+ llvm_unreachable("Do not know how to soft promote this operator's operand!");
+
+ case ISD::BITCAST: Res = SoftPromoteHalfOp_BITCAST(N); break;
+ case ISD::FCOPYSIGN: Res = SoftPromoteHalfOp_FCOPYSIGN(N, OpNo); break;
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT: Res = SoftPromoteHalfOp_FP_TO_XINT(N); break;
+ case ISD::FP_EXTEND: Res = SoftPromoteHalfOp_FP_EXTEND(N); break;
+ case ISD::SELECT_CC: Res = SoftPromoteHalfOp_SELECT_CC(N, OpNo); break;
+ case ISD::SETCC: Res = SoftPromoteHalfOp_SETCC(N); break;
+ case ISD::STORE: Res = SoftPromoteHalfOp_STORE(N, OpNo); break;
+ }
+
+ if (!Res.getNode())
+ return false;
+
+ assert(Res.getNode() != N && "Expected a new node!");
+
+ assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
+ "Invalid operand expansion");
+
+ ReplaceValueWith(SDValue(N, 0), Res);
+ return false;
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfOp_BITCAST(SDNode *N) {
+ SDValue Op0 = GetSoftPromotedHalf(N->getOperand(0));
+
+ return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0), Op0);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfOp_FCOPYSIGN(SDNode *N,
+ unsigned OpNo) {
+ assert(OpNo == 1 && "Only Operand 1 must need promotion here");
+ SDValue Op1 = N->getOperand(1);
+ SDLoc dl(N);
+
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Op1.getValueType());
+
+ Op1 = GetSoftPromotedHalf(Op1);
+ Op1 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op1);
+
+ return DAG.getNode(N->getOpcode(), dl, N->getValueType(0), N->getOperand(0),
+ Op1);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfOp_FP_EXTEND(SDNode *N) {
+ SDValue Op = GetSoftPromotedHalf(N->getOperand(0));
+ return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), N->getValueType(0), Op);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfOp_FP_TO_XINT(SDNode *N) {
+ SDValue Op = N->getOperand(0);
+ SDLoc dl(N);
+
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType());
+
+ Op = GetSoftPromotedHalf(Op);
+
+ SDValue Res = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op);
+
+ return DAG.getNode(N->getOpcode(), dl, N->getValueType(0), Res);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfOp_SELECT_CC(SDNode *N,
+ unsigned OpNo) {
+ assert(OpNo == 0 && "Can only soften the comparison values");
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ SDLoc dl(N);
+
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Op0.getValueType());
+
+ Op0 = GetSoftPromotedHalf(Op0);
+ Op1 = GetSoftPromotedHalf(Op1);
+
+ // Promote to the larger FP type.
+ Op0 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op0);
+ Op1 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op1);
+
+ return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0), Op0, Op1,
+ N->getOperand(2), N->getOperand(3), N->getOperand(4));
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfOp_SETCC(SDNode *N) {
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ SDLoc dl(N);
+
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Op0.getValueType());
+
+ Op0 = GetSoftPromotedHalf(Op0);
+ Op1 = GetSoftPromotedHalf(Op1);
+
+ // Promote to the larger FP type.
+ Op0 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op0);
+ Op1 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op1);
+
+ return DAG.getSetCC(SDLoc(N), N->getValueType(0), Op0, Op1, CCCode);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfOp_STORE(SDNode *N, unsigned OpNo) {
+ assert(OpNo == 1 && "Can only soften the stored value!");
+ StoreSDNode *ST = cast<StoreSDNode>(N);
+ SDValue Val = ST->getValue();
+ SDLoc dl(N);
+
+ assert(!ST->isTruncatingStore() && "Unexpected truncating store.");
+ SDValue Promoted = GetSoftPromotedHalf(Val);
+ return DAG.getStore(ST->getChain(), dl, Promoted, ST->getBasePtr(),
+ ST->getMemOperand());
+}
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 89edddb42be1..1c6ed3d48f47 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -304,6 +304,9 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BITCAST(SDNode *N) {
case TargetLowering::TypeSoftenFloat:
// Promote the integer operand by hand.
return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, GetSoftenedFloat(InOp));
+ case TargetLowering::TypeSoftPromoteHalf:
+ // Promote the integer operand by hand.
+ return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, GetSoftPromotedHalf(InOp));
case TargetLowering::TypePromoteFloat: {
// Convert the promoted float by hand.
if (!NOutVT.isVector())
@@ -2689,6 +2692,12 @@ void DAGTypeLegalizer::ExpandIntRes_FP_TO_SINT(SDNode *N, SDValue &Lo,
if (getTypeAction(Op.getValueType()) == TargetLowering::TypePromoteFloat)
Op = GetPromotedFloat(Op);
+ if (getTypeAction(Op.getValueType()) == TargetLowering::TypeSoftPromoteHalf) {
+ EVT NFPVT = TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType());
+ Op = GetSoftPromotedHalf(Op);
+ Op = DAG.getNode(ISD::FP16_TO_FP, dl, NFPVT, Op);
+ }
+
RTLIB::Libcall LC = RTLIB::getFPTOSINT(Op.getValueType(), VT);
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-sint conversion!");
TargetLowering::MakeLibCallOptions CallOptions;
@@ -2712,6 +2721,12 @@ void DAGTypeLegalizer::ExpandIntRes_FP_TO_UINT(SDNode *N, SDValue &Lo,
if (getTypeAction(Op.getValueType()) == TargetLowering::TypePromoteFloat)
Op = GetPromotedFloat(Op);
+ if (getTypeAction(Op.getValueType()) == TargetLowering::TypeSoftPromoteHalf) {
+ EVT NFPVT = TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType());
+ Op = GetSoftPromotedHalf(Op);
+ Op = DAG.getNode(ISD::FP16_TO_FP, dl, NFPVT, Op);
+ }
+
RTLIB::Libcall LC = RTLIB::getFPTOUINT(Op.getValueType(), VT);
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-uint conversion!");
TargetLowering::MakeLibCallOptions CallOptions;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index 63ddb59fce68..417fbe77ea00 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -124,6 +124,8 @@ void DAGTypeLegalizer::PerformExpensiveChecks() {
Mapped |= 128;
if (ResId && PromotedFloats.find(ResId) != PromotedFloats.end())
Mapped |= 256;
+ if (ResId && SoftPromotedHalfs.find(ResId) != SoftPromotedHalfs.end())
+ Mapped |= 512;
if (Node.getNodeId() != Processed) {
// Since we allow ReplacedValues to map deleted nodes, it may map nodes
@@ -168,6 +170,8 @@ void DAGTypeLegalizer::PerformExpensiveChecks() {
dbgs() << " WidenedVectors";
if (Mapped & 256)
dbgs() << " PromotedFloats";
+ if (Mapped & 512)
+ dbgs() << " SoftPromoteHalfs";
dbgs() << "\n";
llvm_unreachable(nullptr);
}
@@ -276,6 +280,10 @@ bool DAGTypeLegalizer::run() {
PromoteFloatResult(N, i);
Changed = true;
goto NodeDone;
+ case TargetLowering::TypeSoftPromoteHalf:
+ SoftPromoteHalfResult(N, i);
+ Changed = true;
+ goto NodeDone;
}
}
@@ -332,6 +340,10 @@ bool DAGTypeLegalizer::run() {
NeedsReanalyzing = PromoteFloatOperand(N, i);
Changed = true;
break;
+ case TargetLowering::TypeSoftPromoteHalf:
+ NeedsReanalyzing = SoftPromoteHalfOperand(N, i);
+ Changed = true;
+ break;
}
break;
}
@@ -719,6 +731,16 @@ void DAGTypeLegalizer::SetPromotedFloat(SDValue Op, SDValue Result) {
OpIdEntry = getTableId(Result);
}
+void DAGTypeLegalizer::SetSoftPromotedHalf(SDValue Op, SDValue Result) {
+ assert(Result.getValueType() == MVT::i16 &&
+ "Invalid type for soft-promoted half");
+ AnalyzeNewValue(Result);
+
+ auto &OpIdEntry = SoftPromotedHalfs[getTableId(Op)];
+ assert((OpIdEntry == 0) && "Node is already promoted!");
+ OpIdEntry = getTableId(Result);
+}
+
void DAGTypeLegalizer::SetScalarizedVector(SDValue Op, SDValue Result) {
// Note that in some cases vector operation operands may be greater than
// the vector element type. For example BUILD_VECTOR of type <1 x i1> with
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 10351ca5fcbf..8126c42e4498 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -109,6 +109,10 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
/// supported precision, this map indicates what promoted value to use.
SmallDenseMap<TableId, TableId, 8> PromotedFloats;
+ /// For floating-point nodes that have a smaller precision than the smallest
+ /// supported precision, this map indicates the converted value to use.
+ SmallDenseMap<TableId, TableId, 8> SoftPromotedHalfs;
+
/// For float nodes that need to be expanded this map indicates which operands
/// are the expanded version of the input.
SmallDenseMap<TableId, std::pair<TableId, TableId>, 8> ExpandedFloats;
@@ -186,6 +190,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
ExpandedIntegers.erase(OldId);
SoftenedFloats.erase(OldId);
PromotedFloats.erase(OldId);
+ SoftPromotedHalfs.erase(OldId);
ExpandedFloats.erase(OldId);
ScalarizedVectors.erase(OldId);
SplitVectors.erase(OldId);
@@ -651,6 +656,43 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
SDValue PromoteFloatOp_SELECT_CC(SDNode *N, unsigned OpNo);
SDValue PromoteFloatOp_SETCC(SDNode *N, unsigned OpNo);
+ //===--------------------------------------------------------------------===//
+ // Half soft promotion support: LegalizeFloatTypes.cpp
+ //===--------------------------------------------------------------------===//
+
+ SDValue GetSoftPromotedHalf(SDValue Op) {
+ TableId &PromotedId = SoftPromotedHalfs[getTableId(Op)];
+ SDValue PromotedOp = getSDValue(PromotedId);
+ assert(PromotedOp.getNode() && "Operand wasn't promoted?");
+ return PromotedOp;
+ }
+ void SetSoftPromotedHalf(SDValue Op, SDValue Result);
+
+ void SoftPromoteHalfResult(SDNode *N, unsigned ResNo);
+ SDValue SoftPromoteHalfRes_BinOp(SDNode *N);
+ SDValue SoftPromoteHalfRes_BITCAST(SDNode *N);
+ SDValue SoftPromoteHalfRes_ConstantFP(SDNode *N);
+ SDValue SoftPromoteHalfRes_EXTRACT_VECTOR_ELT(SDNode *N);
+ SDValue SoftPromoteHalfRes_FCOPYSIGN(SDNode *N);
+ SDValue SoftPromoteHalfRes_FMAD(SDNode *N);
+ SDValue SoftPromoteHalfRes_FPOWI(SDNode *N);
+ SDValue SoftPromoteHalfRes_FP_ROUND(SDNode *N);
+ SDValue SoftPromoteHalfRes_LOAD(SDNode *N);
+ SDValue SoftPromoteHalfRes_SELECT(SDNode *N);
+ SDValue SoftPromoteHalfRes_SELECT_CC(SDNode *N);
+ SDValue SoftPromoteHalfRes_UnaryOp(SDNode *N);
+ SDValue SoftPromoteHalfRes_XINT_TO_FP(SDNode *N);
+ SDValue SoftPromoteHalfRes_UNDEF(SDNode *N);
+
+ bool SoftPromoteHalfOperand(SDNode *N, unsigned OpNo);
+ SDValue SoftPromoteHalfOp_BITCAST(SDNode *N);
+ SDValue SoftPromoteHalfOp_FCOPYSIGN(SDNode *N, unsigned OpNo);
+ SDValue SoftPromoteHalfOp_FP_EXTEND(SDNode *N);
+ SDValue SoftPromoteHalfOp_FP_TO_XINT(SDNode *N);
+ SDValue SoftPromoteHalfOp_SETCC(SDNode *N);
+ SDValue SoftPromoteHalfOp_SELECT_CC(SDNode *N, unsigned OpNo);
+ SDValue SoftPromoteHalfOp_STORE(SDNode *N, unsigned OpNo);
+
//===--------------------------------------------------------------------===//
// Scalarization Support: LegalizeVectorTypes.cpp
//===--------------------------------------------------------------------===//
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
index a11319ce85cb..ad3e02f9921a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -50,6 +50,7 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
case TargetLowering::TypePromoteInteger:
break;
case TargetLowering::TypePromoteFloat:
+ case TargetLowering::TypeSoftPromoteHalf:
llvm_unreachable("Bitcast of a promotion-needing float should never need"
"expansion");
case TargetLowering::TypeSoftenFloat:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 5de71aebb279..a8e98ebf1540 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1025,6 +1025,7 @@ void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo,
case TargetLowering::TypeLegal:
case TargetLowering::TypePromoteInteger:
case TargetLowering::TypePromoteFloat:
+ case TargetLowering::TypeSoftPromoteHalf:
case TargetLowering::TypeSoftenFloat:
case TargetLowering::TypeScalarizeVector:
case TargetLowering::TypeWidenVector:
@@ -3468,6 +3469,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) {
}
case TargetLowering::TypeSoftenFloat:
case TargetLowering::TypePromoteFloat:
+ case TargetLowering::TypeSoftPromoteHalf:
case TargetLowering::TypeExpandInteger:
case TargetLowering::TypeExpandFloat:
case TargetLowering::TypeScalarizeVector:
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index e40e81631ca3..e65960e1281d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -482,9 +482,14 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
// Handle cases such as i8 -> <1 x i1>
EVT ValueSVT = ValueVT.getVectorElementType();
- if (ValueVT.getVectorNumElements() == 1 && ValueSVT != PartEVT)
- Val = ValueVT.isFloatingPoint() ? DAG.getFPExtendOrRound(Val, DL, ValueSVT)
- : DAG.getAnyExtOrTrunc(Val, DL, ValueSVT);
+ if (ValueVT.getVectorNumElements() == 1 && ValueSVT != PartEVT) {
+ if (ValueSVT.getSizeInBits() == PartEVT.getSizeInBits())
+ Val = DAG.getNode(ISD::BITCAST, DL, ValueSVT, Val);
+ else
+ Val = ValueVT.isFloatingPoint()
+ ? DAG.getFPExtendOrRound(Val, DL, ValueSVT)
+ : DAG.getAnyExtOrTrunc(Val, DL, ValueSVT);
+ }
return DAG.getBuildVector(ValueVT, DL, Val);
}
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 762a06176bdc..3a3bb8cd2405 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -811,6 +811,7 @@ TargetLoweringBase::getTypeConversion(LLVMContext &Context, EVT VT) const {
LegalizeTypeAction LA = ValueTypeActions.getTypeAction(SVT);
assert((LA == TypeLegal || LA == TypeSoftenFloat ||
+ LA == TypeSoftPromoteHalf ||
(NVT.isVector() ||
ValueTypeActions.getTypeAction(NVT) != TypePromoteInteger)) &&
"Promote may not follow Expand or Promote");
@@ -1229,10 +1230,18 @@ void TargetLoweringBase::computeRegisterProperties(
// promote it to f32, because there are no f16 library calls (except for
// conversions).
if (!isTypeLegal(MVT::f16)) {
- NumRegistersForVT[MVT::f16] = NumRegistersForVT[MVT::f32];
- RegisterTypeForVT[MVT::f16] = RegisterTypeForVT[MVT::f32];
- TransformToType[MVT::f16] = MVT::f32;
- ValueTypeActions.setTypeAction(MVT::f16, TypePromoteFloat);
+ // Allow targets to control how we legalize half.
+ if (softPromoteHalfType()) {
+ NumRegistersForVT[MVT::f16] = NumRegistersForVT[MVT::i16];
+ RegisterTypeForVT[MVT::f16] = RegisterTypeForVT[MVT::i16];
+ TransformToType[MVT::f16] = MVT::f32;
+ ValueTypeActions.setTypeAction(MVT::f16, TypeSoftPromoteHalf);
+ } else {
+ NumRegistersForVT[MVT::f16] = NumRegistersForVT[MVT::f32];
+ RegisterTypeForVT[MVT::f16] = RegisterTypeForVT[MVT::f32];
+ TransformToType[MVT::f16] = MVT::f32;
+ ValueTypeActions.setTypeAction(MVT::f16, TypePromoteFloat);
+ }
}
// Loop over all of the vector value types to see which need transformations.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 860bef4ef4d9..c14ccc901ecf 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1226,6 +1226,8 @@ namespace llvm {
/// Customize the preferred legalization strategy for certain types.
LegalizeTypeAction getPreferredVectorAction(MVT VT) const override;
+ bool softPromoteHalfType() const override { return true; }
+
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
EVT VT) const override;
diff --git a/llvm/test/CodeGen/X86/atomic-non-integer.ll b/llvm/test/CodeGen/X86/atomic-non-integer.ll
index 8fd96b749a39..d3aae068dd60 100644
--- a/llvm/test/CodeGen/X86/atomic-non-integer.ll
+++ b/llvm/test/CodeGen/X86/atomic-non-integer.ll
@@ -16,99 +16,17 @@
; and their calling convention which remain unresolved.)
define void @store_half(half* %fptr, half %v) {
-; X86-SSE-LABEL: store_half:
-; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: pushl %esi
-; X86-SSE-NEXT: .cfi_def_cfa_offset 8
-; X86-SSE-NEXT: subl $8, %esp
-; X86-SSE-NEXT: .cfi_def_cfa_offset 16
-; X86-SSE-NEXT: .cfi_offset %esi, -8
-; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: movss %xmm0, (%esp)
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-SSE-NEXT: calll __gnu_f2h_ieee
-; X86-SSE-NEXT: movw %ax, (%esi)
-; X86-SSE-NEXT: addl $8, %esp
-; X86-SSE-NEXT: .cfi_def_cfa_offset 8
-; X86-SSE-NEXT: popl %esi
-; X86-SSE-NEXT: .cfi_def_cfa_offset 4
-; X86-SSE-NEXT: retl
-;
-; X86-AVX1-LABEL: store_half:
-; X86-AVX1: # %bb.0:
-; X86-AVX1-NEXT: pushl %esi
-; X86-AVX1-NEXT: .cfi_def_cfa_offset 8
-; X86-AVX1-NEXT: subl $8, %esp
-; X86-AVX1-NEXT: .cfi_def_cfa_offset 16
-; X86-AVX1-NEXT: .cfi_offset %esi, -8
-; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-AVX1-NEXT: calll __gnu_f2h_ieee
-; X86-AVX1-NEXT: movw %ax, (%esi)
-; X86-AVX1-NEXT: addl $8, %esp
-; X86-AVX1-NEXT: .cfi_def_cfa_offset 8
-; X86-AVX1-NEXT: popl %esi
-; X86-AVX1-NEXT: .cfi_def_cfa_offset 4
-; X86-AVX1-NEXT: retl
-;
-; X86-AVX512-LABEL: store_half:
-; X86-AVX512: # %bb.0:
-; X86-AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; X86-AVX512-NEXT: vmovd %xmm0, %eax
-; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512-NEXT: movw %ax, (%ecx)
-; X86-AVX512-NEXT: retl
-;
-; X86-NOSSE-LABEL: store_half:
-; X86-NOSSE: # %bb.0:
-; X86-NOSSE-NEXT: pushl %esi
-; X86-NOSSE-NEXT: .cfi_def_cfa_offset 8
-; X86-NOSSE-NEXT: subl $8, %esp
-; X86-NOSSE-NEXT: .cfi_def_cfa_offset 16
-; X86-NOSSE-NEXT: .cfi_offset %esi, -8
-; X86-NOSSE-NEXT: flds {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT: fstps (%esp)
-; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NOSSE-NEXT: calll __gnu_f2h_ieee
-; X86-NOSSE-NEXT: movw %ax, (%esi)
-; X86-NOSSE-NEXT: addl $8, %esp
-; X86-NOSSE-NEXT: .cfi_def_cfa_offset 8
-; X86-NOSSE-NEXT: popl %esi
-; X86-NOSSE-NEXT: .cfi_def_cfa_offset 4
-; X86-NOSSE-NEXT: retl
-;
-; X64-SSE-LABEL: store_half:
-; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: pushq %rbx
-; X64-SSE-NEXT: .cfi_def_cfa_offset 16
-; X64-SSE-NEXT: .cfi_offset %rbx, -16
-; X64-SSE-NEXT: movq %rdi, %rbx
-; X64-SSE-NEXT: callq __gnu_f2h_ieee
-; X64-SSE-NEXT: movw %ax, (%rbx)
-; X64-SSE-NEXT: popq %rbx
-; X64-SSE-NEXT: .cfi_def_cfa_offset 8
-; X64-SSE-NEXT: retq
+; X86-LABEL: store_half:
+; X86: # %bb.0:
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movw %ax, (%ecx)
+; X86-NEXT: retl
;
-; X64-AVX1-LABEL: store_half:
-; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: pushq %rbx
-; X64-AVX1-NEXT: .cfi_def_cfa_offset 16
-; X64-AVX1-NEXT: .cfi_offset %rbx, -16
-; X64-AVX1-NEXT: movq %rdi, %rbx
-; X64-AVX1-NEXT: callq __gnu_f2h_ieee
-; X64-AVX1-NEXT: movw %ax, (%rbx)
-; X64-AVX1-NEXT: popq %rbx
-; X64-AVX1-NEXT: .cfi_def_cfa_offset 8
-; X64-AVX1-NEXT: retq
-;
-; X64-AVX512-LABEL: store_half:
-; X64-AVX512: # %bb.0:
-; X64-AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; X64-AVX512-NEXT: vmovd %xmm0, %eax
-; X64-AVX512-NEXT: movw %ax, (%rdi)
-; X64-AVX512-NEXT: retq
+; X64-LABEL: store_half:
+; X64: # %bb.0:
+; X64-NEXT: movw %si, (%rdi)
+; X64-NEXT: retq
store atomic half %v, half* %fptr unordered, align 2
ret void
}
@@ -302,82 +220,16 @@ define void @store_fp128(fp128* %fptr, fp128 %v) {
}
define half @load_half(half* %fptr) {
-; X86-SSE-LABEL: load_half:
-; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: subl $12, %esp
-; X86-SSE-NEXT: .cfi_def_cfa_offset 16
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT: movzwl (%eax), %eax
-; X86-SSE-NEXT: movl %eax, (%esp)
-; X86-SSE-NEXT: calll __gnu_h2f_ieee
-; X86-SSE-NEXT: addl $12, %esp
-; X86-SSE-NEXT: .cfi_def_cfa_offset 4
-; X86-SSE-NEXT: retl
-;
-; X86-AVX1-LABEL: load_half:
-; X86-AVX1: # %bb.0:
-; X86-AVX1-NEXT: subl $12, %esp
-; X86-AVX1-NEXT: .cfi_def_cfa_offset 16
-; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX1-NEXT: movzwl (%eax), %eax
-; X86-AVX1-NEXT: movl %eax, (%esp)
-; X86-AVX1-NEXT: calll __gnu_h2f_ieee
-; X86-AVX1-NEXT: addl $12, %esp
-; X86-AVX1-NEXT: .cfi_def_cfa_offset 4
-; X86-AVX1-NEXT: retl
-;
-; X86-AVX512-LABEL: load_half:
-; X86-AVX512: # %bb.0:
-; X86-AVX512-NEXT: pushl %eax
-; X86-AVX512-NEXT: .cfi_def_cfa_offset 8
-; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT: movswl (%eax), %eax
-; X86-AVX512-NEXT: vmovd %eax, %xmm0
-; X86-AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; X86-AVX512-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX512-NEXT: flds (%esp)
-; X86-AVX512-NEXT: popl %eax
-; X86-AVX512-NEXT: .cfi_def_cfa_offset 4
-; X86-AVX512-NEXT: retl
-;
-; X86-NOSSE-LABEL: load_half:
-; X86-NOSSE: # %bb.0:
-; X86-NOSSE-NEXT: subl $12, %esp
-; X86-NOSSE-NEXT: .cfi_def_cfa_offset 16
-; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NOSSE-NEXT: movzwl (%eax), %eax
-; X86-NOSSE-NEXT: movl %eax, (%esp)
-; X86-NOSSE-NEXT: calll __gnu_h2f_ieee
-; X86-NOSSE-NEXT: addl $12, %esp
-; X86-NOSSE-NEXT: .cfi_def_cfa_offset 4
-; X86-NOSSE-NEXT: retl
-;
-; X64-SSE-LABEL: load_half:
-; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: pushq %rax
-; X64-SSE-NEXT: .cfi_def_cfa_offset 16
-; X64-SSE-NEXT: movzwl (%rdi), %edi
-; X64-SSE-NEXT: callq __gnu_h2f_ieee
-; X64-SSE-NEXT: popq %rax
-; X64-SSE-NEXT: .cfi_def_cfa_offset 8
-; X64-SSE-NEXT: retq
+; X86-LABEL: load_half:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movzwl (%eax), %eax
+; X86-NEXT: retl
;
-; X64-AVX1-LABEL: load_half:
-; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: pushq %rax
-; X64-AVX1-NEXT: .cfi_def_cfa_offset 16
-; X64-AVX1-NEXT: movzwl (%rdi), %edi
-; X64-AVX1-NEXT: callq __gnu_h2f_ieee
-; X64-AVX1-NEXT: popq %rax
-; X64-AVX1-NEXT: .cfi_def_cfa_offset 8
-; X64-AVX1-NEXT: retq
-;
-; X64-AVX512-LABEL: load_half:
-; X64-AVX512: # %bb.0:
-; X64-AVX512-NEXT: movswl (%rdi), %eax
-; X64-AVX512-NEXT: vmovd %eax, %xmm0
-; X64-AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; X64-AVX512-NEXT: retq
+; X64-LABEL: load_half:
+; X64: # %bb.0:
+; X64-NEXT: movzwl (%rdi), %eax
+; X64-NEXT: retq
%v = load atomic half, half* %fptr unordered, align 2
ret half %v
}
diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
index 9fdfeebda7f5..9b6baf217564 100644
--- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
@@ -2264,96 +2264,100 @@ define i128 @test_insertelement_variable_v128i1(<128 x i8> %a, i8 %b, i32 %index
define void @test_concat_v2i1(<2 x half>* %arg, <2 x half>* %arg1, <2 x half>* %arg2) {
; KNL-LABEL: test_concat_v2i1:
; KNL: ## %bb.0:
-; KNL-NEXT: movswl (%rdi), %eax
+; KNL-NEXT: movswl 2(%rdi), %eax
; KNL-NEXT: vmovd %eax, %xmm0
; KNL-NEXT: vcvtph2ps %xmm0, %xmm0
-; KNL-NEXT: movswl 2(%rdi), %eax
-; KNL-NEXT: vmovd %eax, %xmm1
-; KNL-NEXT: vcvtph2ps %xmm1, %xmm1
-; KNL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; KNL-NEXT: vucomiss %xmm2, %xmm1
+; KNL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; KNL-NEXT: vucomiss %xmm1, %xmm0
; KNL-NEXT: setb %al
; KNL-NEXT: kmovw %eax, %k0
; KNL-NEXT: kshiftlw $1, %k0, %k0
-; KNL-NEXT: vucomiss %xmm2, %xmm0
+; KNL-NEXT: movswl (%rdi), %eax
+; KNL-NEXT: vmovd %eax, %xmm2
+; KNL-NEXT: vcvtph2ps %xmm2, %xmm2
+; KNL-NEXT: vucomiss %xmm1, %xmm2
; KNL-NEXT: setb %al
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: korw %k0, %k1, %k0
-; KNL-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; KNL-NEXT: vucomiss %xmm2, %xmm1
+; KNL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; KNL-NEXT: vucomiss %xmm1, %xmm0
; KNL-NEXT: seta %al
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: kshiftlw $1, %k1, %k1
-; KNL-NEXT: vucomiss %xmm2, %xmm0
+; KNL-NEXT: vucomiss %xmm1, %xmm2
; KNL-NEXT: seta %al
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: kmovw %eax, %k2
; KNL-NEXT: korw %k1, %k2, %k1
-; KNL-NEXT: kandw %k1, %k0, %k1
-; KNL-NEXT: kshiftrw $1, %k1, %k2
-; KNL-NEXT: movswl (%rsi), %eax
-; KNL-NEXT: vmovd %eax, %xmm0
-; KNL-NEXT: vcvtph2ps %xmm0, %xmm0
-; KNL-NEXT: movswl 2(%rsi), %eax
-; KNL-NEXT: vmovd %eax, %xmm1
-; KNL-NEXT: vcvtph2ps %xmm1, %xmm1
-; KNL-NEXT: vmovss %xmm1, %xmm1, %xmm1 {%k2} {z}
-; KNL-NEXT: vmovss %xmm0, %xmm0, %xmm0 {%k1} {z}
-; KNL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; KNL-NEXT: vmovd %xmm0, %eax
-; KNL-NEXT: movw %ax, (%rdx)
-; KNL-NEXT: vcvtps2ph $4, %xmm1, %xmm0
-; KNL-NEXT: vmovd %xmm0, %eax
+; KNL-NEXT: kandw %k1, %k0, %k0
+; KNL-NEXT: kshiftrw $1, %k0, %k1
+; KNL-NEXT: kmovw %k1, %edi
+; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: testb $1, %cl
+; KNL-NEXT: movl $0, %ecx
+; KNL-NEXT: je LBB85_2
+; KNL-NEXT: ## %bb.1:
+; KNL-NEXT: movzwl (%rsi), %ecx
+; KNL-NEXT: LBB85_2:
+; KNL-NEXT: testb $1, %dil
+; KNL-NEXT: je LBB85_4
+; KNL-NEXT: ## %bb.3:
+; KNL-NEXT: movzwl 2(%rsi), %eax
+; KNL-NEXT: LBB85_4:
; KNL-NEXT: movw %ax, 2(%rdx)
+; KNL-NEXT: movw %cx, (%rdx)
; KNL-NEXT: retq
;
; SKX-LABEL: test_concat_v2i1:
; SKX: ## %bb.0:
-; SKX-NEXT: movswl (%rdi), %eax
+; SKX-NEXT: movswl 2(%rdi), %eax
; SKX-NEXT: vmovd %eax, %xmm0
; SKX-NEXT: vcvtph2ps %xmm0, %xmm0
-; SKX-NEXT: movswl 2(%rdi), %eax
-; SKX-NEXT: vmovd %eax, %xmm1
-; SKX-NEXT: vcvtph2ps %xmm1, %xmm1
-; SKX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SKX-NEXT: vucomiss %xmm2, %xmm1
+; SKX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SKX-NEXT: vucomiss %xmm1, %xmm0
; SKX-NEXT: setb %al
; SKX-NEXT: kmovd %eax, %k0
; SKX-NEXT: kshiftlb $1, %k0, %k0
-; SKX-NEXT: vucomiss %xmm2, %xmm0
+; SKX-NEXT: movswl (%rdi), %eax
+; SKX-NEXT: vmovd %eax, %xmm2
+; SKX-NEXT: vcvtph2ps %xmm2, %xmm2
+; SKX-NEXT: vucomiss %xmm1, %xmm2
; SKX-NEXT: setb %al
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: kshiftlb $7, %k1, %k1
; SKX-NEXT: kshiftrb $7, %k1, %k1
; SKX-NEXT: korw %k0, %k1, %k0
-; SKX-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; SKX-NEXT: vucomiss %xmm2, %xmm1
+; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; SKX-NEXT: vucomiss %xmm1, %xmm0
; SKX-NEXT: seta %al
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: kshiftlb $1, %k1, %k1
-; SKX-NEXT: vucomiss %xmm2, %xmm0
+; SKX-NEXT: vucomiss %xmm1, %xmm2
; SKX-NEXT: seta %al
; SKX-NEXT: kmovd %eax, %k2
; SKX-NEXT: kshiftlb $7, %k2, %k2
; SKX-NEXT: kshiftrb $7, %k2, %k2
; SKX-NEXT: korw %k1, %k2, %k1
-; SKX-NEXT: kandw %k1, %k0, %k1
-; SKX-NEXT: kshiftrb $1, %k1, %k2
-; SKX-NEXT: movswl (%rsi), %eax
-; SKX-NEXT: vmovd %eax, %xmm0
-; SKX-NEXT: vcvtph2ps %xmm0, %xmm0
-; SKX-NEXT: movswl 2(%rsi), %eax
-; SKX-NEXT: vmovd %eax, %xmm1
-; SKX-NEXT: vcvtph2ps %xmm1, %xmm1
-; SKX-NEXT: vmovss %xmm1, %xmm1, %xmm1 {%k2} {z}
-; SKX-NEXT: vmovss %xmm0, %xmm0, %xmm0 {%k1} {z}
-; SKX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; SKX-NEXT: vmovd %xmm0, %eax
-; SKX-NEXT: movw %ax, (%rdx)
-; SKX-NEXT: vcvtps2ph $4, %xmm1, %xmm0
-; SKX-NEXT: vmovd %xmm0, %eax
+; SKX-NEXT: kandw %k1, %k0, %k0
+; SKX-NEXT: kshiftrb $1, %k0, %k1
+; SKX-NEXT: kmovd %k1, %edi
+; SKX-NEXT: kmovd %k0, %ecx
+; SKX-NEXT: xorl %eax, %eax
+; SKX-NEXT: testb $1, %cl
+; SKX-NEXT: movl $0, %ecx
+; SKX-NEXT: je LBB85_2
+; SKX-NEXT: ## %bb.1:
+; SKX-NEXT: movzwl (%rsi), %ecx
+; SKX-NEXT: LBB85_2:
+; SKX-NEXT: testb $1, %dil
+; SKX-NEXT: je LBB85_4
+; SKX-NEXT: ## %bb.3:
+; SKX-NEXT: movzwl 2(%rsi), %eax
+; SKX-NEXT: LBB85_4:
; SKX-NEXT: movw %ax, 2(%rdx)
+; SKX-NEXT: movw %cx, (%rdx)
; SKX-NEXT: retq
%tmp = load <2 x half>, <2 x half>* %arg, align 8
%tmp3 = fcmp fast olt <2 x half> %tmp, <half 0xH4600, half 0xH4600>
diff --git a/llvm/test/CodeGen/X86/avx512-masked_memop-16-8.ll b/llvm/test/CodeGen/X86/avx512-masked_memop-16-8.ll
index e4f1e7f6afff..0a7c4e0aa36a 100644
--- a/llvm/test/CodeGen/X86/avx512-masked_memop-16-8.ll
+++ b/llvm/test/CodeGen/X86/avx512-masked_memop-16-8.ll
@@ -156,229 +156,203 @@ declare void @llvm.masked.store.v32i16.p0v32i16(<32 x i16>, <32 x i16>*, i32, <3
define <16 x half> @test_mask_load_16xf16(<16 x i1> %mask, <16 x half>* %addr, <16 x half> %val) {
; CHECK-LABEL: test_mask_load_16xf16:
; CHECK: ## %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 56
+; CHECK-NEXT: .cfi_offset %rbx, -56
+; CHECK-NEXT: .cfi_offset %r12, -48
+; CHECK-NEXT: .cfi_offset %r13, -40
+; CHECK-NEXT: .cfi_offset %r14, -32
+; CHECK-NEXT: .cfi_offset %r15, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0
-; CHECK-NEXT: vpmovmskb %xmm0, %ecx
-; CHECK-NEXT: testb $1, %cl
+; CHECK-NEXT: vpmovmskb %xmm0, %r11d
+; CHECK-NEXT: testb $1, %r11b
; CHECK-NEXT: je LBB12_1
; CHECK-NEXT: ## %bb.2: ## %cond.load
-; CHECK-NEXT: movswl (%rsi), %edx
-; CHECK-NEXT: vmovd %edx, %xmm0
-; CHECK-NEXT: vcvtph2ps %xmm0, %xmm8
+; CHECK-NEXT: movzwl (%rsi), %ecx
+; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
; CHECK-NEXT: jmp LBB12_3
; CHECK-NEXT: LBB12_1:
-; CHECK-NEXT: vxorps %xmm8, %xmm8, %xmm8
+; CHECK-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill
; CHECK-NEXT: LBB12_3: ## %else
-; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vxorps %xmm9, %xmm9, %xmm9
-; CHECK-NEXT: testb $2, %cl
+; CHECK-NEXT: xorl %edi, %edi
+; CHECK-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill
+; CHECK-NEXT: movl %edi, %ecx
+; CHECK-NEXT: testb $2, %r11b
; CHECK-NEXT: je LBB12_4
; CHECK-NEXT: ## %bb.5: ## %cond.load1
-; CHECK-NEXT: movswl 2(%rsi), %edx
-; CHECK-NEXT: vmovd %edx, %xmm0
-; CHECK-NEXT: vmovaps %xmm2, %xmm1
-; CHECK-NEXT: vmovaps %xmm2, %xmm7
-; CHECK-NEXT: vmovaps %xmm2, %xmm6
-; CHECK-NEXT: vmovaps %xmm2, %xmm5
-; CHECK-NEXT: vmovaps %xmm2, %xmm4
-; CHECK-NEXT: vmovaps %xmm2, %xmm3
-; CHECK-NEXT: vmovaps %xmm2, %xmm16
-; CHECK-NEXT: vmovaps %xmm2, %xmm15
-; CHECK-NEXT: vmovaps %xmm2, %xmm14
-; CHECK-NEXT: vmovaps %xmm2, %xmm13
-; CHECK-NEXT: vmovaps %xmm2, %xmm12
-; CHECK-NEXT: vmovaps %xmm2, %xmm11
-; CHECK-NEXT: vmovaps %xmm2, %xmm10
-; CHECK-NEXT: vcvtph2ps %xmm0, %xmm2
-; CHECK-NEXT: testb $4, %cl
+; CHECK-NEXT: movw %di, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; CHECK-NEXT: movl %edi, %r12d
+; CHECK-NEXT: movl %edi, %ebx
+; CHECK-NEXT: movl %edi, %ebp
+; CHECK-NEXT: movl %edi, %r13d
+; CHECK-NEXT: movl %edi, %r14d
+; CHECK-NEXT: movl %edi, %r8d
+; CHECK-NEXT: movl %edi, %r9d
+; CHECK-NEXT: movl %edi, %r10d
+; CHECK-NEXT: movl %edi, %r15d
+; CHECK-NEXT: movl %edi, %edx
+; CHECK-NEXT: movw %di, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; CHECK-NEXT: movw %di, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; CHECK-NEXT: movzwl 2(%rsi), %edi
+; CHECK-NEXT: ## kill: def $di killed $di def $edi
+; CHECK-NEXT: testb $4, %r11b
; CHECK-NEXT: jne LBB12_7
; CHECK-NEXT: jmp LBB12_8
; CHECK-NEXT: LBB12_4:
-; CHECK-NEXT: vmovaps %xmm2, %xmm1
-; CHECK-NEXT: vmovaps %xmm2, %xmm7
-; CHECK-NEXT: vmovaps %xmm2, %xmm6
-; CHECK-NEXT: vmovaps %xmm2, %xmm5
-; CHECK-NEXT: vmovaps %xmm2, %xmm4
-; CHECK-NEXT: vmovaps %xmm2, %xmm3
-; CHECK-NEXT: vmovaps %xmm2, %xmm16
-; CHECK-NEXT: vmovaps %xmm2, %xmm15
-; CHECK-NEXT: vmovaps %xmm2, %xmm14
-; CHECK-NEXT: vmovaps %xmm2, %xmm13
-; CHECK-NEXT: vmovaps %xmm2, %xmm12
-; CHECK-NEXT: vmovaps %xmm2, %xmm11
-; CHECK-NEXT: vmovaps %xmm2, %xmm10
-; CHECK-NEXT: testb $4, %cl
+; CHECK-NEXT: movw %di, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; CHECK-NEXT: movl %edi, %r12d
+; CHECK-NEXT: movl %edi, %ebx
+; CHECK-NEXT: movl %edi, %ebp
+; CHECK-NEXT: movl %edi, %r13d
+; CHECK-NEXT: movl %edi, %r14d
+; CHECK-NEXT: movl %edi, %r8d
+; CHECK-NEXT: movl %edi, %r9d
+; CHECK-NEXT: movl %edi, %r10d
+; CHECK-NEXT: movl %edi, %r15d
+; CHECK-NEXT: movl %edi, %edx
+; CHECK-NEXT: movw %di, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; CHECK-NEXT: movw %di, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; CHECK-NEXT: testb $4, %r11b
; CHECK-NEXT: je LBB12_8
; CHECK-NEXT: LBB12_7: ## %cond.load4
-; CHECK-NEXT: movswl 4(%rsi), %edx
-; CHECK-NEXT: vmovd %edx, %xmm0
-; CHECK-NEXT: vcvtph2ps %xmm0, %xmm1
+; CHECK-NEXT: movzwl 4(%rsi), %ecx
+; CHECK-NEXT: movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
; CHECK-NEXT: LBB12_8: ## %else5
-; CHECK-NEXT: testb $8, %cl
+; CHECK-NEXT: testb $8, %r11b
; CHECK-NEXT: jne LBB12_9
; CHECK-NEXT: ## %bb.10: ## %else8
-; CHECK-NEXT: testb $16, %cl
+; CHECK-NEXT: testb $16, %r11b
; CHECK-NEXT: jne LBB12_11
; CHECK-NEXT: LBB12_12: ## %else11
-; CHECK-NEXT: testb $32, %cl
+; CHECK-NEXT: testb $32, %r11b
; CHECK-NEXT: jne LBB12_13
; CHECK-NEXT: LBB12_14: ## %else14
-; CHECK-NEXT: testb $64, %cl
+; CHECK-NEXT: testb $64, %r11b
; CHECK-NEXT: jne LBB12_15
; CHECK-NEXT: LBB12_16: ## %else17
-; CHECK-NEXT: testb $-128, %cl
+; CHECK-NEXT: testb $-128, %r11b
; CHECK-NEXT: jne LBB12_17
; CHECK-NEXT: LBB12_18: ## %else20
-; CHECK-NEXT: testl $256, %ecx ## imm = 0x100
+; CHECK-NEXT: testl $256, %r11d ## imm = 0x100
; CHECK-NEXT: jne LBB12_19
; CHECK-NEXT: LBB12_20: ## %else23
-; CHECK-NEXT: testl $512, %ecx ## imm = 0x200
+; CHECK-NEXT: testl $512, %r11d ## imm = 0x200
; CHECK-NEXT: jne LBB12_21
; CHECK-NEXT: LBB12_22: ## %else26
-; CHECK-NEXT: testl $1024, %ecx ## imm = 0x400
+; CHECK-NEXT: testl $1024, %r11d ## imm = 0x400
; CHECK-NEXT: jne LBB12_23
; CHECK-NEXT: LBB12_24: ## %else29
-; CHECK-NEXT: testl $2048, %ecx ## imm = 0x800
+; CHECK-NEXT: testl $2048, %r11d ## imm = 0x800
; CHECK-NEXT: jne LBB12_25
; CHECK-NEXT: LBB12_26: ## %else32
-; CHECK-NEXT: testl $4096, %ecx ## imm = 0x1000
-; CHECK-NEXT: jne LBB12_27
+; CHECK-NEXT: testl $4096, %r11d ## imm = 0x1000
+; CHECK-NEXT: je LBB12_28
+; CHECK-NEXT: LBB12_27: ## %cond.load34
+; CHECK-NEXT: movzwl 24(%rsi), %edx
; CHECK-NEXT: LBB12_28: ## %else35
-; CHECK-NEXT: testl $8192, %ecx ## imm = 0x2000
+; CHECK-NEXT: movw %dx, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; CHECK-NEXT: testl $8192, %r11d ## imm = 0x2000
; CHECK-NEXT: jne LBB12_29
-; CHECK-NEXT: LBB12_30: ## %else38
-; CHECK-NEXT: testl $16384, %ecx ## imm = 0x4000
+; CHECK-NEXT: ## %bb.30: ## %else38
+; CHECK-NEXT: testl $16384, %r11d ## imm = 0x4000
; CHECK-NEXT: jne LBB12_31
; CHECK-NEXT: LBB12_32: ## %else41
-; CHECK-NEXT: testl $32768, %ecx ## imm = 0x8000
-; CHECK-NEXT: je LBB12_34
-; CHECK-NEXT: LBB12_33: ## %cond.load43
-; CHECK-NEXT: movswl 30(%rsi), %ecx
-; CHECK-NEXT: vmovd %ecx, %xmm0
-; CHECK-NEXT: vcvtph2ps %xmm0, %xmm9
-; CHECK-NEXT: LBB12_34: ## %else44
-; CHECK-NEXT: vcvtps2ph $4, %xmm8, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %ecx
-; CHECK-NEXT: movw %cx, (%rax)
-; CHECK-NEXT: vcvtps2ph $4, %xmm2, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %ecx
-; CHECK-NEXT: movw %cx, 2(%rax)
-; CHECK-NEXT: vcvtps2ph $4, %xmm1, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %ecx
-; CHECK-NEXT: movw %cx, 4(%rax)
-; CHECK-NEXT: vcvtps2ph $4, %xmm7, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %ecx
-; CHECK-NEXT: movw %cx, 6(%rax)
-; CHECK-NEXT: vcvtps2ph $4, %xmm6, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %ecx
-; CHECK-NEXT: movw %cx, 8(%rax)
-; CHECK-NEXT: vcvtps2ph $4, %xmm5, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %ecx
-; CHECK-NEXT: movw %cx, 10(%rax)
-; CHECK-NEXT: vcvtps2ph $4, %xmm4, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %ecx
-; CHECK-NEXT: movw %cx, 12(%rax)
-; CHECK-NEXT: vcvtps2ph $4, %xmm3, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %ecx
-; CHECK-NEXT: movw %cx, 14(%rax)
-; CHECK-NEXT: vcvtps2ph $4, %xmm16, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %ecx
-; CHECK-NEXT: movw %cx, 16(%rax)
-; CHECK-NEXT: vcvtps2ph $4, %xmm15, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %ecx
-; CHECK-NEXT: movw %cx, 18(%rax)
-; CHECK-NEXT: vcvtps2ph $4, %xmm14, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %ecx
-; CHECK-NEXT: movw %cx, 20(%rax)
-; CHECK-NEXT: vcvtps2ph $4, %xmm13, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %ecx
-; CHECK-NEXT: movw %cx, 22(%rax)
-; CHECK-NEXT: vcvtps2ph $4, %xmm12, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %ecx
-; CHECK-NEXT: movw %cx, 24(%rax)
-; CHECK-NEXT: vcvtps2ph $4, %xmm11, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %ecx
-; CHECK-NEXT: movw %cx, 26(%rax)
-; CHECK-NEXT: vcvtps2ph $4, %xmm10, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %ecx
-; CHECK-NEXT: movw %cx, 28(%rax)
-; CHECK-NEXT: vcvtps2ph $4, %xmm9, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %ecx
-; CHECK-NEXT: movw %cx, 30(%rax)
-; CHECK-NEXT: retq
+; CHECK-NEXT: testl $32768, %r11d ## imm = 0x8000
+; CHECK-NEXT: je LBB12_33
+; CHECK-NEXT: LBB12_34: ## %cond.load43
+; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %ecx ## 2-byte Folded Reload
+; CHECK-NEXT: movzwl 30(%rsi), %esi
+; CHECK-NEXT: jmp LBB12_35
; CHECK-NEXT: LBB12_9: ## %cond.load7
-; CHECK-NEXT: movswl 6(%rsi), %edx
-; CHECK-NEXT: vmovd %edx, %xmm0
-; CHECK-NEXT: vcvtph2ps %xmm0, %xmm7
-; CHECK-NEXT: testb $16, %cl
+; CHECK-NEXT: movzwl 6(%rsi), %r12d
+; CHECK-NEXT: testb $16, %r11b
; CHECK-NEXT: je LBB12_12
; CHECK-NEXT: LBB12_11: ## %cond.load10
-; CHECK-NEXT: movswl 8(%rsi), %edx
-; CHECK-NEXT: vmovd %edx, %xmm0
-; CHECK-NEXT: vcvtph2ps %xmm0, %xmm6
-; CHECK-NEXT: testb $32, %cl
+; CHECK-NEXT: movzwl 8(%rsi), %ebx
+; CHECK-NEXT: testb $32, %r11b
; CHECK-NEXT: je LBB12_14
; CHECK-NEXT: LBB12_13: ## %cond.load13
-; CHECK-NEXT: movswl 10(%rsi), %edx
-; CHECK-NEXT: vmovd %edx, %xmm0
-; CHECK-NEXT: vcvtph2ps %xmm0, %xmm5
-; CHECK-NEXT: testb $64, %cl
+; CHECK-NEXT: movzwl 10(%rsi), %ebp
+; CHECK-NEXT: testb $64, %r11b
; CHECK-NEXT: je LBB12_16
; CHECK-NEXT: LBB12_15: ## %cond.load16
-; CHECK-NEXT: movswl 12(%rsi), %edx
-; CHECK-NEXT: vmovd %edx, %xmm0
-; CHECK-NEXT: vcvtph2ps %xmm0, %xmm4
-; CHECK-NEXT: testb $-128, %cl
+; CHECK-NEXT: movzwl 12(%rsi), %r13d
+; CHECK-NEXT: testb $-128, %r11b
; CHECK-NEXT: je LBB12_18
; CHECK-NEXT: LBB12_17: ## %cond.load19
-; CHECK-NEXT: movswl 14(%rsi), %edx
-; CHECK-NEXT: vmovd %edx, %xmm0
-; CHECK-NEXT: vcvtph2ps %xmm0, %xmm3
-; CHECK-NEXT: testl $256, %ecx ## imm = 0x100
+; CHECK-NEXT: movzwl 14(%rsi), %r14d
+; CHECK-NEXT: testl $256, %r11d ## imm = 0x100
; CHECK-NEXT: je LBB12_20
; CHECK-NEXT: LBB12_19: ## %cond.load22
-; CHECK-NEXT: movswl 16(%rsi), %edx
-; CHECK-NEXT: vmovd %edx, %xmm0
-; CHECK-NEXT: vcvtph2ps %xmm0, %xmm16
-; CHECK-NEXT: testl $512, %ecx ## imm = 0x200
+; CHECK-NEXT: movzwl 16(%rsi), %r8d
+; CHECK-NEXT: testl $512, %r11d ## imm = 0x200
; CHECK-NEXT: je LBB12_22
; CHECK-NEXT: LBB12_21: ## %cond.load25
-; CHECK-NEXT: movswl 18(%rsi), %edx
-; CHECK-NEXT: vmovd %edx, %xmm0
-; CHECK-NEXT: vcvtph2ps %xmm0, %xmm15
-; CHECK-NEXT: testl $1024, %ecx ## imm = 0x400
+; CHECK-NEXT: movzwl 18(%rsi), %r9d
+; CHECK-NEXT: testl $1024, %r11d ## imm = 0x400
; CHECK-NEXT: je LBB12_24
; CHECK-NEXT: LBB12_23: ## %cond.load28
-; CHECK-NEXT: movswl 20(%rsi), %edx
-; CHECK-NEXT: vmovd %edx, %xmm0
-; CHECK-NEXT: vcvtph2ps %xmm0, %xmm14
-; CHECK-NEXT: testl $2048, %ecx ## imm = 0x800
+; CHECK-NEXT: movzwl 20(%rsi), %r10d
+; CHECK-NEXT: testl $2048, %r11d ## imm = 0x800
; CHECK-NEXT: je LBB12_26
; CHECK-NEXT: LBB12_25: ## %cond.load31
-; CHECK-NEXT: movswl 22(%rsi), %edx
-; CHECK-NEXT: vmovd %edx, %xmm0
-; CHECK-NEXT: vcvtph2ps %xmm0, %xmm13
-; CHECK-NEXT: testl $4096, %ecx ## imm = 0x1000
-; CHECK-NEXT: je LBB12_28
-; CHECK-NEXT: LBB12_27: ## %cond.load34
-; CHECK-NEXT: movswl 24(%rsi), %edx
-; CHECK-NEXT: vmovd %edx, %xmm0
-; CHECK-NEXT: vcvtph2ps %xmm0, %xmm12
-; CHECK-NEXT: testl $8192, %ecx ## imm = 0x2000
-; CHECK-NEXT: je LBB12_30
+; CHECK-NEXT: movzwl 22(%rsi), %r15d
+; CHECK-NEXT: testl $4096, %r11d ## imm = 0x1000
+; CHECK-NEXT: jne LBB12_27
+; CHECK-NEXT: jmp LBB12_28
; CHECK-NEXT: LBB12_29: ## %cond.load37
-; CHECK-NEXT: movswl 26(%rsi), %edx
-; CHECK-NEXT: vmovd %edx, %xmm0
-; CHECK-NEXT: vcvtph2ps %xmm0, %xmm11
-; CHECK-NEXT: testl $16384, %ecx ## imm = 0x4000
+; CHECK-NEXT: movzwl 26(%rsi), %ecx
+; CHECK-NEXT: movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; CHECK-NEXT: testl $16384, %r11d ## imm = 0x4000
; CHECK-NEXT: je LBB12_32
; CHECK-NEXT: LBB12_31: ## %cond.load40
-; CHECK-NEXT: movswl 28(%rsi), %edx
-; CHECK-NEXT: vmovd %edx, %xmm0
-; CHECK-NEXT: vcvtph2ps %xmm0, %xmm10
-; CHECK-NEXT: testl $32768, %ecx ## imm = 0x8000
-; CHECK-NEXT: jne LBB12_33
-; CHECK-NEXT: jmp LBB12_34
+; CHECK-NEXT: movzwl 28(%rsi), %ecx
+; CHECK-NEXT: movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; CHECK-NEXT: testl $32768, %r11d ## imm = 0x8000
+; CHECK-NEXT: jne LBB12_34
+; CHECK-NEXT: LBB12_33:
+; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %ecx ## 2-byte Folded Reload
+; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %esi ## 4-byte Reload
+; CHECK-NEXT: LBB12_35: ## %else44
+; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx ## 4-byte Reload
+; CHECK-NEXT: movw %dx, (%rax)
+; CHECK-NEXT: movw %di, 2(%rax)
+; CHECK-NEXT: movw %cx, 4(%rax)
+; CHECK-NEXT: movw %r12w, 6(%rax)
+; CHECK-NEXT: movw %bx, 8(%rax)
+; CHECK-NEXT: movw %bp, 10(%rax)
+; CHECK-NEXT: movw %r13w, 12(%rax)
+; CHECK-NEXT: movw %r14w, 14(%rax)
+; CHECK-NEXT: movw %r8w, 16(%rax)
+; CHECK-NEXT: movw %r9w, 18(%rax)
+; CHECK-NEXT: movw %r10w, 20(%rax)
+; CHECK-NEXT: movw %r15w, 22(%rax)
+; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %ecx ## 2-byte Folded Reload
+; CHECK-NEXT: movw %cx, 24(%rax)
+; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %ecx ## 2-byte Folded Reload
+; CHECK-NEXT: movw %cx, 26(%rax)
+; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %ecx ## 2-byte Folded Reload
+; CHECK-NEXT: movw %cx, 28(%rax)
+; CHECK-NEXT: movw %si, 30(%rax)
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: retq
%res = call <16 x half> @llvm.masked.load.v16f16(<16 x half>* %addr, i32 4, <16 x i1>%mask, <16 x half> zeroinitializer)
ret <16 x half> %res
}
@@ -440,107 +414,77 @@ define void @test_mask_store_16xf16(<16 x i1> %mask, <16 x half>* %addr, <16 x h
; CHECK-NEXT: LBB13_32: ## %else30
; CHECK-NEXT: retq
; CHECK-NEXT: LBB13_1: ## %cond.store
-; CHECK-NEXT: vcvtps2ph $4, %xmm1, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %ecx
-; CHECK-NEXT: movw %cx, (%rdi)
+; CHECK-NEXT: movw %si, (%rdi)
; CHECK-NEXT: testb $2, %al
; CHECK-NEXT: je LBB13_4
; CHECK-NEXT: LBB13_3: ## %cond.store1
-; CHECK-NEXT: vcvtps2ph $4, %xmm2, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %ecx
-; CHECK-NEXT: movw %cx, 2(%rdi)
+; CHECK-NEXT: movw %dx, 2(%rdi)
; CHECK-NEXT: testb $4, %al
; CHECK-NEXT: je LBB13_6
; CHECK-NEXT: LBB13_5: ## %cond.store3
-; CHECK-NEXT: vcvtps2ph $4, %xmm3, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %ecx
; CHECK-NEXT: movw %cx, 4(%rdi)
; CHECK-NEXT: testb $8, %al
; CHECK-NEXT: je LBB13_8
; CHECK-NEXT: LBB13_7: ## %cond.store5
-; CHECK-NEXT: vcvtps2ph $4, %xmm4, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %ecx
-; CHECK-NEXT: movw %cx, 6(%rdi)
+; CHECK-NEXT: movw %r8w, 6(%rdi)
; CHECK-NEXT: testb $16, %al
; CHECK-NEXT: je LBB13_10
; CHECK-NEXT: LBB13_9: ## %cond.store7
-; CHECK-NEXT: vcvtps2ph $4, %xmm5, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %ecx
-; CHECK-NEXT: movw %cx, 8(%rdi)
+; CHECK-NEXT: movw %r9w, 8(%rdi)
; CHECK-NEXT: testb $32, %al
; CHECK-NEXT: je LBB13_12
; CHECK-NEXT: LBB13_11: ## %cond.store9
-; CHECK-NEXT: vcvtps2ph $4, %xmm6, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %ecx
+; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx
; CHECK-NEXT: movw %cx, 10(%rdi)
; CHECK-NEXT: testb $64, %al
; CHECK-NEXT: je LBB13_14
; CHECK-NEXT: LBB13_13: ## %cond.store11
-; CHECK-NEXT: vcvtps2ph $4, %xmm7, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %ecx
+; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx
; CHECK-NEXT: movw %cx, 12(%rdi)
; CHECK-NEXT: testb $-128, %al
; CHECK-NEXT: je LBB13_16
; CHECK-NEXT: LBB13_15: ## %cond.store13
-; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %ecx
+; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx
; CHECK-NEXT: movw %cx, 14(%rdi)
; CHECK-NEXT: testl $256, %eax ## imm = 0x100
; CHECK-NEXT: je LBB13_18
; CHECK-NEXT: LBB13_17: ## %cond.store15
-; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %ecx
+; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx
; CHECK-NEXT: movw %cx, 16(%rdi)
; CHECK-NEXT: testl $512, %eax ## imm = 0x200
; CHECK-NEXT: je LBB13_20
; CHECK-NEXT: LBB13_19: ## %cond.store17
-; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %ecx
+; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx
; CHECK-NEXT: movw %cx, 18(%rdi)
; CHECK-NEXT: testl $1024, %eax ## imm = 0x400
; CHECK-NEXT: je LBB13_22
; CHECK-NEXT: LBB13_21: ## %cond.store19
-; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %ecx
+; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx
; CHECK-NEXT: movw %cx, 20(%rdi)
; CHECK-NEXT: testl $2048, %eax ## imm = 0x800
; CHECK-NEXT: je LBB13_24
; CHECK-NEXT: LBB13_23: ## %cond.store21
-; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %ecx
+; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx
; CHECK-NEXT: movw %cx, 22(%rdi)
; CHECK-NEXT: testl $4096, %eax ## imm = 0x1000
; CHECK-NEXT: je LBB13_26
; CHECK-NEXT: LBB13_25: ## %cond.store23
-; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %ecx
+; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx
; CHECK-NEXT: movw %cx, 24(%rdi)
; CHECK-NEXT: testl $8192, %eax ## imm = 0x2000
; CHECK-NEXT: je LBB13_28
; CHECK-NEXT: LBB13_27: ## %cond.store25
-; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %ecx
+; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx
; CHECK-NEXT: movw %cx, 26(%rdi)
; CHECK-NEXT: testl $16384, %eax ## imm = 0x4000
; CHECK-NEXT: je LBB13_30
; CHECK-NEXT: LBB13_29: ## %cond.store27
-; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %ecx
+; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx
; CHECK-NEXT: movw %cx, 28(%rdi)
; CHECK-NEXT: testl $32768, %eax ## imm = 0x8000
; CHECK-NEXT: je LBB13_32
; CHECK-NEXT: LBB13_31: ## %cond.store29
-; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %eax
+; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT: movw %ax, 30(%rdi)
; CHECK-NEXT: retq
call void @llvm.masked.store.v16f16.p0v16f16(<16 x half> %val, <16 x half>* %addr, i32 4, <16 x i1>%mask)
diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
index 7abcabe9c055..6ec5da46a5de 100644
--- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -1432,20 +1432,20 @@ define void @half_vec_compare(<2 x half>* %x, <2 x i8>* %y) {
; KNL: ## %bb.0: ## %entry
; KNL-NEXT: movzwl (%rdi), %eax ## encoding: [0x0f,0xb7,0x07]
; KNL-NEXT: movzwl 2(%rdi), %ecx ## encoding: [0x0f,0xb7,0x4f,0x02]
+; KNL-NEXT: movswl %cx, %ecx ## encoding: [0x0f,0xbf,0xc9]
+; KNL-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
+; KNL-NEXT: vcvtph2ps %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x13,0xc0]
+; KNL-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
+; KNL-NEXT: vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1]
+; KNL-NEXT: setp %cl ## encoding: [0x0f,0x9a,0xc1]
+; KNL-NEXT: setne %dl ## encoding: [0x0f,0x95,0xc2]
+; KNL-NEXT: orb %cl, %dl ## encoding: [0x08,0xca]
+; KNL-NEXT: kmovw %edx, %k0 ## encoding: [0xc5,0xf8,0x92,0xc2]
+; KNL-NEXT: kshiftlw $1, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x01]
; KNL-NEXT: cwtl ## encoding: [0x98]
; KNL-NEXT: vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
; KNL-NEXT: vcvtph2ps %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x13,0xc0]
-; KNL-NEXT: movswl %cx, %eax ## encoding: [0x0f,0xbf,0xc1]
-; KNL-NEXT: vmovd %eax, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc8]
-; KNL-NEXT: vcvtph2ps %xmm1, %xmm1 ## encoding: [0xc4,0xe2,0x79,0x13,0xc9]
-; KNL-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2]
-; KNL-NEXT: vucomiss %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xca]
-; KNL-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0]
-; KNL-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1]
-; KNL-NEXT: orb %al, %cl ## encoding: [0x08,0xc1]
-; KNL-NEXT: kmovw %ecx, %k0 ## encoding: [0xc5,0xf8,0x92,0xc1]
-; KNL-NEXT: kshiftlw $1, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x01]
-; KNL-NEXT: vucomiss %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc2]
+; KNL-NEXT: vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1]
; KNL-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0]
; KNL-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1]
; KNL-NEXT: orb %al, %cl ## encoding: [0x08,0xc1]
@@ -1465,20 +1465,20 @@ define void @half_vec_compare(<2 x half>* %x, <2 x i8>* %y) {
; AVX512BW: ## %bb.0: ## %entry
; AVX512BW-NEXT: movzwl (%rdi), %eax ## encoding: [0x0f,0xb7,0x07]
; AVX512BW-NEXT: movzwl 2(%rdi), %ecx ## encoding: [0x0f,0xb7,0x4f,0x02]
+; AVX512BW-NEXT: movswl %cx, %ecx ## encoding: [0x0f,0xbf,0xc9]
+; AVX512BW-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
+; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x13,0xc0]
+; AVX512BW-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
+; AVX512BW-NEXT: vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1]
+; AVX512BW-NEXT: setp %cl ## encoding: [0x0f,0x9a,0xc1]
+; AVX512BW-NEXT: setne %dl ## encoding: [0x0f,0x95,0xc2]
+; AVX512BW-NEXT: orb %cl, %dl ## encoding: [0x08,0xca]
+; AVX512BW-NEXT: kmovd %edx, %k0 ## encoding: [0xc5,0xfb,0x92,0xc2]
+; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x01]
; AVX512BW-NEXT: cwtl ## encoding: [0x98]
; AVX512BW-NEXT: vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x13,0xc0]
-; AVX512BW-NEXT: movswl %cx, %eax ## encoding: [0x0f,0xbf,0xc1]
-; AVX512BW-NEXT: vmovd %eax, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc8]
-; AVX512BW-NEXT: vcvtph2ps %xmm1, %xmm1 ## encoding: [0xc4,0xe2,0x79,0x13,0xc9]
-; AVX512BW-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2]
-; AVX512BW-NEXT: vucomiss %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xca]
-; AVX512BW-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0]
-; AVX512BW-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1]
-; AVX512BW-NEXT: orb %al, %cl ## encoding: [0x08,0xc1]
-; AVX512BW-NEXT: kmovd %ecx, %k0 ## encoding: [0xc5,0xfb,0x92,0xc1]
-; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x01]
-; AVX512BW-NEXT: vucomiss %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc2]
+; AVX512BW-NEXT: vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1]
; AVX512BW-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0]
; AVX512BW-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1]
; AVX512BW-NEXT: orb %al, %cl ## encoding: [0x08,0xc1]
@@ -1497,20 +1497,20 @@ define void @half_vec_compare(<2 x half>* %x, <2 x i8>* %y) {
; SKX: ## %bb.0: ## %entry
; SKX-NEXT: movzwl (%rdi), %eax ## encoding: [0x0f,0xb7,0x07]
; SKX-NEXT: movzwl 2(%rdi), %ecx ## encoding: [0x0f,0xb7,0x4f,0x02]
+; SKX-NEXT: movswl %cx, %ecx ## encoding: [0x0f,0xbf,0xc9]
+; SKX-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
+; SKX-NEXT: vcvtph2ps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc0]
+; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
+; SKX-NEXT: vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1]
+; SKX-NEXT: setp %cl ## encoding: [0x0f,0x9a,0xc1]
+; SKX-NEXT: setne %dl ## encoding: [0x0f,0x95,0xc2]
+; SKX-NEXT: orb %cl, %dl ## encoding: [0x08,0xca]
+; SKX-NEXT: kmovd %edx, %k0 ## encoding: [0xc5,0xfb,0x92,0xc2]
+; SKX-NEXT: kshiftlb $1, %k0, %k0 ## encoding: [0xc4,0xe3,0x79,0x32,0xc0,0x01]
; SKX-NEXT: cwtl ## encoding: [0x98]
; SKX-NEXT: vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
; SKX-NEXT: vcvtph2ps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc0]
-; SKX-NEXT: movswl %cx, %eax ## encoding: [0x0f,0xbf,0xc1]
-; SKX-NEXT: vmovd %eax, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc8]
-; SKX-NEXT: vcvtph2ps %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc9]
-; SKX-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x57,0xd2]
-; SKX-NEXT: vucomiss %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xca]
-; SKX-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0]
-; SKX-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1]
-; SKX-NEXT: orb %al, %cl ## encoding: [0x08,0xc1]
-; SKX-NEXT: kmovd %ecx, %k0 ## encoding: [0xc5,0xfb,0x92,0xc1]
-; SKX-NEXT: kshiftlb $1, %k0, %k0 ## encoding: [0xc4,0xe3,0x79,0x32,0xc0,0x01]
-; SKX-NEXT: vucomiss %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc2]
+; SKX-NEXT: vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1]
; SKX-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0]
; SKX-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1]
; SKX-NEXT: orb %al, %cl ## encoding: [0x08,0xc1]
diff --git a/llvm/test/CodeGen/X86/fmf-flags.ll b/llvm/test/CodeGen/X86/fmf-flags.ll
index c0ee182b64e9..b6893e6e60d1 100644
--- a/llvm/test/CodeGen/X86/fmf-flags.ll
+++ b/llvm/test/CodeGen/X86/fmf-flags.ll
@@ -111,25 +111,28 @@ define float @div_arcp_by_const(half %x) {
; X64: # %bb.0:
; X64-NEXT: pushq %rax
; X64-NEXT: .cfi_def_cfa_offset 16
-; X64-NEXT: callq __gnu_f2h_ieee
-; X64-NEXT: movzwl %ax, %edi
+; X64-NEXT: movzwl %di, %edi
; X64-NEXT: callq __gnu_h2f_ieee
; X64-NEXT: mulss {{.*}}(%rip), %xmm0
+; X64-NEXT: callq __gnu_f2h_ieee
+; X64-NEXT: movzwl %ax, %edi
; X64-NEXT: popq %rax
; X64-NEXT: .cfi_def_cfa_offset 8
-; X64-NEXT: retq
+; X64-NEXT: jmp __gnu_h2f_ieee # TAILCALL
;
; X86-LABEL: div_arcp_by_const:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: flds {{[0-9]+}}(%esp)
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, (%esp)
+; X86-NEXT: calll __gnu_h2f_ieee
+; X86-NEXT: fmuls {{\.LCPI.*}}
; X86-NEXT: fstps (%esp)
; X86-NEXT: calll __gnu_f2h_ieee
; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: movl %eax, (%esp)
; X86-NEXT: calll __gnu_h2f_ieee
-; X86-NEXT: fmuls {{\.LCPI.*}}
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll
index a6c92ecd78af..baf16825aacc 100644
--- a/llvm/test/CodeGen/X86/half.ll
+++ b/llvm/test/CodeGen/X86/half.ll
@@ -382,66 +382,94 @@ define void @test_uitofp_i64(i64 %a, half* %p) #0 {
define <4 x float> @test_extend32_vec4(<4 x half>* %p) #0 {
; CHECK-LIBCALL-LABEL: test_extend32_vec4:
; CHECK-LIBCALL: # %bb.0:
-; CHECK-LIBCALL-NEXT: pushq %rbx
-; CHECK-LIBCALL-NEXT: subq $48, %rsp
-; CHECK-LIBCALL-NEXT: movq %rdi, %rbx
-; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi
-; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: subq $88, %rsp
+; CHECK-LIBCALL-NEXT: movl (%rdi), %eax
+; CHECK-LIBCALL-NEXT: movl 4(%rdi), %ecx
+; CHECK-LIBCALL-NEXT: movl %eax, {{[0-9]+}}(%rsp)
+; CHECK-LIBCALL-NEXT: movl %ecx, {{[0-9]+}}(%rsp)
+; CHECK-LIBCALL-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-LIBCALL-NEXT: movzwl 2(%rbx), %edi
+; CHECK-LIBCALL-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0
+; CHECK-LIBCALL-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-LIBCALL-NEXT: pextrw $1, %xmm0, %edi
; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-LIBCALL-NEXT: movzwl 4(%rbx), %edi
+; CHECK-LIBCALL-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-LIBCALL-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %edi
; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-LIBCALL-NEXT: movzwl 6(%rbx), %edi
+; CHECK-LIBCALL-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-LIBCALL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-LIBCALL-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-LIBCALL-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-LIBCALL-NEXT: pextrw $1, %xmm0, %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-LIBCALL-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %edi
; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
-; CHECK-LIBCALL-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; CHECK-LIBCALL-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-LIBCALL-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-LIBCALL-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-LIBCALL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-LIBCALL-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-LIBCALL-NEXT: addq $48, %rsp
-; CHECK-LIBCALL-NEXT: popq %rbx
+; CHECK-LIBCALL-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-LIBCALL-NEXT: # xmm0 = xmm0[0],mem[0]
+; CHECK-LIBCALL-NEXT: addq $88, %rsp
; CHECK-LIBCALL-NEXT: retq
;
; BWON-F16C-LABEL: test_extend32_vec4:
; BWON-F16C: # %bb.0:
-; BWON-F16C-NEXT: movswl 6(%rdi), %eax
-; BWON-F16C-NEXT: vmovd %eax, %xmm0
-; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
-; BWON-F16C-NEXT: movswl 4(%rdi), %eax
+; BWON-F16C-NEXT: movl (%rdi), %eax
+; BWON-F16C-NEXT: movl 4(%rdi), %ecx
+; BWON-F16C-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
+; BWON-F16C-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; BWON-F16C-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0
+; BWON-F16C-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm1
+; BWON-F16C-NEXT: vpextrw $1, %xmm1, %eax
+; BWON-F16C-NEXT: cwtl
+; BWON-F16C-NEXT: vmovd %eax, %xmm2
+; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm2
+; BWON-F16C-NEXT: vmovd %xmm1, %eax
+; BWON-F16C-NEXT: cwtl
; BWON-F16C-NEXT: vmovd %eax, %xmm1
; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1
-; BWON-F16C-NEXT: movswl (%rdi), %eax
+; BWON-F16C-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
+; BWON-F16C-NEXT: vmovd %xmm0, %eax
+; BWON-F16C-NEXT: cwtl
; BWON-F16C-NEXT: vmovd %eax, %xmm2
; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm2
-; BWON-F16C-NEXT: movswl 2(%rdi), %eax
-; BWON-F16C-NEXT: vmovd %eax, %xmm3
-; BWON-F16C-NEXT: vcvtph2ps %xmm3, %xmm3
-; BWON-F16C-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; BWON-F16C-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; BWON-F16C-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; BWON-F16C-NEXT: vpextrw $1, %xmm0, %eax
+; BWON-F16C-NEXT: cwtl
+; BWON-F16C-NEXT: vmovd %eax, %xmm0
+; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; BWON-F16C-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; BWON-F16C-NEXT: retq
;
; CHECK-I686-LABEL: test_extend32_vec4:
; CHECK-I686: # %bb.0:
-; CHECK-I686-NEXT: pushl %esi
-; CHECK-I686-NEXT: subl $56, %esp
-; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi
-; CHECK-I686-NEXT: movzwl 2(%esi), %eax
+; CHECK-I686-NEXT: subl $124, %esp
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-I686-NEXT: movl (%eax), %ecx
+; CHECK-I686-NEXT: movl 4(%eax), %eax
+; CHECK-I686-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: movaps {{[0-9]+}}(%esp), %xmm0
+; CHECK-I686-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; CHECK-I686-NEXT: movdqa {{[0-9]+}}(%esp), %xmm0
+; CHECK-I686-NEXT: movdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; CHECK-I686-NEXT: pextrw $1, %xmm0, %eax
; CHECK-I686-NEXT: movl %eax, (%esp)
; CHECK-I686-NEXT: calll __gnu_h2f_ieee
; CHECK-I686-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
-; CHECK-I686-NEXT: movzwl 4(%esi), %eax
+; CHECK-I686-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax
; CHECK-I686-NEXT: movl %eax, (%esp)
; CHECK-I686-NEXT: calll __gnu_h2f_ieee
; CHECK-I686-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
-; CHECK-I686-NEXT: movzwl 6(%esi), %eax
+; CHECK-I686-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT: pextrw $1, %xmm0, %eax
; CHECK-I686-NEXT: movl %eax, (%esp)
; CHECK-I686-NEXT: calll __gnu_h2f_ieee
-; CHECK-I686-NEXT: movzwl (%esi), %eax
+; CHECK-I686-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax
; CHECK-I686-NEXT: movl %eax, (%esp)
; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp)
; CHECK-I686-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
@@ -457,8 +485,7 @@ define <4 x float> @test_extend32_vec4(<4 x half>* %p) #0 {
; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-I686-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-I686-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-I686-NEXT: addl $56, %esp
-; CHECK-I686-NEXT: popl %esi
+; CHECK-I686-NEXT: addl $124, %esp
; CHECK-I686-NEXT: retl
%a = load <4 x half>, <4 x half>* %p, align 8
%b = fpext <4 x half> %a to <4 x float>
@@ -468,92 +495,97 @@ define <4 x float> @test_extend32_vec4(<4 x half>* %p) #0 {
define <4 x double> @test_extend64_vec4(<4 x half>* %p) #0 {
; CHECK-LIBCALL-LABEL: test_extend64_vec4:
; CHECK-LIBCALL: # %bb.0:
+; CHECK-LIBCALL-NEXT: pushq %rbp
+; CHECK-LIBCALL-NEXT: pushq %r14
; CHECK-LIBCALL-NEXT: pushq %rbx
-; CHECK-LIBCALL-NEXT: subq $16, %rsp
-; CHECK-LIBCALL-NEXT: movq %rdi, %rbx
-; CHECK-LIBCALL-NEXT: movzwl 4(%rdi), %edi
+; CHECK-LIBCALL-NEXT: subq $32, %rsp
+; CHECK-LIBCALL-NEXT: movzwl 4(%rdi), %r14d
+; CHECK-LIBCALL-NEXT: movzwl 6(%rdi), %ebp
+; CHECK-LIBCALL-NEXT: movzwl (%rdi), %ebx
+; CHECK-LIBCALL-NEXT: movzwl 2(%rdi), %edi
; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-LIBCALL-NEXT: movzwl 6(%rbx), %edi
+; CHECK-LIBCALL-NEXT: cvtss2sd %xmm0, %xmm0
+; CHECK-LIBCALL-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-LIBCALL-NEXT: movl %ebx, %edi
; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-LIBCALL-NEXT: movzwl (%rbx), %edi
+; CHECK-LIBCALL-NEXT: cvtss2sd %xmm0, %xmm0
+; CHECK-LIBCALL-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-LIBCALL-NEXT: # xmm0 = xmm0[0],mem[0]
+; CHECK-LIBCALL-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-LIBCALL-NEXT: movl %ebp, %edi
; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-LIBCALL-NEXT: movzwl 2(%rbx), %edi
+; CHECK-LIBCALL-NEXT: cvtss2sd %xmm0, %xmm0
+; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-LIBCALL-NEXT: movl %r14d, %edi
; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
; CHECK-LIBCALL-NEXT: cvtss2sd %xmm0, %xmm1
-; CHECK-LIBCALL-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-LIBCALL-NEXT: # xmm0 = mem[0],zero,zero,zero
-; CHECK-LIBCALL-NEXT: cvtss2sd %xmm0, %xmm0
-; CHECK-LIBCALL-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-LIBCALL-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
-; CHECK-LIBCALL-NEXT: # xmm1 = mem[0],zero,zero,zero
-; CHECK-LIBCALL-NEXT: cvtss2sd %xmm1, %xmm2
-; CHECK-LIBCALL-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
-; CHECK-LIBCALL-NEXT: # xmm1 = mem[0],zero,zero,zero
-; CHECK-LIBCALL-NEXT: cvtss2sd %xmm1, %xmm1
-; CHECK-LIBCALL-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; CHECK-LIBCALL-NEXT: addq $16, %rsp
+; CHECK-LIBCALL-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-LIBCALL-NEXT: # xmm1 = xmm1[0],mem[0]
+; CHECK-LIBCALL-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-LIBCALL-NEXT: addq $32, %rsp
; CHECK-LIBCALL-NEXT: popq %rbx
+; CHECK-LIBCALL-NEXT: popq %r14
+; CHECK-LIBCALL-NEXT: popq %rbp
; CHECK-LIBCALL-NEXT: retq
;
; BWON-F16C-LABEL: test_extend64_vec4:
; BWON-F16C: # %bb.0:
-; BWON-F16C-NEXT: movswl (%rdi), %eax
+; BWON-F16C-NEXT: movswl 6(%rdi), %eax
; BWON-F16C-NEXT: vmovd %eax, %xmm0
; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; BWON-F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; BWON-F16C-NEXT: movswl 4(%rdi), %eax
+; BWON-F16C-NEXT: vmovd %eax, %xmm1
+; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1
+; BWON-F16C-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; BWON-F16C-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; BWON-F16C-NEXT: movswl 2(%rdi), %eax
; BWON-F16C-NEXT: vmovd %eax, %xmm1
; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1
-; BWON-F16C-NEXT: movswl 4(%rdi), %eax
+; BWON-F16C-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; BWON-F16C-NEXT: movswl (%rdi), %eax
; BWON-F16C-NEXT: vmovd %eax, %xmm2
; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm2
-; BWON-F16C-NEXT: movswl 6(%rdi), %eax
-; BWON-F16C-NEXT: vmovd %eax, %xmm3
-; BWON-F16C-NEXT: vcvtph2ps %xmm3, %xmm3
-; BWON-F16C-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; BWON-F16C-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; BWON-F16C-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; BWON-F16C-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; BWON-F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; BWON-F16C-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; BWON-F16C-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; BWON-F16C-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; BWON-F16C-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; BWON-F16C-NEXT: retq
;
; CHECK-I686-LABEL: test_extend64_vec4:
; CHECK-I686: # %bb.0:
+; CHECK-I686-NEXT: pushl %ebx
+; CHECK-I686-NEXT: pushl %edi
; CHECK-I686-NEXT: pushl %esi
-; CHECK-I686-NEXT: subl $88, %esp
-; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi
-; CHECK-I686-NEXT: movzwl 6(%esi), %eax
-; CHECK-I686-NEXT: movl %eax, (%esp)
-; CHECK-I686-NEXT: calll __gnu_h2f_ieee
-; CHECK-I686-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
-; CHECK-I686-NEXT: movzwl 4(%esi), %eax
+; CHECK-I686-NEXT: subl $64, %esp
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-I686-NEXT: movzwl 6(%eax), %esi
+; CHECK-I686-NEXT: movzwl (%eax), %edi
+; CHECK-I686-NEXT: movzwl 2(%eax), %ebx
+; CHECK-I686-NEXT: movzwl 4(%eax), %eax
; CHECK-I686-NEXT: movl %eax, (%esp)
; CHECK-I686-NEXT: calll __gnu_h2f_ieee
; CHECK-I686-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
-; CHECK-I686-NEXT: movzwl 2(%esi), %eax
-; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: movl %ebx, (%esp)
; CHECK-I686-NEXT: calll __gnu_h2f_ieee
; CHECK-I686-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
-; CHECK-I686-NEXT: movzwl (%esi), %eax
-; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: movl %edi, (%esp)
; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: movl %esi, (%esp)
; CHECK-I686-NEXT: fstpl {{[0-9]+}}(%esp)
; CHECK-I686-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
; CHECK-I686-NEXT: fstpl {{[0-9]+}}(%esp)
; CHECK-I686-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
; CHECK-I686-NEXT: fstpl {{[0-9]+}}(%esp)
-; CHECK-I686-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
; CHECK-I686-NEXT: fstpl {{[0-9]+}}(%esp)
; CHECK-I686-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-I686-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; CHECK-I686-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-I686-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
-; CHECK-I686-NEXT: addl $88, %esp
+; CHECK-I686-NEXT: addl $64, %esp
; CHECK-I686-NEXT: popl %esi
+; CHECK-I686-NEXT: popl %edi
+; CHECK-I686-NEXT: popl %ebx
; CHECK-I686-NEXT: retl
%a = load <4 x half>, <4 x half>* %p, align 8
%b = fpext <4 x half> %a to <4 x double>
@@ -843,9 +875,7 @@ define half @test_f80trunc_nodagcombine() #0 {
; CHECK-LIBCALL-NEXT: pushq %rax
; CHECK-LIBCALL-NEXT: callq test_floatret
; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee
-; CHECK-LIBCALL-NEXT: movzwl %ax, %edi
-; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT: popq %rax
+; CHECK-LIBCALL-NEXT: popq %rcx
; CHECK-LIBCALL-NEXT: retq
;
; BWON-F16C-LABEL: test_f80trunc_nodagcombine:
@@ -853,8 +883,9 @@ define half @test_f80trunc_nodagcombine() #0 {
; BWON-F16C-NEXT: pushq %rax
; BWON-F16C-NEXT: callq test_floatret
; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
-; BWON-F16C-NEXT: popq %rax
+; BWON-F16C-NEXT: vmovd %xmm0, %eax
+; BWON-F16C-NEXT: # kill: def $ax killed $ax killed $eax
+; BWON-F16C-NEXT: popq %rcx
; BWON-F16C-NEXT: retq
;
; CHECK-I686-LABEL: test_f80trunc_nodagcombine:
@@ -863,9 +894,6 @@ define half @test_f80trunc_nodagcombine() #0 {
; CHECK-I686-NEXT: calll test_floatret
; CHECK-I686-NEXT: fstps (%esp)
; CHECK-I686-NEXT: calll __gnu_f2h_ieee
-; CHECK-I686-NEXT: movzwl %ax, %eax
-; CHECK-I686-NEXT: movl %eax, (%esp)
-; CHECK-I686-NEXT: calll __gnu_h2f_ieee
; CHECK-I686-NEXT: addl $12, %esp
; CHECK-I686-NEXT: retl
%1 = call float @test_floatret()
@@ -881,54 +909,62 @@ define float @test_sitofp_fadd_i32(i32 %a, half* %b) #0 {
; CHECK-LIBCALL: # %bb.0:
; CHECK-LIBCALL-NEXT: pushq %rbx
; CHECK-LIBCALL-NEXT: subq $16, %rsp
-; CHECK-LIBCALL-NEXT: movl %edi, %ebx
-; CHECK-LIBCALL-NEXT: movzwl (%rsi), %edi
-; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-LIBCALL-NEXT: cvtsi2ss %ebx, %xmm0
+; CHECK-LIBCALL-NEXT: movzwl (%rsi), %ebx
+; CHECK-LIBCALL-NEXT: cvtsi2ss %edi, %xmm0
; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee
; CHECK-LIBCALL-NEXT: movzwl %ax, %edi
; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-LIBCALL-NEXT: movl %ebx, %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
; CHECK-LIBCALL-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee
+; CHECK-LIBCALL-NEXT: movzwl %ax, %edi
; CHECK-LIBCALL-NEXT: addq $16, %rsp
; CHECK-LIBCALL-NEXT: popq %rbx
-; CHECK-LIBCALL-NEXT: retq
+; CHECK-LIBCALL-NEXT: jmp __gnu_h2f_ieee # TAILCALL
;
; BWON-F16C-LABEL: test_sitofp_fadd_i32:
; BWON-F16C: # %bb.0:
-; BWON-F16C-NEXT: movswl (%rsi), %eax
-; BWON-F16C-NEXT: vmovd %eax, %xmm0
+; BWON-F16C-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0
+; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
-; BWON-F16C-NEXT: vcvtsi2ss %edi, %xmm1, %xmm1
-; BWON-F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; BWON-F16C-NEXT: movswl (%rsi), %eax
+; BWON-F16C-NEXT: vmovd %eax, %xmm1
; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1
-; BWON-F16C-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; BWON-F16C-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; BWON-F16C-NEXT: retq
;
; CHECK-I686-LABEL: test_sitofp_fadd_i32:
; CHECK-I686: # %bb.0:
-; CHECK-I686-NEXT: subl $28, %esp
+; CHECK-I686-NEXT: pushl %edi
+; CHECK-I686-NEXT: pushl %esi
+; CHECK-I686-NEXT: subl $20, %esp
; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-I686-NEXT: movzwl (%eax), %eax
+; CHECK-I686-NEXT: movzwl (%eax), %edi
+; CHECK-I686-NEXT: cvtsi2ssl {{[0-9]+}}(%esp), %xmm0
+; CHECK-I686-NEXT: movss %xmm0, (%esp)
+; CHECK-I686-NEXT: calll __gnu_f2h_ieee
+; CHECK-I686-NEXT: movw %ax, %si
+; CHECK-I686-NEXT: movl %edi, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: movzwl %si, %eax
; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp)
; CHECK-I686-NEXT: calll __gnu_h2f_ieee
; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp)
; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-I686-NEXT: movss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-I686-NEXT: xorps %xmm0, %xmm0
-; CHECK-I686-NEXT: cvtsi2ssl {{[0-9]+}}(%esp), %xmm0
+; CHECK-I686-NEXT: addss {{[0-9]+}}(%esp), %xmm0
; CHECK-I686-NEXT: movss %xmm0, (%esp)
; CHECK-I686-NEXT: calll __gnu_f2h_ieee
; CHECK-I686-NEXT: movzwl %ax, %eax
; CHECK-I686-NEXT: movl %eax, (%esp)
; CHECK-I686-NEXT: calll __gnu_h2f_ieee
-; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp)
-; CHECK-I686-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-I686-NEXT: # xmm0 = mem[0],zero,zero,zero
-; CHECK-I686-NEXT: addss {{[0-9]+}}(%esp), %xmm0
-; CHECK-I686-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
-; CHECK-I686-NEXT: flds {{[0-9]+}}(%esp)
-; CHECK-I686-NEXT: addl $28, %esp
+; CHECK-I686-NEXT: addl $20, %esp
+; CHECK-I686-NEXT: popl %esi
+; CHECK-I686-NEXT: popl %edi
; CHECK-I686-NEXT: retl
%tmp0 = load half, half* %b
%tmp1 = sitofp i32 %a to half
@@ -941,58 +977,47 @@ define half @PR40273(half) #0 {
; CHECK-LIBCALL-LABEL: PR40273:
; CHECK-LIBCALL: # %bb.0:
; CHECK-LIBCALL-NEXT: pushq %rax
-; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee
-; CHECK-LIBCALL-NEXT: movzwl %ax, %edi
+; CHECK-LIBCALL-NEXT: movzwl %di, %edi
; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: xorl %eax, %eax
; CHECK-LIBCALL-NEXT: xorps %xmm1, %xmm1
; CHECK-LIBCALL-NEXT: ucomiss %xmm1, %xmm0
-; CHECK-LIBCALL-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-LIBCALL-NEXT: jne .LBB17_3
-; CHECK-LIBCALL-NEXT: # %bb.1:
-; CHECK-LIBCALL-NEXT: jp .LBB17_3
-; CHECK-LIBCALL-NEXT: # %bb.2:
-; CHECK-LIBCALL-NEXT: xorps %xmm0, %xmm0
-; CHECK-LIBCALL-NEXT: .LBB17_3:
-; CHECK-LIBCALL-NEXT: popq %rax
+; CHECK-LIBCALL-NEXT: movl $15360, %ecx # imm = 0x3C00
+; CHECK-LIBCALL-NEXT: cmovnel %ecx, %eax
+; CHECK-LIBCALL-NEXT: cmovpl %ecx, %eax
+; CHECK-LIBCALL-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-LIBCALL-NEXT: popq %rcx
; CHECK-LIBCALL-NEXT: retq
;
; BWON-F16C-LABEL: PR40273:
; BWON-F16C: # %bb.0:
-; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; BWON-F16C-NEXT: movswl %di, %eax
+; BWON-F16C-NEXT: vmovd %eax, %xmm0
; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; BWON-F16C-NEXT: xorl %eax, %eax
; BWON-F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1
; BWON-F16C-NEXT: vucomiss %xmm1, %xmm0
-; BWON-F16C-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; BWON-F16C-NEXT: jne .LBB17_3
-; BWON-F16C-NEXT: # %bb.1:
-; BWON-F16C-NEXT: jp .LBB17_3
-; BWON-F16C-NEXT: # %bb.2:
-; BWON-F16C-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; BWON-F16C-NEXT: .LBB17_3:
+; BWON-F16C-NEXT: movl $15360, %ecx # imm = 0x3C00
+; BWON-F16C-NEXT: cmovnel %ecx, %eax
+; BWON-F16C-NEXT: cmovpl %ecx, %eax
+; BWON-F16C-NEXT: # kill: def $ax killed $ax killed $eax
; BWON-F16C-NEXT: retq
;
; CHECK-I686-LABEL: PR40273:
; CHECK-I686: # %bb.0:
; CHECK-I686-NEXT: subl $12, %esp
-; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-I686-NEXT: movss %xmm0, (%esp)
-; CHECK-I686-NEXT: calll __gnu_f2h_ieee
-; CHECK-I686-NEXT: movzwl %ax, %eax
+; CHECK-I686-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; CHECK-I686-NEXT: movl %eax, (%esp)
; CHECK-I686-NEXT: calll __gnu_h2f_ieee
; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp)
; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT: xorl %eax, %eax
; CHECK-I686-NEXT: xorps %xmm1, %xmm1
; CHECK-I686-NEXT: ucomiss %xmm1, %xmm0
-; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-I686-NEXT: jne .LBB17_3
-; CHECK-I686-NEXT: # %bb.1:
-; CHECK-I686-NEXT: jp .LBB17_3
-; CHECK-I686-NEXT: # %bb.2:
-; CHECK-I686-NEXT: xorps %xmm0, %xmm0
-; CHECK-I686-NEXT: .LBB17_3:
-; CHECK-I686-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
-; CHECK-I686-NEXT: flds {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: movl $15360, %ecx # imm = 0x3C00
+; CHECK-I686-NEXT: cmovnel %ecx, %eax
+; CHECK-I686-NEXT: cmovpl %ecx, %eax
+; CHECK-I686-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-I686-NEXT: addl $12, %esp
; CHECK-I686-NEXT: retl
%2 = fcmp une half %0, 0xH0000
diff --git a/llvm/test/CodeGen/X86/mxcsr-reg-usage.ll b/llvm/test/CodeGen/X86/mxcsr-reg-usage.ll
index 851828612420..fcce4e77905a 100644
--- a/llvm/test/CodeGen/X86/mxcsr-reg-usage.ll
+++ b/llvm/test/CodeGen/X86/mxcsr-reg-usage.ll
@@ -17,7 +17,6 @@ define x86_mmx @mxcsr_mmx(<4 x float> %a0) {
define half @mxcsr_f16c(float %a) {
; CHECK: VCVTPS2PH{{.*}}mxcsr
-; CHECK: VCVTPH2PS{{.*}}mxcsr
%res = fptrunc float %a to half
ret half %res
}
diff --git a/llvm/test/CodeGen/X86/pr31088.ll b/llvm/test/CodeGen/X86/pr31088.ll
index f443ff417cc9..258912407df3 100644
--- a/llvm/test/CodeGen/X86/pr31088.ll
+++ b/llvm/test/CodeGen/X86/pr31088.ll
@@ -6,55 +6,52 @@
define <1 x half> @ir_fadd_v1f16(<1 x half> %arg0, <1 x half> %arg1) nounwind {
; X86-LABEL: ir_fadd_v1f16:
; X86: # %bb.0:
-; X86-NEXT: subl $28, %esp
-; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: movss %xmm0, (%esp)
-; X86-NEXT: calll __gnu_f2h_ieee
-; X86-NEXT: movzwl %ax, %eax
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %eax, (%esp)
; X86-NEXT: calll __gnu_h2f_ieee
-; X86-NEXT: fstpt {{[0-9]+}}(%esp) # 10-byte Folded Spill
-; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: movss %xmm0, (%esp)
-; X86-NEXT: calll __gnu_f2h_ieee
-; X86-NEXT: movzwl %ax, %eax
-; X86-NEXT: movl %eax, (%esp)
-; X86-NEXT: fldt {{[0-9]+}}(%esp) # 10-byte Folded Reload
+; X86-NEXT: movl %esi, (%esp)
; X86-NEXT: fstps {{[0-9]+}}(%esp)
; X86-NEXT: calll __gnu_h2f_ieee
; X86-NEXT: fstps {{[0-9]+}}(%esp)
; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: addss {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: flds {{[0-9]+}}(%esp)
-; X86-NEXT: addl $28, %esp
+; X86-NEXT: movss %xmm0, (%esp)
+; X86-NEXT: calll __gnu_f2h_ieee
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: popl %esi
; X86-NEXT: retl
;
; X64-LABEL: ir_fadd_v1f16:
; X64: # %bb.0:
-; X64-NEXT: pushq %rax
-; X64-NEXT: movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
-; X64-NEXT: movaps %xmm1, %xmm0
-; X64-NEXT: callq __gnu_f2h_ieee
-; X64-NEXT: movzwl %ax, %edi
+; X64-NEXT: pushq %rbx
+; X64-NEXT: subq $16, %rsp
+; X64-NEXT: movl %edi, %ebx
+; X64-NEXT: movzwl %si, %edi
; X64-NEXT: callq __gnu_h2f_ieee
-; X64-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
-; X64-NEXT: movss {{[0-9]+}}(%rsp), %xmm0 # 4-byte Reload
-; X64-NEXT: # xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: callq __gnu_f2h_ieee
-; X64-NEXT: movzwl %ax, %edi
+; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT: movzwl %bx, %edi
; X64-NEXT: callq __gnu_h2f_ieee
-; X64-NEXT: addss (%rsp), %xmm0 # 4-byte Folded Reload
-; X64-NEXT: popq %rax
+; X64-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; X64-NEXT: callq __gnu_f2h_ieee
+; X64-NEXT: addq $16, %rsp
+; X64-NEXT: popq %rbx
; X64-NEXT: retq
;
; F16C-LABEL: ir_fadd_v1f16:
; F16C: # %bb.0:
-; F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; F16C-NEXT: movswl %si, %eax
+; F16C-NEXT: vmovd %eax, %xmm0
+; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; F16C-NEXT: movswl %di, %eax
+; F16C-NEXT: vmovd %eax, %xmm1
; F16C-NEXT: vcvtph2ps %xmm1, %xmm1
+; F16C-NEXT: vaddss %xmm0, %xmm1, %xmm0
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
-; F16C-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; F16C-NEXT: vmovd %xmm0, %eax
+; F16C-NEXT: # kill: def $ax killed $ax killed $eax
; F16C-NEXT: retq
%retval = fadd <1 x half> %arg0, %arg1
ret <1 x half> %retval
@@ -63,99 +60,118 @@ define <1 x half> @ir_fadd_v1f16(<1 x half> %arg0, <1 x half> %arg1) nounwind {
define <2 x half> @ir_fadd_v2f16(<2 x half> %arg0, <2 x half> %arg1) nounwind {
; X86-LABEL: ir_fadd_v2f16:
; X86: # %bb.0:
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: movss %xmm0, (%esp)
-; X86-NEXT: calll __gnu_f2h_ieee
-; X86-NEXT: movzwl %ax, %eax
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $80, %esp
+; X86-NEXT: movzwl 8(%ebp), %esi
+; X86-NEXT: movzwl 12(%ebp), %edi
+; X86-NEXT: movzwl 20(%ebp), %ebx
+; X86-NEXT: movzwl 16(%ebp), %eax
; X86-NEXT: movl %eax, (%esp)
; X86-NEXT: calll __gnu_h2f_ieee
-; X86-NEXT: fstpt {{[0-9]+}}(%esp) # 10-byte Folded Spill
-; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: movss %xmm0, (%esp)
-; X86-NEXT: calll __gnu_f2h_ieee
-; X86-NEXT: movzwl %ax, %eax
-; X86-NEXT: movl %eax, (%esp)
+; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
+; X86-NEXT: movl %ebx, (%esp)
; X86-NEXT: calll __gnu_h2f_ieee
-; X86-NEXT: fstpt {{[0-9]+}}(%esp) # 10-byte Folded Spill
-; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: movss %xmm0, (%esp)
-; X86-NEXT: calll __gnu_f2h_ieee
-; X86-NEXT: movzwl %ax, %eax
-; X86-NEXT: movl %eax, (%esp)
+; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
+; X86-NEXT: movl %edi, (%esp)
; X86-NEXT: calll __gnu_h2f_ieee
-; X86-NEXT: fstpt {{[0-9]+}}(%esp) # 10-byte Folded Spill
-; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: movss %xmm0, (%esp)
-; X86-NEXT: calll __gnu_f2h_ieee
-; X86-NEXT: movzwl %ax, %eax
-; X86-NEXT: movl %eax, (%esp)
-; X86-NEXT: fldt {{[0-9]+}}(%esp) # 10-byte Folded Reload
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: fldt {{[0-9]+}}(%esp) # 10-byte Folded Reload
+; X86-NEXT: movl %esi, (%esp)
; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: fldt {{[0-9]+}}(%esp) # 10-byte Folded Reload
+; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
; X86-NEXT: fstps {{[0-9]+}}(%esp)
; X86-NEXT: calll __gnu_h2f_ieee
+; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: addss {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT: movss %xmm0, (%esp)
; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
+; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: calll __gnu_f2h_ieee
; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT: addss {{[0-9]+}}(%esp), %xmm1
; X86-NEXT: addss {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: movss %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: flds {{[0-9]+}}(%esp)
-; X86-NEXT: flds {{[0-9]+}}(%esp)
-; X86-NEXT: addl $64, %esp
+; X86-NEXT: movss %xmm0, (%esp)
+; X86-NEXT: movw %ax, {{[0-9]+}}(%esp)
+; X86-NEXT: calll __gnu_f2h_ieee
+; X86-NEXT: movw %ax, {{[0-9]+}}(%esp)
+; X86-NEXT: movdqa {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT: movd %xmm0, %eax
+; X86-NEXT: pextrw $1, %xmm0, %edx
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: # kill: def $dx killed $dx killed $edx
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: ir_fadd_v2f16:
; X64: # %bb.0:
-; X64-NEXT: subq $24, %rsp
-; X64-NEXT: movss %xmm2, {{[0-9]+}}(%rsp) # 4-byte Spill
-; X64-NEXT: movss %xmm1, {{[0-9]+}}(%rsp) # 4-byte Spill
-; X64-NEXT: movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
-; X64-NEXT: movaps %xmm3, %xmm0
-; X64-NEXT: callq __gnu_f2h_ieee
-; X64-NEXT: movzwl %ax, %edi
+; X64-NEXT: pushq %rbp
+; X64-NEXT: pushq %r14
+; X64-NEXT: pushq %rbx
+; X64-NEXT: subq $32, %rsp
+; X64-NEXT: movl %edx, %ebx
+; X64-NEXT: movl %esi, %ebp
+; X64-NEXT: movl %edi, %r14d
+; X64-NEXT: movzwl %cx, %edi
; X64-NEXT: callq __gnu_h2f_ieee
-; X64-NEXT: movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
-; X64-NEXT: movss {{[0-9]+}}(%rsp), %xmm0 # 4-byte Reload
-; X64-NEXT: # xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: callq __gnu_f2h_ieee
-; X64-NEXT: movzwl %ax, %edi
+; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT: movzwl %bp, %edi
; X64-NEXT: callq __gnu_h2f_ieee
-; X64-NEXT: movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
-; X64-NEXT: movss {{[0-9]+}}(%rsp), %xmm0 # 4-byte Reload
-; X64-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; X64-NEXT: callq __gnu_f2h_ieee
-; X64-NEXT: movzwl %ax, %edi
+; X64-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; X64-NEXT: movzwl %bx, %edi
; X64-NEXT: callq __gnu_h2f_ieee
-; X64-NEXT: movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
-; X64-NEXT: movss {{[0-9]+}}(%rsp), %xmm0 # 4-byte Reload
-; X64-NEXT: # xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: callq __gnu_f2h_ieee
-; X64-NEXT: movzwl %ax, %edi
+; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT: movzwl %r14w, %edi
; X64-NEXT: callq __gnu_h2f_ieee
-; X64-NEXT: addss {{[0-9]+}}(%rsp), %xmm0 # 4-byte Folded Reload
-; X64-NEXT: movss {{[0-9]+}}(%rsp), %xmm1 # 4-byte Reload
-; X64-NEXT: # xmm1 = mem[0],zero,zero,zero
-; X64-NEXT: addss {{[0-9]+}}(%rsp), %xmm1 # 4-byte Folded Reload
-; X64-NEXT: addq $24, %rsp
+; X64-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; X64-NEXT: callq __gnu_f2h_ieee
+; X64-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; X64-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0
+; X64-NEXT: movd %xmm0, %eax
+; X64-NEXT: pextrw $1, %xmm0, %edx
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: # kill: def $dx killed $dx killed $edx
+; X64-NEXT: addq $32, %rsp
+; X64-NEXT: popq %rbx
+; X64-NEXT: popq %r14
+; X64-NEXT: popq %rbp
; X64-NEXT: retq
;
; F16C-LABEL: ir_fadd_v2f16:
; F16C: # %bb.0:
-; F16C-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; F16C-NEXT: vcvtph2ps %xmm3, %xmm3
-; F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; F16C-NEXT: movswl %cx, %eax
+; F16C-NEXT: vmovd %eax, %xmm0
+; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; F16C-NEXT: movswl %si, %eax
+; F16C-NEXT: vmovd %eax, %xmm1
; F16C-NEXT: vcvtph2ps %xmm1, %xmm1
-; F16C-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; F16C-NEXT: vcvtph2ps %xmm2, %xmm2
+; F16C-NEXT: vaddss %xmm0, %xmm1, %xmm0
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; F16C-NEXT: vmovd %xmm0, %eax
+; F16C-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; F16C-NEXT: movswl %dx, %eax
+; F16C-NEXT: vmovd %eax, %xmm0
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
-; F16C-NEXT: vaddss %xmm2, %xmm0, %xmm0
-; F16C-NEXT: vaddss %xmm3, %xmm1, %xmm1
+; F16C-NEXT: movswl %di, %eax
+; F16C-NEXT: vmovd %eax, %xmm1
+; F16C-NEXT: vcvtph2ps %xmm1, %xmm1
+; F16C-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; F16C-NEXT: vmovd %xmm0, %eax
+; F16C-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; F16C-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0
+; F16C-NEXT: vmovd %xmm0, %eax
+; F16C-NEXT: vpextrw $1, %xmm0, %edx
+; F16C-NEXT: # kill: def $ax killed $ax killed $eax
+; F16C-NEXT: # kill: def $dx killed $dx killed $edx
; F16C-NEXT: retq
%retval = fadd <2 x half> %arg0, %arg1
ret <2 x half> %retval
diff --git a/llvm/test/CodeGen/X86/pr38533.ll b/llvm/test/CodeGen/X86/pr38533.ll
index 59c67acc9be6..74b8f0c101f7 100644
--- a/llvm/test/CodeGen/X86/pr38533.ll
+++ b/llvm/test/CodeGen/X86/pr38533.ll
@@ -14,22 +14,10 @@ define void @constant_fold_vector_to_half() {
; Similarly this makes sure that the opposite bitcast of the above is also legalized without crashing.
define void @pr38533_2(half %x) {
-; SSE-LABEL: pr38533_2:
-; SSE: # %bb.0:
-; SSE-NEXT: pushq %rax
-; SSE-NEXT: .cfi_def_cfa_offset 16
-; SSE-NEXT: callq __gnu_f2h_ieee
-; SSE-NEXT: movw %ax, (%rax)
-; SSE-NEXT: popq %rax
-; SSE-NEXT: .cfi_def_cfa_offset 8
-; SSE-NEXT: retq
-;
-; AVX512-LABEL: pr38533_2:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: movw %ax, (%rax)
-; AVX512-NEXT: retq
+; CHECK-LABEL: pr38533_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movw %di, (%rax)
+; CHECK-NEXT: retq
%a = bitcast half %x to <4 x i4>
store volatile <4 x i4> %a, <4 x i4>* undef
ret void
@@ -37,22 +25,10 @@ define void @pr38533_2(half %x) {
; This case is a bitcast from fp16 to a 16-bit wide legal vector type. In this case the result type is legal when the bitcast gets type legalized.
define void @pr38533_3(half %x) {
-; SSE-LABEL: pr38533_3:
-; SSE: # %bb.0:
-; SSE-NEXT: pushq %rax
-; SSE-NEXT: .cfi_def_cfa_offset 16
-; SSE-NEXT: callq __gnu_f2h_ieee
-; SSE-NEXT: movw %ax, (%rax)
-; SSE-NEXT: popq %rax
-; SSE-NEXT: .cfi_def_cfa_offset 8
-; SSE-NEXT: retq
-;
-; AVX512-LABEL: pr38533_3:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: movw %ax, (%rax)
-; AVX512-NEXT: retq
+; CHECK-LABEL: pr38533_3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movw %di, (%rax)
+; CHECK-NEXT: retq
%a = bitcast half %x to <16 x i1>
store volatile <16 x i1> %a, <16 x i1>* undef
ret void
diff --git a/llvm/test/CodeGen/X86/shuffle-extract-subvector.ll b/llvm/test/CodeGen/X86/shuffle-extract-subvector.ll
index 02ec475b05df..6ddb225108fc 100644
--- a/llvm/test/CodeGen/X86/shuffle-extract-subvector.ll
+++ b/llvm/test/CodeGen/X86/shuffle-extract-subvector.ll
@@ -4,22 +4,28 @@
define void @f(<4 x half>* %a, <4 x half>* %b, <8 x half>* %c) {
; CHECK-LABEL: f:
; CHECK: # %bb.0:
-; CHECK-NEXT: movzwl (%rdi), %r8d
-; CHECK-NEXT: movzwl 2(%rdi), %r9d
+; CHECK-NEXT: movzwl (%rdi), %eax
+; CHECK-NEXT: movzwl 2(%rdi), %ecx
+; CHECK-NEXT: movw %cx, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movzwl 6(%rdi), %r8d
; CHECK-NEXT: movzwl 4(%rdi), %r11d
-; CHECK-NEXT: movzwl 6(%rdi), %edi
-; CHECK-NEXT: movzwl (%rsi), %r10d
-; CHECK-NEXT: movzwl 2(%rsi), %ecx
-; CHECK-NEXT: movzwl 4(%rsi), %eax
-; CHECK-NEXT: movzwl 6(%rsi), %esi
-; CHECK-NEXT: movw %si, 14(%rdx)
-; CHECK-NEXT: movw %di, 12(%rdx)
-; CHECK-NEXT: movw %ax, 10(%rdx)
+; CHECK-NEXT: movq (%rsi), %rsi
+; CHECK-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-NEXT: pextrw $1, %xmm0, %r9d
+; CHECK-NEXT: movd %xmm0, %r10d
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi
+; CHECK-NEXT: pextrw $3, %xmm0, %eax
+; CHECK-NEXT: pextrw $2, %xmm0, %edi
; CHECK-NEXT: movw %r11w, 8(%rdx)
-; CHECK-NEXT: movw %cx, 6(%rdx)
-; CHECK-NEXT: movw %r9w, 4(%rdx)
+; CHECK-NEXT: movw %cx, 4(%rdx)
+; CHECK-NEXT: movw %r8w, 12(%rdx)
+; CHECK-NEXT: movw %si, (%rdx)
+; CHECK-NEXT: movw %di, 10(%rdx)
+; CHECK-NEXT: movw %ax, 14(%rdx)
; CHECK-NEXT: movw %r10w, 2(%rdx)
-; CHECK-NEXT: movw %r8w, (%rdx)
+; CHECK-NEXT: movw %r9w, 6(%rdx)
; CHECK-NEXT: retq
%tmp4 = load <4 x half>, <4 x half>* %a
%tmp5 = load <4 x half>, <4 x half>* %b
diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll b/llvm/test/CodeGen/X86/vec_fp_to_int.ll
index bf2ea5e067cc..95204084385a 100644
--- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll
+++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll
@@ -2153,58 +2153,56 @@ define <8 x i32> @fptoui_8f32_to_8i32_const(<8 x float> %a) {
define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind {
; SSE-LABEL: fptosi_2f16_to_4i32:
; SSE: # %bb.0:
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: pushq %rbx
; SSE-NEXT: pushq %rax
-; SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE-NEXT: movaps %xmm1, %xmm0
-; SSE-NEXT: callq __gnu_f2h_ieee
-; SSE-NEXT: movzwl %ax, %edi
+; SSE-NEXT: movl %esi, %ebx
+; SSE-NEXT: movzwl %di, %edi
; SSE-NEXT: callq __gnu_h2f_ieee
-; SSE-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
-; SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT: callq __gnu_f2h_ieee
-; SSE-NEXT: movzwl %ax, %edi
+; SSE-NEXT: cvttss2si %xmm0, %ebp
+; SSE-NEXT: movzwl %bx, %edi
; SSE-NEXT: callq __gnu_h2f_ieee
; SSE-NEXT: cvttss2si %xmm0, %eax
-; SSE-NEXT: cvttss2si (%rsp), %ecx # 4-byte Folded Reload
-; SSE-NEXT: movd %ecx, %xmm0
-; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: movd %ebp, %xmm1
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero
-; SSE-NEXT: popq %rax
+; SSE-NEXT: addq $8, %rsp
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; VEX-LABEL: fptosi_2f16_to_4i32:
; VEX: # %bb.0:
+; VEX-NEXT: pushq %rbp
+; VEX-NEXT: pushq %rbx
; VEX-NEXT: pushq %rax
-; VEX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; VEX-NEXT: vmovaps %xmm1, %xmm0
-; VEX-NEXT: callq __gnu_f2h_ieee
-; VEX-NEXT: movzwl %ax, %edi
+; VEX-NEXT: movl %esi, %ebx
+; VEX-NEXT: movzwl %di, %edi
; VEX-NEXT: callq __gnu_h2f_ieee
-; VEX-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill
-; VEX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; VEX-NEXT: # xmm0 = mem[0],zero,zero,zero
-; VEX-NEXT: callq __gnu_f2h_ieee
-; VEX-NEXT: movzwl %ax, %edi
+; VEX-NEXT: vcvttss2si %xmm0, %ebp
+; VEX-NEXT: movzwl %bx, %edi
; VEX-NEXT: callq __gnu_h2f_ieee
; VEX-NEXT: vcvttss2si %xmm0, %eax
-; VEX-NEXT: vcvttss2si (%rsp), %ecx # 4-byte Folded Reload
-; VEX-NEXT: vmovd %ecx, %xmm0
-; VEX-NEXT: vmovd %eax, %xmm1
+; VEX-NEXT: vmovd %eax, %xmm0
+; VEX-NEXT: vmovd %ebp, %xmm1
; VEX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; VEX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; VEX-NEXT: popq %rax
+; VEX-NEXT: addq $8, %rsp
+; VEX-NEXT: popq %rbx
+; VEX-NEXT: popq %rbp
; VEX-NEXT: retq
;
; AVX512-LABEL: fptosi_2f16_to_4i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: movswl %di, %eax
+; AVX512-NEXT: vmovd %eax, %xmm0
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vcvttss2si %xmm0, %eax
-; AVX512-NEXT: vcvttss2si %xmm1, %ecx
+; AVX512-NEXT: movswl %si, %ecx
+; AVX512-NEXT: vmovd %ecx, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vcvttss2si %xmm0, %ecx
; AVX512-NEXT: vmovd %ecx, %xmm0
; AVX512-NEXT: vmovd %eax, %xmm1
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll
index 4895c06972e3..4d0fb57adb09 100644
--- a/llvm/test/CodeGen/X86/vector-half-conversions.ll
+++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll
@@ -26,26 +26,25 @@ define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) nounwind {
; ALL: # %bb.0:
; ALL-NEXT: vmovq %xmm0, %rax
; ALL-NEXT: movq %rax, %rcx
-; ALL-NEXT: movq %rax, %rdx
-; ALL-NEXT: movswl %ax, %esi
-; ALL-NEXT: # kill: def $eax killed $eax killed $rax
-; ALL-NEXT: shrl $16, %eax
; ALL-NEXT: shrq $32, %rcx
-; ALL-NEXT: shrq $48, %rdx
-; ALL-NEXT: movswl %dx, %edx
+; ALL-NEXT: movswl %ax, %edx
; ALL-NEXT: vmovd %edx, %xmm0
; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: movl %eax, %edx
+; ALL-NEXT: shrl $16, %edx
+; ALL-NEXT: movswl %dx, %edx
+; ALL-NEXT: vmovd %edx, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; ALL-NEXT: movswl %cx, %ecx
; ALL-NEXT: vmovd %ecx, %xmm1
; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; ALL-NEXT: shrq $48, %rax
; ALL-NEXT: cwtl
-; ALL-NEXT: vmovd %eax, %xmm2
-; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
-; ALL-NEXT: vmovd %esi, %xmm3
-; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
-; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; ALL-NEXT: vmovd %eax, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; ALL-NEXT: retq
%1 = bitcast <4 x i16> %a0 to <4 x half>
%2 = fpext <4 x half> %1 to <4 x float>
@@ -57,26 +56,25 @@ define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) nounwind {
; ALL: # %bb.0:
; ALL-NEXT: vmovq %xmm0, %rax
; ALL-NEXT: movq %rax, %rcx
-; ALL-NEXT: movq %rax, %rdx
-; ALL-NEXT: movswl %ax, %esi
-; ALL-NEXT: # kill: def $eax killed $eax killed $rax
-; ALL-NEXT: shrl $16, %eax
; ALL-NEXT: shrq $32, %rcx
-; ALL-NEXT: shrq $48, %rdx
-; ALL-NEXT: movswl %dx, %edx
+; ALL-NEXT: movswl %ax, %edx
; ALL-NEXT: vmovd %edx, %xmm0
; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: movl %eax, %edx
+; ALL-NEXT: shrl $16, %edx
+; ALL-NEXT: movswl %dx, %edx
+; ALL-NEXT: vmovd %edx, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; ALL-NEXT: movswl %cx, %ecx
; ALL-NEXT: vmovd %ecx, %xmm1
; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; ALL-NEXT: shrq $48, %rax
; ALL-NEXT: cwtl
-; ALL-NEXT: vmovd %eax, %xmm2
-; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
-; ALL-NEXT: vmovd %esi, %xmm3
-; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
-; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; ALL-NEXT: vmovd %eax, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; ALL-NEXT: retq
%1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = bitcast <4 x i16> %1 to <4 x half>
@@ -87,51 +85,49 @@ define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) nounwind {
define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) nounwind {
; ALL-LABEL: cvt_8i16_to_8f32:
; ALL: # %bb.0:
+; ALL-NEXT: vmovq %xmm0, %rax
+; ALL-NEXT: movq %rax, %rcx
+; ALL-NEXT: shrq $32, %rcx
; ALL-NEXT: vpextrq $1, %xmm0, %rdx
-; ALL-NEXT: movq %rdx, %r8
-; ALL-NEXT: movq %rdx, %r10
-; ALL-NEXT: movswl %dx, %r9d
-; ALL-NEXT: # kill: def $edx killed $edx killed $rdx
-; ALL-NEXT: shrl $16, %edx
-; ALL-NEXT: shrq $32, %r8
-; ALL-NEXT: shrq $48, %r10
-; ALL-NEXT: vmovq %xmm0, %rdi
-; ALL-NEXT: movq %rdi, %rax
-; ALL-NEXT: movq %rdi, %rsi
-; ALL-NEXT: movswl %di, %ecx
-; ALL-NEXT: # kill: def $edi killed $edi killed $rdi
+; ALL-NEXT: movq %rdx, %rsi
+; ALL-NEXT: shrq $32, %rsi
+; ALL-NEXT: movswl %dx, %edi
+; ALL-NEXT: vmovd %edi, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: movl %edx, %edi
; ALL-NEXT: shrl $16, %edi
-; ALL-NEXT: shrq $32, %rax
-; ALL-NEXT: shrq $48, %rsi
+; ALL-NEXT: movswl %di, %edi
+; ALL-NEXT: vmovd %edi, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; ALL-NEXT: movswl %si, %esi
-; ALL-NEXT: vmovd %esi, %xmm0
-; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
-; ALL-NEXT: cwtl
-; ALL-NEXT: vmovd %eax, %xmm1
+; ALL-NEXT: vmovd %esi, %xmm1
; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
-; ALL-NEXT: movswl %di, %eax
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; ALL-NEXT: shrq $48, %rdx
+; ALL-NEXT: movswl %dx, %edx
+; ALL-NEXT: vmovd %edx, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; ALL-NEXT: movswl %ax, %edx
+; ALL-NEXT: vmovd %edx, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: movl %eax, %edx
+; ALL-NEXT: shrl $16, %edx
+; ALL-NEXT: movswl %dx, %edx
+; ALL-NEXT: vmovd %edx, %xmm2
+; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
+; ALL-NEXT: movswl %cx, %ecx
+; ALL-NEXT: vmovd %ecx, %xmm2
+; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; ALL-NEXT: shrq $48, %rax
+; ALL-NEXT: cwtl
; ALL-NEXT: vmovd %eax, %xmm2
; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
-; ALL-NEXT: vmovd %ecx, %xmm3
-; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
-; ALL-NEXT: movswl %r10w, %eax
-; ALL-NEXT: vmovd %eax, %xmm4
-; ALL-NEXT: vcvtph2ps %xmm4, %xmm4
-; ALL-NEXT: movswl %r8w, %eax
-; ALL-NEXT: vmovd %eax, %xmm5
-; ALL-NEXT: vcvtph2ps %xmm5, %xmm5
-; ALL-NEXT: movswl %dx, %eax
-; ALL-NEXT: vmovd %eax, %xmm6
-; ALL-NEXT: vcvtph2ps %xmm6, %xmm6
-; ALL-NEXT: vmovd %r9d, %xmm7
-; ALL-NEXT: vcvtph2ps %xmm7, %xmm7
-; ALL-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
-; ALL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
-; ALL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
-; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; ALL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; ALL-NEXT: retq
%1 = bitcast <8 x i16> %a0 to <8 x half>
%2 = fpext <8 x half> %1 to <8 x float>
@@ -141,385 +137,277 @@ define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) nounwind {
define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_16i16_to_16f32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vmovq %xmm4, %rax
-; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: shrq $48, %rcx
-; AVX1-NEXT: movswl %cx, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm8
-; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: shrq $32, %rcx
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovq %xmm1, %r10
+; AVX1-NEXT: movq %r10, %r8
+; AVX1-NEXT: shrq $32, %r8
+; AVX1-NEXT: vpextrq $1, %xmm1, %rdx
+; AVX1-NEXT: movq %rdx, %r9
+; AVX1-NEXT: shrq $32, %r9
+; AVX1-NEXT: vmovq %xmm0, %rdi
+; AVX1-NEXT: movq %rdi, %r11
+; AVX1-NEXT: shrq $32, %r11
+; AVX1-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX1-NEXT: movq %rsi, %rax
+; AVX1-NEXT: shrq $32, %rax
+; AVX1-NEXT: movswl %si, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX1-NEXT: movl %esi, %ecx
+; AVX1-NEXT: shrl $16, %ecx
; AVX1-NEXT: movswl %cx, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm9
-; AVX1-NEXT: movswl %ax, %ecx
-; AVX1-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX1-NEXT: shrl $16, %eax
+; AVX1-NEXT: vmovd %ecx, %xmm1
+; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX1-NEXT: cwtl
-; AVX1-NEXT: vmovd %eax, %xmm10
-; AVX1-NEXT: vpextrq $1, %xmm4, %rax
-; AVX1-NEXT: vmovd %ecx, %xmm11
-; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: shrq $48, %rcx
-; AVX1-NEXT: movswl %cx, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm12
-; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: shrq $32, %rcx
-; AVX1-NEXT: movswl %cx, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm13
-; AVX1-NEXT: movswl %ax, %ecx
-; AVX1-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; AVX1-NEXT: shrq $48, %rsi
+; AVX1-NEXT: movswl %si, %eax
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX1-NEXT: movswl %di, %eax
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
; AVX1-NEXT: shrl $16, %eax
; AVX1-NEXT: cwtl
-; AVX1-NEXT: vmovd %eax, %xmm14
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vmovd %ecx, %xmm15
-; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: shrq $48, %rcx
-; AVX1-NEXT: movswl %cx, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm2
-; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: shrq $32, %rcx
-; AVX1-NEXT: movswl %cx, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm3
-; AVX1-NEXT: movswl %ax, %ecx
-; AVX1-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX1-NEXT: vmovd %eax, %xmm2
+; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
+; AVX1-NEXT: movswl %r11w, %eax
+; AVX1-NEXT: vmovd %eax, %xmm2
+; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; AVX1-NEXT: shrq $48, %rdi
+; AVX1-NEXT: movswl %di, %eax
+; AVX1-NEXT: vmovd %eax, %xmm2
+; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: movswl %dx, %eax
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX1-NEXT: movl %edx, %eax
; AVX1-NEXT: shrl $16, %eax
; AVX1-NEXT: cwtl
-; AVX1-NEXT: vmovd %eax, %xmm4
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: vmovd %ecx, %xmm0
-; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: shrq $48, %rcx
-; AVX1-NEXT: movswl %cx, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm5
-; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: shrq $32, %rcx
-; AVX1-NEXT: movswl %cx, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm6
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $16, %ecx
-; AVX1-NEXT: movswl %cx, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm7
-; AVX1-NEXT: cwtl
-; AVX1-NEXT: vmovd %eax, %xmm1
-; AVX1-NEXT: vcvtph2ps %xmm8, %xmm8
-; AVX1-NEXT: vcvtph2ps %xmm9, %xmm9
-; AVX1-NEXT: vcvtph2ps %xmm10, %xmm10
-; AVX1-NEXT: vcvtph2ps %xmm11, %xmm11
-; AVX1-NEXT: vcvtph2ps %xmm12, %xmm12
-; AVX1-NEXT: vcvtph2ps %xmm13, %xmm13
-; AVX1-NEXT: vcvtph2ps %xmm14, %xmm14
-; AVX1-NEXT: vcvtph2ps %xmm15, %xmm15
+; AVX1-NEXT: vmovd %eax, %xmm2
+; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
+; AVX1-NEXT: movswl %r9w, %eax
+; AVX1-NEXT: vmovd %eax, %xmm2
+; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; AVX1-NEXT: shrq $48, %rdx
+; AVX1-NEXT: movswl %dx, %eax
+; AVX1-NEXT: vmovd %eax, %xmm2
; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
+; AVX1-NEXT: movswl %r10w, %eax
+; AVX1-NEXT: vmovd %eax, %xmm2
+; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX1-NEXT: movl %r10d, %eax
+; AVX1-NEXT: shrl $16, %eax
+; AVX1-NEXT: cwtl
+; AVX1-NEXT: vmovd %eax, %xmm3
; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[2,3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[2,3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[2,3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
+; AVX1-NEXT: movswl %r8w, %eax
+; AVX1-NEXT: vmovd %eax, %xmm3
+; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
+; AVX1-NEXT: shrq $48, %r10
+; AVX1-NEXT: movswl %r10w, %eax
+; AVX1-NEXT: vmovd %eax, %xmm3
+; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_16i16_to_16f32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX2-NEXT: vmovq %xmm4, %rax
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: shrq $48, %rcx
-; AVX2-NEXT: movswl %cx, %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm8
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: shrq $32, %rcx
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovq %xmm1, %r10
+; AVX2-NEXT: movq %r10, %r8
+; AVX2-NEXT: shrq $32, %r8
+; AVX2-NEXT: vpextrq $1, %xmm1, %rdx
+; AVX2-NEXT: movq %rdx, %r9
+; AVX2-NEXT: shrq $32, %r9
+; AVX2-NEXT: vmovq %xmm0, %rdi
+; AVX2-NEXT: movq %rdi, %r11
+; AVX2-NEXT: shrq $32, %r11
+; AVX2-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX2-NEXT: movq %rsi, %rax
+; AVX2-NEXT: shrq $32, %rax
+; AVX2-NEXT: movswl %si, %ecx
+; AVX2-NEXT: vmovd %ecx, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX2-NEXT: movl %esi, %ecx
+; AVX2-NEXT: shrl $16, %ecx
; AVX2-NEXT: movswl %cx, %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm9
-; AVX2-NEXT: movswl %ax, %ecx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX2-NEXT: shrl $16, %eax
+; AVX2-NEXT: vmovd %ecx, %xmm1
+; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX2-NEXT: cwtl
-; AVX2-NEXT: vmovd %eax, %xmm10
-; AVX2-NEXT: vpextrq $1, %xmm4, %rax
-; AVX2-NEXT: vmovd %ecx, %xmm11
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: shrq $48, %rcx
-; AVX2-NEXT: movswl %cx, %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm12
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: shrq $32, %rcx
-; AVX2-NEXT: movswl %cx, %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm13
-; AVX2-NEXT: movswl %ax, %ecx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; AVX2-NEXT: shrq $48, %rsi
+; AVX2-NEXT: movswl %si, %eax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX2-NEXT: movswl %di, %eax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
; AVX2-NEXT: shrl $16, %eax
; AVX2-NEXT: cwtl
-; AVX2-NEXT: vmovd %eax, %xmm14
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vmovd %ecx, %xmm15
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: shrq $48, %rcx
-; AVX2-NEXT: movswl %cx, %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm2
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: shrq $32, %rcx
-; AVX2-NEXT: movswl %cx, %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm3
-; AVX2-NEXT: movswl %ax, %ecx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
+; AVX2-NEXT: movswl %r11w, %eax
+; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; AVX2-NEXT: shrq $48, %rdi
+; AVX2-NEXT: movswl %di, %eax
+; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: movswl %dx, %eax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: shrl $16, %eax
; AVX2-NEXT: cwtl
-; AVX2-NEXT: vmovd %eax, %xmm4
-; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: vmovd %ecx, %xmm0
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: shrq $48, %rcx
-; AVX2-NEXT: movswl %cx, %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm5
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: shrq $32, %rcx
-; AVX2-NEXT: movswl %cx, %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm6
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $16, %ecx
-; AVX2-NEXT: movswl %cx, %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm7
-; AVX2-NEXT: cwtl
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vcvtph2ps %xmm8, %xmm8
-; AVX2-NEXT: vcvtph2ps %xmm9, %xmm9
-; AVX2-NEXT: vcvtph2ps %xmm10, %xmm10
-; AVX2-NEXT: vcvtph2ps %xmm11, %xmm11
-; AVX2-NEXT: vcvtph2ps %xmm12, %xmm12
-; AVX2-NEXT: vcvtph2ps %xmm13, %xmm13
-; AVX2-NEXT: vcvtph2ps %xmm14, %xmm14
-; AVX2-NEXT: vcvtph2ps %xmm15, %xmm15
+; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
+; AVX2-NEXT: movswl %r9w, %eax
+; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; AVX2-NEXT: shrq $48, %rdx
+; AVX2-NEXT: movswl %dx, %eax
+; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
+; AVX2-NEXT: movswl %r10w, %eax
+; AVX2-NEXT: vmovd %eax, %xmm2
; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX2-NEXT: movl %r10d, %eax
+; AVX2-NEXT: shrl $16, %eax
+; AVX2-NEXT: cwtl
+; AVX2-NEXT: vmovd %eax, %xmm3
; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[2,3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[2,3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[2,3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
+; AVX2-NEXT: movswl %r8w, %eax
+; AVX2-NEXT: vmovd %eax, %xmm3
+; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
+; AVX2-NEXT: shrq $48, %r10
+; AVX2-NEXT: movswl %r10w, %eax
+; AVX2-NEXT: vmovd %eax, %xmm3
+; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: cvt_16i16_to_16f32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm10
-; AVX512F-NEXT: vmovq %xmm0, %rax
-; AVX512F-NEXT: movq %rax, %rcx
-; AVX512F-NEXT: shrq $48, %rcx
-; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm8
-; AVX512F-NEXT: movq %rax, %rcx
-; AVX512F-NEXT: shrq $32, %rcx
-; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm9
-; AVX512F-NEXT: movswl %ax, %ecx
-; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX512F-NEXT: shrl $16, %eax
-; AVX512F-NEXT: cwtl
-; AVX512F-NEXT: vmovd %eax, %xmm11
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT: vmovd %ecx, %xmm12
-; AVX512F-NEXT: movq %rax, %rcx
-; AVX512F-NEXT: shrq $48, %rcx
-; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm13
-; AVX512F-NEXT: movq %rax, %rcx
-; AVX512F-NEXT: shrq $32, %rcx
-; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm14
-; AVX512F-NEXT: movswl %ax, %ecx
-; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX512F-NEXT: shrl $16, %eax
-; AVX512F-NEXT: cwtl
-; AVX512F-NEXT: vmovd %eax, %xmm15
-; AVX512F-NEXT: vmovq %xmm10, %rax
-; AVX512F-NEXT: vmovd %ecx, %xmm2
-; AVX512F-NEXT: movq %rax, %rcx
-; AVX512F-NEXT: shrq $48, %rcx
-; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm3
-; AVX512F-NEXT: movq %rax, %rcx
-; AVX512F-NEXT: shrq $32, %rcx
-; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm1
-; AVX512F-NEXT: movswl %ax, %ecx
-; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX512F-NEXT: shrl $16, %eax
-; AVX512F-NEXT: cwtl
-; AVX512F-NEXT: vmovd %eax, %xmm4
-; AVX512F-NEXT: vpextrq $1, %xmm10, %rax
-; AVX512F-NEXT: vmovd %ecx, %xmm10
-; AVX512F-NEXT: movq %rax, %rcx
-; AVX512F-NEXT: shrq $48, %rcx
-; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm5
-; AVX512F-NEXT: movq %rax, %rcx
-; AVX512F-NEXT: shrq $32, %rcx
-; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm6
-; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: shrl $16, %ecx
-; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm7
-; AVX512F-NEXT: cwtl
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %xmm8, %xmm8
-; AVX512F-NEXT: vcvtph2ps %xmm9, %xmm9
-; AVX512F-NEXT: vcvtph2ps %xmm11, %xmm11
-; AVX512F-NEXT: vcvtph2ps %xmm12, %xmm12
-; AVX512F-NEXT: vcvtph2ps %xmm13, %xmm13
-; AVX512F-NEXT: vcvtph2ps %xmm14, %xmm14
-; AVX512F-NEXT: vcvtph2ps %xmm15, %xmm15
-; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512F-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX512F-NEXT: vcvtph2ps %xmm10, %xmm10
-; AVX512F-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX512F-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX512F-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0],xmm0[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[0]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0,1],xmm1[0],xmm4[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm14[0],xmm1[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[0]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: cvt_16i16_to_16f32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm10
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: movq %rax, %rcx
-; AVX512VL-NEXT: shrq $48, %rcx
-; AVX512VL-NEXT: movswl %cx, %ecx
-; AVX512VL-NEXT: vmovd %ecx, %xmm8
-; AVX512VL-NEXT: movq %rax, %rcx
-; AVX512VL-NEXT: shrq $32, %rcx
-; AVX512VL-NEXT: movswl %cx, %ecx
-; AVX512VL-NEXT: vmovd %ecx, %xmm9
-; AVX512VL-NEXT: movswl %ax, %ecx
-; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX512VL-NEXT: shrl $16, %eax
-; AVX512VL-NEXT: cwtl
-; AVX512VL-NEXT: vmovd %eax, %xmm11
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT: vmovd %ecx, %xmm12
-; AVX512VL-NEXT: movq %rax, %rcx
-; AVX512VL-NEXT: shrq $48, %rcx
-; AVX512VL-NEXT: movswl %cx, %ecx
-; AVX512VL-NEXT: vmovd %ecx, %xmm13
-; AVX512VL-NEXT: movq %rax, %rcx
-; AVX512VL-NEXT: shrq $32, %rcx
-; AVX512VL-NEXT: movswl %cx, %ecx
-; AVX512VL-NEXT: vmovd %ecx, %xmm14
-; AVX512VL-NEXT: movswl %ax, %ecx
-; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX512VL-NEXT: shrl $16, %eax
-; AVX512VL-NEXT: cwtl
-; AVX512VL-NEXT: vmovd %eax, %xmm15
-; AVX512VL-NEXT: vmovq %xmm10, %rax
-; AVX512VL-NEXT: vmovd %ecx, %xmm16
-; AVX512VL-NEXT: movq %rax, %rcx
-; AVX512VL-NEXT: shrq $48, %rcx
-; AVX512VL-NEXT: movswl %cx, %ecx
-; AVX512VL-NEXT: vmovd %ecx, %xmm17
-; AVX512VL-NEXT: movq %rax, %rcx
-; AVX512VL-NEXT: shrq $32, %rcx
-; AVX512VL-NEXT: movswl %cx, %ecx
-; AVX512VL-NEXT: vmovd %ecx, %xmm18
-; AVX512VL-NEXT: movswl %ax, %ecx
-; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX512VL-NEXT: shrl $16, %eax
-; AVX512VL-NEXT: cwtl
-; AVX512VL-NEXT: vmovd %eax, %xmm19
-; AVX512VL-NEXT: vpextrq $1, %xmm10, %rax
-; AVX512VL-NEXT: vmovd %ecx, %xmm10
-; AVX512VL-NEXT: movq %rax, %rcx
-; AVX512VL-NEXT: shrq $48, %rcx
-; AVX512VL-NEXT: movswl %cx, %ecx
-; AVX512VL-NEXT: vmovd %ecx, %xmm20
-; AVX512VL-NEXT: movq %rax, %rcx
-; AVX512VL-NEXT: shrq $32, %rcx
-; AVX512VL-NEXT: movswl %cx, %ecx
-; AVX512VL-NEXT: vmovd %ecx, %xmm21
-; AVX512VL-NEXT: movl %eax, %ecx
-; AVX512VL-NEXT: shrl $16, %ecx
-; AVX512VL-NEXT: movswl %cx, %ecx
-; AVX512VL-NEXT: vmovd %ecx, %xmm22
-; AVX512VL-NEXT: cwtl
-; AVX512VL-NEXT: vmovd %eax, %xmm2
-; AVX512VL-NEXT: vcvtph2ps %xmm8, %xmm8
-; AVX512VL-NEXT: vcvtph2ps %xmm9, %xmm9
-; AVX512VL-NEXT: vcvtph2ps %xmm11, %xmm11
-; AVX512VL-NEXT: vcvtph2ps %xmm12, %xmm12
-; AVX512VL-NEXT: vcvtph2ps %xmm13, %xmm13
-; AVX512VL-NEXT: vcvtph2ps %xmm14, %xmm14
-; AVX512VL-NEXT: vcvtph2ps %xmm15, %xmm15
-; AVX512VL-NEXT: vcvtph2ps %xmm16, %xmm16
-; AVX512VL-NEXT: vcvtph2ps %xmm17, %xmm4
-; AVX512VL-NEXT: vcvtph2ps %xmm18, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm19, %xmm5
-; AVX512VL-NEXT: vcvtph2ps %xmm10, %xmm7
-; AVX512VL-NEXT: vcvtph2ps %xmm20, %xmm3
-; AVX512VL-NEXT: vcvtph2ps %xmm21, %xmm6
-; AVX512VL-NEXT: vcvtph2ps %xmm22, %xmm1
-; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[2,3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm0[0],xmm2[3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm16[0],xmm15[0],xmm16[2,3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm14[0],xmm1[3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[0]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[2,3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX512VL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: cvt_16i16_to_16f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovq %xmm0, %r10
+; AVX512-NEXT: movq %r10, %r8
+; AVX512-NEXT: shrq $32, %r8
+; AVX512-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512-NEXT: movq %rdx, %r9
+; AVX512-NEXT: shrq $32, %r9
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovq %xmm0, %rdi
+; AVX512-NEXT: movq %rdi, %r11
+; AVX512-NEXT: shrq $32, %r11
+; AVX512-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX512-NEXT: movq %rsi, %rax
+; AVX512-NEXT: shrq $32, %rax
+; AVX512-NEXT: movswl %si, %ecx
+; AVX512-NEXT: vmovd %ecx, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: movl %esi, %ecx
+; AVX512-NEXT: shrl $16, %ecx
+; AVX512-NEXT: movswl %cx, %ecx
+; AVX512-NEXT: vmovd %ecx, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; AVX512-NEXT: cwtl
+; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; AVX512-NEXT: shrq $48, %rsi
+; AVX512-NEXT: movswl %si, %eax
+; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX512-NEXT: movswl %di, %eax
+; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: movl %edi, %eax
+; AVX512-NEXT: shrl $16, %eax
+; AVX512-NEXT: cwtl
+; AVX512-NEXT: vmovd %eax, %xmm2
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
+; AVX512-NEXT: movswl %r11w, %eax
+; AVX512-NEXT: vmovd %eax, %xmm2
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; AVX512-NEXT: shrq $48, %rdi
+; AVX512-NEXT: movswl %di, %eax
+; AVX512-NEXT: vmovd %eax, %xmm2
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
+; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT: movswl %dx, %eax
+; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: movl %edx, %eax
+; AVX512-NEXT: shrl $16, %eax
+; AVX512-NEXT: cwtl
+; AVX512-NEXT: vmovd %eax, %xmm2
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
+; AVX512-NEXT: movswl %r9w, %eax
+; AVX512-NEXT: vmovd %eax, %xmm2
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; AVX512-NEXT: shrq $48, %rdx
+; AVX512-NEXT: movswl %dx, %eax
+; AVX512-NEXT: vmovd %eax, %xmm2
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
+; AVX512-NEXT: movswl %r10w, %eax
+; AVX512-NEXT: vmovd %eax, %xmm2
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: movl %r10d, %eax
+; AVX512-NEXT: shrl $16, %eax
+; AVX512-NEXT: cwtl
+; AVX512-NEXT: vmovd %eax, %xmm3
+; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
+; AVX512-NEXT: movswl %r8w, %eax
+; AVX512-NEXT: vmovd %eax, %xmm3
+; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
+; AVX512-NEXT: shrq $48, %r10
+; AVX512-NEXT: movswl %r10w, %eax
+; AVX512-NEXT: vmovd %eax, %xmm3
+; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT: retq
%1 = bitcast <16 x i16> %a0 to <16 x half>
%2 = fpext <16 x half> %1 to <16 x float>
ret <16 x float> %2
@@ -545,20 +433,30 @@ define float @load_cvt_i16_to_f32(i16* %a0) nounwind {
define <4 x float> @load_cvt_4i16_to_4f32(<4 x i16>* %a0) nounwind {
; ALL-LABEL: load_cvt_4i16_to_4f32:
; ALL: # %bb.0:
-; ALL-NEXT: movswl 6(%rdi), %eax
-; ALL-NEXT: vmovd %eax, %xmm0
-; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
-; ALL-NEXT: movswl 4(%rdi), %eax
+; ALL-NEXT: movl (%rdi), %eax
+; ALL-NEXT: movl 4(%rdi), %ecx
+; ALL-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
+; ALL-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; ALL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0
+; ALL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm1
+; ALL-NEXT: vpextrw $1, %xmm1, %eax
+; ALL-NEXT: cwtl
+; ALL-NEXT: vmovd %eax, %xmm2
+; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: cwtl
; ALL-NEXT: vmovd %eax, %xmm1
; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
-; ALL-NEXT: movswl (%rdi), %eax
+; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
+; ALL-NEXT: vmovd %xmm0, %eax
+; ALL-NEXT: cwtl
; ALL-NEXT: vmovd %eax, %xmm2
; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
-; ALL-NEXT: movswl 2(%rdi), %eax
-; ALL-NEXT: vmovd %eax, %xmm3
-; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
-; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; ALL-NEXT: vpextrw $1, %xmm0, %eax
+; ALL-NEXT: cwtl
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; ALL-NEXT: retq
%1 = load <4 x i16>, <4 x i16>* %a0
@@ -572,26 +470,25 @@ define <4 x float> @load_cvt_8i16_to_4f32(<8 x i16>* %a0) nounwind {
; ALL: # %bb.0:
; ALL-NEXT: movq (%rdi), %rax
; ALL-NEXT: movq %rax, %rcx
-; ALL-NEXT: movq %rax, %rdx
-; ALL-NEXT: movswl %ax, %esi
-; ALL-NEXT: # kill: def $eax killed $eax killed $rax
-; ALL-NEXT: shrl $16, %eax
; ALL-NEXT: shrq $32, %rcx
-; ALL-NEXT: shrq $48, %rdx
-; ALL-NEXT: movswl %dx, %edx
+; ALL-NEXT: movswl %ax, %edx
; ALL-NEXT: vmovd %edx, %xmm0
; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: movl %eax, %edx
+; ALL-NEXT: shrl $16, %edx
+; ALL-NEXT: movswl %dx, %edx
+; ALL-NEXT: vmovd %edx, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; ALL-NEXT: movswl %cx, %ecx
; ALL-NEXT: vmovd %ecx, %xmm1
; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; ALL-NEXT: shrq $48, %rax
; ALL-NEXT: cwtl
-; ALL-NEXT: vmovd %eax, %xmm2
-; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
-; ALL-NEXT: vmovd %esi, %xmm3
-; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
-; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; ALL-NEXT: vmovd %eax, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; ALL-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a0
%2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -603,37 +500,57 @@ define <4 x float> @load_cvt_8i16_to_4f32(<8 x i16>* %a0) nounwind {
define <8 x float> @load_cvt_8i16_to_8f32(<8 x i16>* %a0) nounwind {
; ALL-LABEL: load_cvt_8i16_to_8f32:
; ALL: # %bb.0:
-; ALL-NEXT: movswl 6(%rdi), %eax
-; ALL-NEXT: vmovd %eax, %xmm0
-; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
-; ALL-NEXT: movswl 4(%rdi), %eax
-; ALL-NEXT: vmovd %eax, %xmm1
-; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
-; ALL-NEXT: movswl (%rdi), %eax
-; ALL-NEXT: vmovd %eax, %xmm2
-; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
-; ALL-NEXT: movswl 2(%rdi), %eax
+; ALL-NEXT: movl (%rdi), %eax
+; ALL-NEXT: movl 4(%rdi), %ecx
+; ALL-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
+; ALL-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; ALL-NEXT: movl 12(%rdi), %eax
+; ALL-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; ALL-NEXT: movl 8(%rdi), %eax
+; ALL-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; ALL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0
+; ALL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm1
+; ALL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2
+; ALL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3
+; ALL-NEXT: vpextrw $1, %xmm3, %eax
+; ALL-NEXT: cwtl
+; ALL-NEXT: vmovd %eax, %xmm4
+; ALL-NEXT: vcvtph2ps %xmm4, %xmm4
+; ALL-NEXT: vmovd %xmm3, %eax
+; ALL-NEXT: cwtl
; ALL-NEXT: vmovd %eax, %xmm3
; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
-; ALL-NEXT: movswl 14(%rdi), %eax
+; ALL-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3]
+; ALL-NEXT: vmovd %xmm2, %eax
+; ALL-NEXT: cwtl
; ALL-NEXT: vmovd %eax, %xmm4
; ALL-NEXT: vcvtph2ps %xmm4, %xmm4
-; ALL-NEXT: movswl 12(%rdi), %eax
-; ALL-NEXT: vmovd %eax, %xmm5
-; ALL-NEXT: vcvtph2ps %xmm5, %xmm5
-; ALL-NEXT: movswl 8(%rdi), %eax
-; ALL-NEXT: vmovd %eax, %xmm6
-; ALL-NEXT: vcvtph2ps %xmm6, %xmm6
-; ALL-NEXT: movswl 10(%rdi), %eax
-; ALL-NEXT: vmovd %eax, %xmm7
-; ALL-NEXT: vcvtph2ps %xmm7, %xmm7
-; ALL-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
-; ALL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
-; ALL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
-; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
+; ALL-NEXT: vpextrw $1, %xmm2, %eax
+; ALL-NEXT: cwtl
+; ALL-NEXT: vmovd %eax, %xmm2
+; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[0]
+; ALL-NEXT: vpextrw $1, %xmm1, %eax
+; ALL-NEXT: cwtl
+; ALL-NEXT: vmovd %eax, %xmm3
+; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: cwtl
+; ALL-NEXT: vmovd %eax, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[2,3]
+; ALL-NEXT: vmovd %xmm0, %eax
+; ALL-NEXT: cwtl
+; ALL-NEXT: vmovd %eax, %xmm3
+; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
+; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
+; ALL-NEXT: vpextrw $1, %xmm0, %eax
+; ALL-NEXT: cwtl
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; ALL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; ALL-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a0
%2 = bitcast <8 x i16> %1 to <8 x half>
@@ -644,268 +561,436 @@ define <8 x float> @load_cvt_8i16_to_8f32(<8 x i16>* %a0) nounwind {
define <16 x float> @load_cvt_16i16_to_16f32(<16 x i16>* %a0) nounwind {
; AVX1-LABEL: load_cvt_16i16_to_16f32:
; AVX1: # %bb.0:
-; AVX1-NEXT: movswl 22(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm8
-; AVX1-NEXT: movswl 20(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm9
-; AVX1-NEXT: movswl 16(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm10
-; AVX1-NEXT: movswl 18(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm11
-; AVX1-NEXT: movswl 30(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm12
-; AVX1-NEXT: movswl 28(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm13
-; AVX1-NEXT: movswl 24(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm14
-; AVX1-NEXT: movswl 26(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm15
-; AVX1-NEXT: movswl 6(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: movswl 4(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm2
-; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX1-NEXT: movswl (%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm3
-; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX1-NEXT: movswl 2(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm4
-; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX1-NEXT: movswl 14(%rdi), %eax
+; AVX1-NEXT: pushq %rax
+; AVX1-NEXT: movl 20(%rdi), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movl 16(%rdi), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movl 28(%rdi), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movl 24(%rdi), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movl (%rdi), %eax
+; AVX1-NEXT: movl 4(%rdi), %ecx
+; AVX1-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movl 12(%rdi), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movl 8(%rdi), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm8
+; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2
+; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3
+; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm4
+; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0
+; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm5
+; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm6
+; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm7
+; AVX1-NEXT: vpextrw $1, %xmm7, %eax
+; AVX1-NEXT: cwtl
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX1-NEXT: vmovd %xmm7, %eax
+; AVX1-NEXT: cwtl
+; AVX1-NEXT: vmovd %eax, %xmm7
+; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3]
+; AVX1-NEXT: vmovd %xmm6, %eax
+; AVX1-NEXT: cwtl
+; AVX1-NEXT: vmovd %eax, %xmm7
+; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm7[0],xmm1[3]
+; AVX1-NEXT: vpextrw $1, %xmm6, %eax
+; AVX1-NEXT: cwtl
+; AVX1-NEXT: vmovd %eax, %xmm6
+; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm6[0]
+; AVX1-NEXT: vpextrw $1, %xmm5, %eax
+; AVX1-NEXT: cwtl
+; AVX1-NEXT: vmovd %eax, %xmm6
+; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX1-NEXT: vmovd %xmm5, %eax
+; AVX1-NEXT: cwtl
; AVX1-NEXT: vmovd %eax, %xmm5
; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX1-NEXT: movswl 12(%rdi), %eax
+; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[2,3]
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: cwtl
; AVX1-NEXT: vmovd %eax, %xmm6
; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX1-NEXT: movswl 8(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm7
-; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX1-NEXT: movswl 10(%rdi), %eax
+; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0],xmm5[3]
+; AVX1-NEXT: vpextrw $1, %xmm0, %eax
+; AVX1-NEXT: cwtl
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpextrw $1, %xmm4, %eax
+; AVX1-NEXT: cwtl
; AVX1-NEXT: vmovd %eax, %xmm1
; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
+; AVX1-NEXT: vmovd %xmm4, %eax
+; AVX1-NEXT: cwtl
+; AVX1-NEXT: vmovd %eax, %xmm4
+; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[2,3]
+; AVX1-NEXT: vmovd %xmm3, %eax
+; AVX1-NEXT: cwtl
+; AVX1-NEXT: vmovd %eax, %xmm4
+; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3]
+; AVX1-NEXT: vpextrw $1, %xmm3, %eax
+; AVX1-NEXT: cwtl
+; AVX1-NEXT: vmovd %eax, %xmm3
+; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0]
+; AVX1-NEXT: vpextrw $1, %xmm2, %eax
+; AVX1-NEXT: cwtl
+; AVX1-NEXT: vmovd %eax, %xmm3
+; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX1-NEXT: vmovd %xmm2, %eax
+; AVX1-NEXT: cwtl
+; AVX1-NEXT: vmovd %eax, %xmm2
+; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
+; AVX1-NEXT: vmovd %xmm8, %eax
+; AVX1-NEXT: cwtl
+; AVX1-NEXT: vmovd %eax, %xmm3
+; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
+; AVX1-NEXT: vpextrw $1, %xmm8, %eax
+; AVX1-NEXT: cwtl
+; AVX1-NEXT: vmovd %eax, %xmm3
+; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: popq %rax
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_cvt_16i16_to_16f32:
; AVX2: # %bb.0:
-; AVX2-NEXT: movswl 22(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm8
-; AVX2-NEXT: movswl 20(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm9
-; AVX2-NEXT: movswl 16(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm10
-; AVX2-NEXT: movswl 18(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm11
-; AVX2-NEXT: movswl 30(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm12
-; AVX2-NEXT: movswl 28(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm13
-; AVX2-NEXT: movswl 24(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm14
-; AVX2-NEXT: movswl 26(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm15
-; AVX2-NEXT: movswl 6(%rdi), %eax
+; AVX2-NEXT: pushq %rax
+; AVX2-NEXT: movl 20(%rdi), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl 16(%rdi), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl 28(%rdi), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl 24(%rdi), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl (%rdi), %eax
+; AVX2-NEXT: movl 4(%rdi), %ecx
+; AVX2-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl 12(%rdi), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl 8(%rdi), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm8
+; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2
+; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3
+; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm4
+; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0
+; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm5
+; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm6
+; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm7
+; AVX2-NEXT: vpextrw $1, %xmm7, %eax
+; AVX2-NEXT: cwtl
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX2-NEXT: vmovd %xmm7, %eax
+; AVX2-NEXT: cwtl
+; AVX2-NEXT: vmovd %eax, %xmm7
+; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3]
+; AVX2-NEXT: vmovd %xmm6, %eax
+; AVX2-NEXT: cwtl
+; AVX2-NEXT: vmovd %eax, %xmm7
+; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm7[0],xmm1[3]
+; AVX2-NEXT: vpextrw $1, %xmm6, %eax
+; AVX2-NEXT: cwtl
+; AVX2-NEXT: vmovd %eax, %xmm6
+; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm6[0]
+; AVX2-NEXT: vpextrw $1, %xmm5, %eax
+; AVX2-NEXT: cwtl
+; AVX2-NEXT: vmovd %eax, %xmm6
+; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX2-NEXT: vmovd %xmm5, %eax
+; AVX2-NEXT: cwtl
+; AVX2-NEXT: vmovd %eax, %xmm5
+; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[2,3]
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: cwtl
+; AVX2-NEXT: vmovd %eax, %xmm6
+; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0],xmm5[3]
+; AVX2-NEXT: vpextrw $1, %xmm0, %eax
+; AVX2-NEXT: cwtl
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: movswl 4(%rdi), %eax
+; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpextrw $1, %xmm4, %eax
+; AVX2-NEXT: cwtl
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX2-NEXT: vmovd %xmm4, %eax
+; AVX2-NEXT: cwtl
+; AVX2-NEXT: vmovd %eax, %xmm4
+; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[2,3]
+; AVX2-NEXT: vmovd %xmm3, %eax
+; AVX2-NEXT: cwtl
+; AVX2-NEXT: vmovd %eax, %xmm4
+; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3]
+; AVX2-NEXT: vpextrw $1, %xmm3, %eax
+; AVX2-NEXT: cwtl
+; AVX2-NEXT: vmovd %eax, %xmm3
+; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0]
+; AVX2-NEXT: vpextrw $1, %xmm2, %eax
+; AVX2-NEXT: cwtl
+; AVX2-NEXT: vmovd %eax, %xmm3
+; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX2-NEXT: vmovd %xmm2, %eax
+; AVX2-NEXT: cwtl
; AVX2-NEXT: vmovd %eax, %xmm2
; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX2-NEXT: movswl (%rdi), %eax
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
+; AVX2-NEXT: vmovd %xmm8, %eax
+; AVX2-NEXT: cwtl
; AVX2-NEXT: vmovd %eax, %xmm3
; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX2-NEXT: movswl 2(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm4
-; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX2-NEXT: movswl 14(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm5
-; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX2-NEXT: movswl 12(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm6
-; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX2-NEXT: movswl 8(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm7
-; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX2-NEXT: movswl 10(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
+; AVX2-NEXT: vpextrw $1, %xmm8, %eax
+; AVX2-NEXT: cwtl
+; AVX2-NEXT: vmovd %eax, %xmm3
+; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX2-NEXT: popq %rax
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_cvt_16i16_to_16f32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: movswl 6(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm8
-; AVX512F-NEXT: movswl 4(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm9
-; AVX512F-NEXT: movswl (%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm10
-; AVX512F-NEXT: movswl 2(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm11
-; AVX512F-NEXT: movswl 14(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm12
-; AVX512F-NEXT: movswl 12(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm13
-; AVX512F-NEXT: movswl 8(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm14
-; AVX512F-NEXT: movswl 10(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm15
-; AVX512F-NEXT: movswl 22(%rdi), %eax
+; AVX512F-NEXT: pushq %rax
+; AVX512F-NEXT: movl (%rdi), %eax
+; AVX512F-NEXT: movl 4(%rdi), %ecx
+; AVX512F-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movl 12(%rdi), %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movl 8(%rdi), %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movl 20(%rdi), %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movl 16(%rdi), %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movl 28(%rdi), %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movl 24(%rdi), %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm8
+; AVX512F-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm1
+; AVX512F-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2
+; AVX512F-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3
+; AVX512F-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm4
+; AVX512F-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm5
+; AVX512F-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm6
+; AVX512F-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm7
+; AVX512F-NEXT: vpextrw $1, %xmm7, %eax
+; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm0
; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512F-NEXT: movswl 20(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm1
-; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512F-NEXT: movswl 16(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512F-NEXT: movswl 18(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512F-NEXT: movswl 30(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm4
-; AVX512F-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX512F-NEXT: movswl 28(%rdi), %eax
+; AVX512F-NEXT: vmovd %xmm7, %eax
+; AVX512F-NEXT: cwtl
+; AVX512F-NEXT: vmovd %eax, %xmm7
+; AVX512F-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[2,3]
+; AVX512F-NEXT: vmovd %xmm6, %eax
+; AVX512F-NEXT: cwtl
+; AVX512F-NEXT: vmovd %eax, %xmm7
+; AVX512F-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm7[0],xmm0[3]
+; AVX512F-NEXT: vpextrw $1, %xmm6, %eax
+; AVX512F-NEXT: cwtl
+; AVX512F-NEXT: vmovd %eax, %xmm6
+; AVX512F-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm6[0]
+; AVX512F-NEXT: vpextrw $1, %xmm5, %eax
+; AVX512F-NEXT: cwtl
+; AVX512F-NEXT: vmovd %eax, %xmm6
+; AVX512F-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX512F-NEXT: vmovd %xmm5, %eax
+; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm5
; AVX512F-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX512F-NEXT: movswl 24(%rdi), %eax
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[2,3]
+; AVX512F-NEXT: vmovd %xmm4, %eax
+; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm6
; AVX512F-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX512F-NEXT: movswl 26(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm7
-; AVX512F-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0],xmm5[3]
+; AVX512F-NEXT: vpextrw $1, %xmm4, %eax
+; AVX512F-NEXT: cwtl
+; AVX512F-NEXT: vmovd %eax, %xmm4
+; AVX512F-NEXT: vcvtph2ps %xmm4, %xmm4
; AVX512F-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm4
+; AVX512F-NEXT: vpextrw $1, %xmm3, %eax
+; AVX512F-NEXT: cwtl
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512F-NEXT: vmovd %xmm3, %eax
+; AVX512F-NEXT: cwtl
+; AVX512F-NEXT: vmovd %eax, %xmm3
+; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[2,3]
+; AVX512F-NEXT: vmovd %xmm2, %eax
+; AVX512F-NEXT: cwtl
+; AVX512F-NEXT: vmovd %eax, %xmm3
+; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
+; AVX512F-NEXT: vpextrw $1, %xmm2, %eax
+; AVX512F-NEXT: cwtl
+; AVX512F-NEXT: vmovd %eax, %xmm2
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
+; AVX512F-NEXT: vpextrw $1, %xmm1, %eax
+; AVX512F-NEXT: cwtl
+; AVX512F-NEXT: vmovd %eax, %xmm2
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: cwtl
+; AVX512F-NEXT: vmovd %eax, %xmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
+; AVX512F-NEXT: vmovd %xmm8, %eax
+; AVX512F-NEXT: cwtl
+; AVX512F-NEXT: vmovd %eax, %xmm2
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; AVX512F-NEXT: vpextrw $1, %xmm8, %eax
+; AVX512F-NEXT: cwtl
+; AVX512F-NEXT: vmovd %eax, %xmm2
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
+; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512F-NEXT: popq %rax
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: load_cvt_16i16_to_16f32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: movswl 6(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm8
-; AVX512VL-NEXT: movswl 4(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm1
-; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm9
-; AVX512VL-NEXT: movswl (%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm2
-; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm10
-; AVX512VL-NEXT: movswl 2(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm3
-; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm11
-; AVX512VL-NEXT: movswl 14(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm4
-; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm12
-; AVX512VL-NEXT: movswl 12(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm5
-; AVX512VL-NEXT: vcvtph2ps %xmm5, %xmm13
-; AVX512VL-NEXT: movswl 8(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm6
-; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm14
-; AVX512VL-NEXT: movswl 10(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm7
-; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm15
-; AVX512VL-NEXT: movswl 22(%rdi), %eax
+; AVX512VL-NEXT: pushq %rax
+; AVX512VL-NEXT: movl (%rdi), %eax
+; AVX512VL-NEXT: movl 4(%rdi), %ecx
+; AVX512VL-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movl 12(%rdi), %eax
+; AVX512VL-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movl 8(%rdi), %eax
+; AVX512VL-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movl 20(%rdi), %eax
+; AVX512VL-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movl 16(%rdi), %eax
+; AVX512VL-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movl 28(%rdi), %eax
+; AVX512VL-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movl 24(%rdi), %eax
+; AVX512VL-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm8
+; AVX512VL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm1
+; AVX512VL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2
+; AVX512VL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3
+; AVX512VL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm4
+; AVX512VL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm5
+; AVX512VL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm6
+; AVX512VL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm7
+; AVX512VL-NEXT: vpextrw $1, %xmm7, %eax
+; AVX512VL-NEXT: cwtl
; AVX512VL-NEXT: vmovd %eax, %xmm0
; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: movswl 20(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm1
-; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512VL-NEXT: movswl 16(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm2
-; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512VL-NEXT: movswl 18(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm3
-; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512VL-NEXT: movswl 30(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm4
-; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX512VL-NEXT: movswl 28(%rdi), %eax
+; AVX512VL-NEXT: vmovd %xmm7, %eax
+; AVX512VL-NEXT: cwtl
+; AVX512VL-NEXT: vmovd %eax, %xmm7
+; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[2,3]
+; AVX512VL-NEXT: vmovd %xmm6, %eax
+; AVX512VL-NEXT: cwtl
+; AVX512VL-NEXT: vmovd %eax, %xmm7
+; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm7[0],xmm0[3]
+; AVX512VL-NEXT: vpextrw $1, %xmm6, %eax
+; AVX512VL-NEXT: cwtl
+; AVX512VL-NEXT: vmovd %eax, %xmm6
+; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm6[0]
+; AVX512VL-NEXT: vpextrw $1, %xmm5, %eax
+; AVX512VL-NEXT: cwtl
+; AVX512VL-NEXT: vmovd %eax, %xmm6
+; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX512VL-NEXT: vmovd %xmm5, %eax
+; AVX512VL-NEXT: cwtl
; AVX512VL-NEXT: vmovd %eax, %xmm5
; AVX512VL-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX512VL-NEXT: movswl 24(%rdi), %eax
+; AVX512VL-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[2,3]
+; AVX512VL-NEXT: vmovd %xmm4, %eax
+; AVX512VL-NEXT: cwtl
; AVX512VL-NEXT: vmovd %eax, %xmm6
; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX512VL-NEXT: movswl 26(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm7
-; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
+; AVX512VL-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0],xmm5[3]
+; AVX512VL-NEXT: vpextrw $1, %xmm4, %eax
+; AVX512VL-NEXT: cwtl
+; AVX512VL-NEXT: vmovd %eax, %xmm4
+; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX512VL-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
+; AVX512VL-NEXT: vpextrw $1, %xmm3, %eax
+; AVX512VL-NEXT: cwtl
+; AVX512VL-NEXT: vmovd %eax, %xmm4
+; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX512VL-NEXT: vmovd %xmm3, %eax
+; AVX512VL-NEXT: cwtl
+; AVX512VL-NEXT: vmovd %eax, %xmm3
+; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512VL-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3]
+; AVX512VL-NEXT: vmovd %xmm2, %eax
+; AVX512VL-NEXT: cwtl
+; AVX512VL-NEXT: vmovd %eax, %xmm4
+; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX512VL-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
+; AVX512VL-NEXT: vpextrw $1, %xmm2, %eax
+; AVX512VL-NEXT: cwtl
+; AVX512VL-NEXT: vmovd %eax, %xmm2
+; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[0]
+; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax
+; AVX512VL-NEXT: cwtl
+; AVX512VL-NEXT: vmovd %eax, %xmm3
+; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512VL-NEXT: vmovd %xmm1, %eax
+; AVX512VL-NEXT: cwtl
+; AVX512VL-NEXT: vmovd %eax, %xmm1
+; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[2,3]
+; AVX512VL-NEXT: vmovd %xmm8, %eax
+; AVX512VL-NEXT: cwtl
+; AVX512VL-NEXT: vmovd %eax, %xmm3
+; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
+; AVX512VL-NEXT: vpextrw $1, %xmm8, %eax
+; AVX512VL-NEXT: cwtl
+; AVX512VL-NEXT: vmovd %eax, %xmm3
+; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0]
+; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX512VL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512VL-NEXT: popq %rax
; AVX512VL-NEXT: retq
%1 = load <16 x i16>, <16 x i16>* %a0
%2 = bitcast <16 x i16> %1 to <16 x half>
@@ -936,14 +1021,14 @@ define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) nounwind {
; ALL-NEXT: vmovd %xmm0, %eax
; ALL-NEXT: movswl %ax, %ecx
; ALL-NEXT: shrl $16, %eax
-; ALL-NEXT: cwtl
-; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vmovd %ecx, %xmm0
; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
-; ALL-NEXT: vmovd %ecx, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: cwtl
+; ALL-NEXT: vmovd %eax, %xmm1
; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; ALL-NEXT: retq
%1 = bitcast <2 x i16> %a0 to <2 x half>
%2 = fpext <2 x half> %1 to <2 x double>
@@ -955,29 +1040,30 @@ define <4 x double> @cvt_4i16_to_4f64(<4 x i16> %a0) nounwind {
; ALL: # %bb.0:
; ALL-NEXT: vmovq %xmm0, %rax
; ALL-NEXT: movq %rax, %rcx
-; ALL-NEXT: movl %eax, %edx
+; ALL-NEXT: movq %rax, %rdx
; ALL-NEXT: movswl %ax, %esi
-; ALL-NEXT: shrq $48, %rax
+; ALL-NEXT: # kill: def $eax killed $eax killed $rax
+; ALL-NEXT: shrl $16, %eax
; ALL-NEXT: shrq $32, %rcx
-; ALL-NEXT: shrl $16, %edx
+; ALL-NEXT: shrq $48, %rdx
; ALL-NEXT: movswl %dx, %edx
; ALL-NEXT: vmovd %edx, %xmm0
; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: movswl %cx, %ecx
+; ALL-NEXT: vmovd %ecx, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; ALL-NEXT: vmovd %esi, %xmm1
; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
-; ALL-NEXT: movswl %cx, %ecx
-; ALL-NEXT: vmovd %ecx, %xmm2
-; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; ALL-NEXT: cwtl
-; ALL-NEXT: vmovd %eax, %xmm3
-; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
-; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; ALL-NEXT: vmovd %eax, %xmm2
+; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; ALL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; ALL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; ALL-NEXT: retq
%1 = bitcast <4 x i16> %a0 to <4 x half>
%2 = fpext <4 x half> %1 to <4 x double>
@@ -990,14 +1076,14 @@ define <2 x double> @cvt_8i16_to_2f64(<8 x i16> %a0) nounwind {
; ALL-NEXT: vmovd %xmm0, %eax
; ALL-NEXT: movswl %ax, %ecx
; ALL-NEXT: shrl $16, %eax
-; ALL-NEXT: cwtl
-; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vmovd %ecx, %xmm0
; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
-; ALL-NEXT: vmovd %ecx, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: cwtl
+; ALL-NEXT: vmovd %eax, %xmm1
; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; ALL-NEXT: retq
%1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
%2 = bitcast <2 x i16> %1 to <2 x half>
@@ -1010,29 +1096,30 @@ define <4 x double> @cvt_8i16_to_4f64(<8 x i16> %a0) nounwind {
; ALL: # %bb.0:
; ALL-NEXT: vmovq %xmm0, %rax
; ALL-NEXT: movq %rax, %rcx
-; ALL-NEXT: movl %eax, %edx
+; ALL-NEXT: movq %rax, %rdx
; ALL-NEXT: movswl %ax, %esi
-; ALL-NEXT: shrq $48, %rax
+; ALL-NEXT: # kill: def $eax killed $eax killed $rax
+; ALL-NEXT: shrl $16, %eax
; ALL-NEXT: shrq $32, %rcx
-; ALL-NEXT: shrl $16, %edx
+; ALL-NEXT: shrq $48, %rdx
; ALL-NEXT: movswl %dx, %edx
; ALL-NEXT: vmovd %edx, %xmm0
; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: movswl %cx, %ecx
+; ALL-NEXT: vmovd %ecx, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; ALL-NEXT: vmovd %esi, %xmm1
; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
-; ALL-NEXT: movswl %cx, %ecx
-; ALL-NEXT: vmovd %ecx, %xmm2
-; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; ALL-NEXT: cwtl
-; ALL-NEXT: vmovd %eax, %xmm3
-; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
-; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; ALL-NEXT: vmovd %eax, %xmm2
+; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; ALL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; ALL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; ALL-NEXT: retq
%1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = bitcast <4 x i16> %1 to <4 x half>
@@ -1043,165 +1130,171 @@ define <4 x double> @cvt_8i16_to_4f64(<8 x i16> %a0) nounwind {
define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_8i16_to_8f64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovq %xmm0, %rdx
+; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
; AVX1-NEXT: movq %rdx, %r9
-; AVX1-NEXT: movl %edx, %r10d
+; AVX1-NEXT: movq %rdx, %r10
; AVX1-NEXT: movswl %dx, %r8d
-; AVX1-NEXT: shrq $48, %rdx
+; AVX1-NEXT: # kill: def $edx killed $edx killed $rdx
+; AVX1-NEXT: shrl $16, %edx
; AVX1-NEXT: shrq $32, %r9
-; AVX1-NEXT: shrl $16, %r10d
-; AVX1-NEXT: vpextrq $1, %xmm0, %rdi
+; AVX1-NEXT: shrq $48, %r10
+; AVX1-NEXT: vmovq %xmm0, %rdi
; AVX1-NEXT: movq %rdi, %rsi
-; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: movq %rdi, %rax
; AVX1-NEXT: movswl %di, %ecx
-; AVX1-NEXT: shrq $48, %rdi
+; AVX1-NEXT: # kill: def $edi killed $edi killed $rdi
+; AVX1-NEXT: shrl $16, %edi
; AVX1-NEXT: shrq $32, %rsi
-; AVX1-NEXT: shrl $16, %eax
+; AVX1-NEXT: shrq $48, %rax
; AVX1-NEXT: cwtl
; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm1
-; AVX1-NEXT: vmovd %ecx, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm2
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: movswl %si, %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm3
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vmovd %ecx, %xmm1
+; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: movswl %di, %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm4
+; AVX1-NEXT: vmovd %eax, %xmm2
+; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: movswl %r10w, %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: vmovd %r8d, %xmm5
-; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: movswl %r9w, %eax
-; AVX1-NEXT: vmovd %eax, %xmm6
-; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX1-NEXT: movswl %dx, %eax
-; AVX1-NEXT: vmovd %eax, %xmm7
-; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX1-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
-; AVX1-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; AVX1-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm0[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
-; AVX1-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; AVX1-NEXT: vmovd %eax, %xmm2
+; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: vmovd %r8d, %xmm2
+; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: movswl %dx, %eax
+; AVX1-NEXT: vmovd %eax, %xmm3
+; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_8i16_to_8f64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovq %xmm0, %rdx
+; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
; AVX2-NEXT: movq %rdx, %r9
-; AVX2-NEXT: movl %edx, %r10d
+; AVX2-NEXT: movq %rdx, %r10
; AVX2-NEXT: movswl %dx, %r8d
-; AVX2-NEXT: shrq $48, %rdx
+; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx
+; AVX2-NEXT: shrl $16, %edx
; AVX2-NEXT: shrq $32, %r9
-; AVX2-NEXT: shrl $16, %r10d
-; AVX2-NEXT: vpextrq $1, %xmm0, %rdi
+; AVX2-NEXT: shrq $48, %r10
+; AVX2-NEXT: vmovq %xmm0, %rdi
; AVX2-NEXT: movq %rdi, %rsi
-; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: movq %rdi, %rax
; AVX2-NEXT: movswl %di, %ecx
-; AVX2-NEXT: shrq $48, %rdi
+; AVX2-NEXT: # kill: def $edi killed $edi killed $rdi
+; AVX2-NEXT: shrl $16, %edi
; AVX2-NEXT: shrq $32, %rsi
-; AVX2-NEXT: shrl $16, %eax
+; AVX2-NEXT: shrq $48, %rax
; AVX2-NEXT: cwtl
; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm1
-; AVX2-NEXT: vmovd %ecx, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm2
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX2-NEXT: movswl %si, %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm3
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT: vmovd %ecx, %xmm1
+; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: movswl %di, %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm4
+; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: movswl %r10w, %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: vmovd %r8d, %xmm5
-; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: movswl %r9w, %eax
-; AVX2-NEXT: vmovd %eax, %xmm6
-; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX2-NEXT: movswl %dx, %eax
-; AVX2-NEXT: vmovd %eax, %xmm7
-; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX2-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
-; AVX2-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; AVX2-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
-; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm0[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
-; AVX2-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
-; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: vmovd %r8d, %xmm2
+; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: movswl %dx, %eax
+; AVX2-NEXT: vmovd %eax, %xmm3
+; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: cvt_8i16_to_8f64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512-NEXT: vmovq %xmm0, %rdx
; AVX512-NEXT: movq %rdx, %r9
-; AVX512-NEXT: movl %edx, %r10d
+; AVX512-NEXT: movq %rdx, %r10
; AVX512-NEXT: movswl %dx, %r8d
-; AVX512-NEXT: shrq $48, %rdx
+; AVX512-NEXT: # kill: def $edx killed $edx killed $rdx
+; AVX512-NEXT: shrl $16, %edx
; AVX512-NEXT: shrq $32, %r9
-; AVX512-NEXT: shrl $16, %r10d
-; AVX512-NEXT: vmovq %xmm0, %rdi
+; AVX512-NEXT: shrq $48, %r10
+; AVX512-NEXT: vpextrq $1, %xmm0, %rdi
; AVX512-NEXT: movq %rdi, %rsi
-; AVX512-NEXT: movl %edi, %eax
+; AVX512-NEXT: movq %rdi, %rax
; AVX512-NEXT: movswl %di, %ecx
-; AVX512-NEXT: shrq $48, %rdi
+; AVX512-NEXT: # kill: def $edi killed $edi killed $rdi
+; AVX512-NEXT: shrl $16, %edi
; AVX512-NEXT: shrq $32, %rsi
-; AVX512-NEXT: shrl $16, %eax
+; AVX512-NEXT: shrq $48, %rax
; AVX512-NEXT: cwtl
; AVX512-NEXT: vmovd %eax, %xmm0
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: movswl %si, %eax
+; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512-NEXT: vmovd %ecx, %xmm1
; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT: movswl %si, %eax
+; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: movswl %di, %eax
; AVX512-NEXT: vmovd %eax, %xmm2
; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT: movswl %di, %eax
-; AVX512-NEXT: vmovd %eax, %xmm3
-; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX512-NEXT: movswl %r10w, %eax
-; AVX512-NEXT: vmovd %eax, %xmm4
-; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX512-NEXT: vmovd %r8d, %xmm5
-; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512-NEXT: movswl %r9w, %eax
-; AVX512-NEXT: vmovd %eax, %xmm6
-; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX512-NEXT: vmovd %eax, %xmm2
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512-NEXT: vmovd %r8d, %xmm2
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX512-NEXT: movswl %dx, %eax
-; AVX512-NEXT: vmovd %eax, %xmm7
-; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX512-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
-; AVX512-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; AVX512-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
-; AVX512-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm4 = xmm5[0],xmm4[0]
-; AVX512-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
+; AVX512-NEXT: vmovd %eax, %xmm3
+; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512-NEXT: retq
%1 = bitcast <8 x i16> %a0 to <8 x half>
%2 = fpext <8 x half> %1 to <8 x double>
@@ -1229,15 +1322,15 @@ define double @load_cvt_i16_to_f64(i16* %a0) nounwind {
define <2 x double> @load_cvt_2i16_to_2f64(<2 x i16>* %a0) nounwind {
; ALL-LABEL: load_cvt_2i16_to_2f64:
; ALL: # %bb.0:
-; ALL-NEXT: movswl (%rdi), %eax
+; ALL-NEXT: movswl 2(%rdi), %eax
; ALL-NEXT: vmovd %eax, %xmm0
; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
-; ALL-NEXT: movswl 2(%rdi), %eax
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: movswl (%rdi), %eax
; ALL-NEXT: vmovd %eax, %xmm1
; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; ALL-NEXT: retq
%1 = load <2 x i16>, <2 x i16>* %a0
%2 = bitcast <2 x i16> %1 to <2 x half>
@@ -1248,25 +1341,25 @@ define <2 x double> @load_cvt_2i16_to_2f64(<2 x i16>* %a0) nounwind {
define <4 x double> @load_cvt_4i16_to_4f64(<4 x i16>* %a0) nounwind {
; ALL-LABEL: load_cvt_4i16_to_4f64:
; ALL: # %bb.0:
-; ALL-NEXT: movswl (%rdi), %eax
+; ALL-NEXT: movswl 6(%rdi), %eax
; ALL-NEXT: vmovd %eax, %xmm0
; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: movswl 4(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; ALL-NEXT: movswl 2(%rdi), %eax
; ALL-NEXT: vmovd %eax, %xmm1
; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
-; ALL-NEXT: movswl 4(%rdi), %eax
+; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; ALL-NEXT: movswl (%rdi), %eax
; ALL-NEXT: vmovd %eax, %xmm2
; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
-; ALL-NEXT: movswl 6(%rdi), %eax
-; ALL-NEXT: vmovd %eax, %xmm3
-; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
-; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; ALL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; ALL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; ALL-NEXT: retq
%1 = load <4 x i16>, <4 x i16>* %a0
%2 = bitcast <4 x i16> %1 to <4 x half>
@@ -1279,29 +1372,30 @@ define <4 x double> @load_cvt_8i16_to_4f64(<8 x i16>* %a0) nounwind {
; ALL: # %bb.0:
; ALL-NEXT: movq (%rdi), %rax
; ALL-NEXT: movq %rax, %rcx
-; ALL-NEXT: movl %eax, %edx
+; ALL-NEXT: movq %rax, %rdx
; ALL-NEXT: movswl %ax, %esi
-; ALL-NEXT: shrq $48, %rax
+; ALL-NEXT: # kill: def $eax killed $eax killed $rax
+; ALL-NEXT: shrl $16, %eax
; ALL-NEXT: shrq $32, %rcx
-; ALL-NEXT: shrl $16, %edx
+; ALL-NEXT: shrq $48, %rdx
; ALL-NEXT: movswl %dx, %edx
; ALL-NEXT: vmovd %edx, %xmm0
; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: movswl %cx, %ecx
+; ALL-NEXT: vmovd %ecx, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; ALL-NEXT: vmovd %esi, %xmm1
; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
-; ALL-NEXT: movswl %cx, %ecx
-; ALL-NEXT: vmovd %ecx, %xmm2
-; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; ALL-NEXT: cwtl
-; ALL-NEXT: vmovd %eax, %xmm3
-; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
-; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; ALL-NEXT: vmovd %eax, %xmm2
+; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; ALL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; ALL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; ALL-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a0
%2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -1313,129 +1407,129 @@ define <4 x double> @load_cvt_8i16_to_4f64(<8 x i16>* %a0) nounwind {
define <8 x double> @load_cvt_8i16_to_8f64(<8 x i16>* %a0) nounwind {
; AVX1-LABEL: load_cvt_8i16_to_8f64:
; AVX1: # %bb.0:
-; AVX1-NEXT: movswl 8(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm1
-; AVX1-NEXT: movswl 10(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm2
-; AVX1-NEXT: movswl 12(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm3
-; AVX1-NEXT: movswl 14(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm4
-; AVX1-NEXT: movswl (%rdi), %eax
+; AVX1-NEXT: movswl 6(%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: movswl 2(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm5
-; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX1-NEXT: movswl 4(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm6
-; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX1-NEXT: movswl 6(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm7
-; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX1-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
-; AVX1-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; AVX1-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
-; AVX1-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; AVX1-NEXT: movswl 4(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: movswl 2(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: movswl (%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm2
+; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: movswl 14(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: movswl 12(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm2
+; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT: movswl 10(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm2
+; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: movswl 8(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm3
+; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_cvt_8i16_to_8f64:
; AVX2: # %bb.0:
-; AVX2-NEXT: movswl 8(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm1
-; AVX2-NEXT: movswl 10(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm2
-; AVX2-NEXT: movswl 12(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm3
-; AVX2-NEXT: movswl 14(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm4
-; AVX2-NEXT: movswl (%rdi), %eax
+; AVX2-NEXT: movswl 6(%rdi), %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: movswl 2(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm5
-; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX2-NEXT: movswl 4(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm6
-; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX2-NEXT: movswl 6(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm7
-; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX2-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
-; AVX2-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; AVX2-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
-; AVX2-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
-; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; AVX2-NEXT: movswl 4(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT: movswl 2(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: movswl (%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: movswl 14(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: movswl 12(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT: movswl 10(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: movswl 8(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm3
+; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_cvt_8i16_to_8f64:
; AVX512: # %bb.0:
-; AVX512-NEXT: movswl (%rdi), %eax
+; AVX512-NEXT: movswl 14(%rdi), %eax
; AVX512-NEXT: vmovd %eax, %xmm0
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: movswl 2(%rdi), %eax
+; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: movswl 12(%rdi), %eax
; AVX512-NEXT: vmovd %eax, %xmm1
; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT: movswl 4(%rdi), %eax
+; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT: movswl 10(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: movswl 8(%rdi), %eax
; AVX512-NEXT: vmovd %eax, %xmm2
; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX512-NEXT: movswl 6(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: movswl 4(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm2
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512-NEXT: movswl 2(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm2
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: movswl (%rdi), %eax
; AVX512-NEXT: vmovd %eax, %xmm3
; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512-NEXT: movswl 8(%rdi), %eax
-; AVX512-NEXT: vmovd %eax, %xmm4
-; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX512-NEXT: movswl 10(%rdi), %eax
-; AVX512-NEXT: vmovd %eax, %xmm5
-; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX512-NEXT: movswl 12(%rdi), %eax
-; AVX512-NEXT: vmovd %eax, %xmm6
-; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX512-NEXT: movswl 14(%rdi), %eax
-; AVX512-NEXT: vmovd %eax, %xmm7
-; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX512-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
-; AVX512-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; AVX512-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
-; AVX512-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; AVX512-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
; AVX512-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a0
%2 = bitcast <8 x i16> %1 to <8 x half>