[llvm] r237885 - [X86][SSE] Improve support for 128-bit vector sign extension
Jeroen Ketema
jeroen at codeplay.com
Fri Jun 19 02:21:23 PDT 2015
This commit causes llc to hit an assertion failure for ivybridge in Debug (assertions-enabled) builds:
> llc.exe test.ll -mcpu ivybridge
Assertion failed: VT.getVectorElementType() ==
N1.getValueType().getVectorElementType() && "Extract subvector VTs must
have the same element type!", file
...\llvm\lib\CodeGen\SelectionDAG\SelectionDAG.cpp, line 3549
where test.ll is:
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
%Arguments = type <{ <8 x i8>, <8 x i16>, <8 x float> addrspace(1)* }>
; Function Attrs: nounwind
define void @foo(%Arguments* nocapture readonly %args) #0 {
body:
%0 = load %Arguments, %Arguments* %args, align 1
%1 = extractvalue %Arguments %0, 1
%2 = extractvalue %Arguments %0, 2
%3 = sitofp <8 x i16> %1 to <8 x float>
store <8 x float> %3, <8 x float> addrspace(1)* %2, align 32
ret void
}
attributes #0 = { nounwind }
!llvm.ident = !{!0, !0, !0}
!0 = !{!"clang version 3.7.0 "}
On 21/05/2015 11:05, Simon Pilgrim wrote:
> Author: rksimon
> Date: Thu May 21 05:05:03 2015
> New Revision: 237885
>
> URL: http://llvm.org/viewvc/llvm-project?rev=237885&view=rev
> Log:
> [X86][SSE] Improve support for 128-bit vector sign extension
>
> This patch improves support for sign extension of the lower lanes of vectors of integers by making use of the SSE41 pmovsx* sign extension instructions where possible, and by optimizing the shift-based sign extension on pre-SSE41 targets (avoiding i64 arithmetic shifts, which would require scalarization).
>
> It converts SIGN_EXTEND nodes to SIGN_EXTEND_VECTOR_INREG where necessary; this more closely matches the pmovsx* instructions than the default approach of using SIGN_EXTEND_INREG, which splits the operation (into an ANY_EXTEND lowered to a shuffle, followed by shifts) and makes instruction matching difficult during lowering. The necessary support for SIGN_EXTEND_VECTOR_INREG has been added to the DAGCombiner.
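
As a rough illustration of the two strategies described above (not part of this patch; the function names are ad hoc), sign-extending the low four i16 lanes of an XMM register to i32 looks roughly like this in intrinsics:

#include <emmintrin.h>   // SSE2
#include <smmintrin.h>   // SSE4.1

// SSE4.1 path: a single pmovsxwd performs the in-register sign extension.
static __m128i sext_lo4_i16_to_i32_sse41(__m128i v) {
  return _mm_cvtepi16_epi32(v);            // pmovsxwd
}

// Pre-SSE4.1 path: unpack the low i16 lanes into the high half of each i32
// lane, then arithmetic-shift right to propagate the sign bit.
static __m128i sext_lo4_i16_to_i32_sse2(__m128i v) {
  __m128i hi = _mm_unpacklo_epi16(v, v);   // punpcklwd
  return _mm_srai_epi32(hi, 16);           // psrad $16
}

The SSE2 variant is roughly the unpack-plus-shift shape the new lowering below emits for pre-SSE41 targets.
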
>
> Differential Revision: http://reviews.llvm.org/D9848
>
> Modified:
> llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
> llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
> llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
> llvm/trunk/test/CodeGen/X86/pr15267.ll
> llvm/trunk/test/CodeGen/X86/vec_cast2.ll
> llvm/trunk/test/CodeGen/X86/vector-sext.ll
> llvm/trunk/test/CodeGen/X86/vselect-avx.ll
>
> Modified: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp?rev=237885&r1=237884&r2=237885&view=diff
> ==============================================================================
> --- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (original)
> +++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp Thu May 21 05:05:03 2015
> @@ -268,6 +268,7 @@ namespace {
> SDValue visitZERO_EXTEND(SDNode *N);
> SDValue visitANY_EXTEND(SDNode *N);
> SDValue visitSIGN_EXTEND_INREG(SDNode *N);
> + SDValue visitSIGN_EXTEND_VECTOR_INREG(SDNode *N);
> SDValue visitTRUNCATE(SDNode *N);
> SDValue visitBITCAST(SDNode *N);
> SDValue visitBUILD_PAIR(SDNode *N);
> @@ -1347,6 +1348,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
> case ISD::ZERO_EXTEND: return visitZERO_EXTEND(N);
> case ISD::ANY_EXTEND: return visitANY_EXTEND(N);
> case ISD::SIGN_EXTEND_INREG: return visitSIGN_EXTEND_INREG(N);
> + case ISD::SIGN_EXTEND_VECTOR_INREG: return visitSIGN_EXTEND_VECTOR_INREG(N);
> case ISD::TRUNCATE: return visitTRUNCATE(N);
> case ISD::BITCAST: return visitBITCAST(N);
> case ISD::BUILD_PAIR: return visitBUILD_PAIR(N);
> @@ -5541,7 +5543,8 @@ static SDNode *tryToFoldExtendOfConstant
> EVT VT = N->getValueType(0);
>
> assert((Opcode == ISD::SIGN_EXTEND || Opcode == ISD::ZERO_EXTEND ||
> - Opcode == ISD::ANY_EXTEND) && "Expected EXTEND dag node in input!");
> + Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND_VECTOR_INREG)
> + && "Expected EXTEND dag node in input!");
>
> // fold (sext c1) -> c1
> // fold (zext c1) -> c1
> @@ -5563,7 +5566,7 @@ static SDNode *tryToFoldExtendOfConstant
> unsigned EVTBits = N0->getValueType(0).getScalarType().getSizeInBits();
> unsigned ShAmt = VTBits - EVTBits;
> SmallVector<SDValue, 8> Elts;
> - unsigned NumElts = N0->getNumOperands();
> + unsigned NumElts = VT.getVectorNumElements();
> SDLoc DL(N);
>
> for (unsigned i=0; i != NumElts; ++i) {
> @@ -5576,7 +5579,7 @@ static SDNode *tryToFoldExtendOfConstant
> SDLoc DL(Op);
> ConstantSDNode *CurrentND = cast<ConstantSDNode>(Op);
> const APInt &C = APInt(VTBits, CurrentND->getAPIntValue().getZExtValue());
> - if (Opcode == ISD::SIGN_EXTEND)
> + if (Opcode == ISD::SIGN_EXTEND || Opcode == ISD::SIGN_EXTEND_VECTOR_INREG)
> Elts.push_back(DAG.getConstant(C.shl(ShAmt).ashr(ShAmt).getZExtValue(),
> DL, SVT));
> else
> @@ -6804,6 +6807,20 @@ SDValue DAGCombiner::visitSIGN_EXTEND_IN
>
> return SDValue();
> }
> +
> +SDValue DAGCombiner::visitSIGN_EXTEND_VECTOR_INREG(SDNode *N) {
> + SDValue N0 = N->getOperand(0);
> + EVT VT = N->getValueType(0);
> +
> + if (N0.getOpcode() == ISD::UNDEF)
> + return DAG.getUNDEF(VT);
> +
> + if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes,
> + LegalOperations))
> + return SDValue(Res, 0);
> +
> + return SDValue();
> +}
>
> SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
> SDValue N0 = N->getOperand(0);
>
> Modified: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp?rev=237885&r1=237884&r2=237885&view=diff
> ==============================================================================
> --- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp (original)
> +++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp Thu May 21 05:05:03 2015
> @@ -3429,12 +3429,35 @@ SDValue SelectionDAG::getNode(unsigned O
> assert(EVT.bitsLE(VT) && "Not extending!");
> if (EVT == VT) return N1; // Not actually extending
>
> + auto SignExtendInReg = [&](APInt Val) {
> + unsigned FromBits = EVT.getScalarType().getSizeInBits();
> + Val <<= Val.getBitWidth() - FromBits;
> + Val = Val.ashr(Val.getBitWidth() - FromBits);
> + return getConstant(Val, DL, VT.getScalarType());
> + };
> +
> if (N1C) {
> APInt Val = N1C->getAPIntValue();
> - unsigned FromBits = EVT.getScalarType().getSizeInBits();
> - Val <<= Val.getBitWidth()-FromBits;
> - Val = Val.ashr(Val.getBitWidth()-FromBits);
> - return getConstant(Val, DL, VT);
> + return SignExtendInReg(Val);
> + }
> + if (ISD::isBuildVectorOfConstantSDNodes(N1.getNode())) {
> + SmallVector<SDValue, 8> Ops;
> + for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
> + SDValue Op = N1.getOperand(i);
> + if (Op.getValueType() != VT.getScalarType()) break;
> + if (Op.getOpcode() == ISD::UNDEF) {
> + Ops.push_back(Op);
> + continue;
> + }
> + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getNode())) {
> + APInt Val = C->getAPIntValue();
> + Ops.push_back(SignExtendInReg(Val));
> + continue;
> + }
> + break;
> + }
> + if (Ops.size() == VT.getVectorNumElements())
> + return getNode(ISD::BUILD_VECTOR, DL, VT, Ops);
> }
> break;
> }
>
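
The SignExtendInReg lambda above uses the usual shift pair to sign extend the low FromBits bits of a constant. On a plain 32-bit integer the same idiom looks like this (illustrative only; the name is made up):

#include <cstdint>

// Sign-extend the low FromBits bits of Val to a full int32_t by shifting the
// field up to the top of the word and arithmetic-shifting it back down.
// (The uint32_t -> int32_t cast is two's-complement on mainstream compilers.)
static int32_t signExtendInReg32(uint32_t Val, unsigned FromBits) {
  unsigned Shift = 32u - FromBits;
  return (int32_t)(Val << Shift) >> Shift;
}

// e.g. signExtendInReg32(0xFF, 8) == -1 and signExtendInReg32(0x7F, 8) == 127
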
> Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=237885&r1=237884&r2=237885&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
> +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Thu May 21 05:05:03 2015
> @@ -1004,6 +1004,10 @@ X86TargetLowering::X86TargetLowering(con
> }
>
> if (Subtarget->hasSSE2()) {
> + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
> + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
> + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
> +
> setOperationAction(ISD::SRL, MVT::v8i16, Custom);
> setOperationAction(ISD::SRL, MVT::v16i8, Custom);
>
> @@ -13914,6 +13918,63 @@ static SDValue LowerSIGN_EXTEND_AVX512(S
> return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
> }
>
> +static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op,
> + const X86Subtarget *Subtarget,
> + SelectionDAG &DAG) {
> + SDValue In = Op->getOperand(0);
> + MVT VT = Op->getSimpleValueType(0);
> + MVT InVT = In.getSimpleValueType();
> + assert(VT.getSizeInBits() == InVT.getSizeInBits());
> +
> + MVT SVT = VT.getScalarType();
> + MVT InSVT = InVT.getScalarType();
> + assert(SVT.getScalarSizeInBits() > InSVT.getScalarSizeInBits());
> +
> + if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
> + return SDValue();
> + if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
> + return SDValue();
> +
> + SDLoc dl(Op);
> +
> + // SSE41 targets can use the pmovsx* instructions directly.
> + if (Subtarget->hasSSE41())
> + return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
> +
> + // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
> + SDValue Curr = In;
> + MVT CurrVT = InVT;
> +
> + // As SRAI is only available on i16/i32 types, we expand only up to i32
> + // and handle i64 separately.
> + while (CurrVT != VT && CurrVT.getScalarType() != MVT::i32) {
> + Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
> + MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
> + CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
> + Curr = DAG.getNode(ISD::BITCAST, dl, CurrVT, Curr);
> + }
> +
> + SDValue SignExt = Curr;
> + if (CurrVT != InVT) {
> + unsigned SignExtShift =
> + CurrVT.getScalarSizeInBits() - InSVT.getScalarSizeInBits();
> + SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
> + DAG.getConstant(SignExtShift, dl, MVT::i8));
> + }
> +
> + if (CurrVT == VT)
> + return SignExt;
> +
> + if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
> + SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
> + DAG.getConstant(31, dl, MVT::i8));
> + SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
> + return DAG.getNode(ISD::BITCAST, dl, VT, Ext);
> + }
> +
> + return SDValue();
> +}
> +
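
The VT == MVT::v2i64 tail above interleaves each i32 lane with its own sign word instead of using an i64 arithmetic shift. As a hedged intrinsics sketch of that pattern (ad-hoc name, not taken from the patch):

#include <emmintrin.h>

// Sign-extend the low two i32 lanes to two i64 lanes without a 64-bit
// arithmetic shift: psrad $31 builds the sign words, punpckldq interleaves
// them above the original values.
static __m128i sext_lo2_i32_to_i64_sse2(__m128i v) {
  __m128i sign = _mm_srai_epi32(v, 31);    // psrad $31
  return _mm_unpacklo_epi32(v, sign);      // punpckldq
}
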
> static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
> SelectionDAG &DAG) {
> MVT VT = Op->getSimpleValueType(0);
> @@ -17580,6 +17641,8 @@ SDValue X86TargetLowering::LowerOperatio
> case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
> case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
> case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
> + case ISD::SIGN_EXTEND_VECTOR_INREG:
> + return LowerSIGN_EXTEND_VECTOR_INREG(Op, Subtarget, DAG);
> case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
> case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
> case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
> @@ -23683,16 +23746,19 @@ static SDValue PerformSExtCombine(SDNode
> const X86Subtarget *Subtarget) {
> SDValue N0 = N->getOperand(0);
> EVT VT = N->getValueType(0);
> - SDLoc dl(N);
> + EVT SVT = VT.getScalarType();
> + EVT InVT = N0->getValueType(0);
> + EVT InSVT = InVT.getScalarType();
> + SDLoc DL(N);
>
> // (i8,i32 sext (sdivrem (i8 x, i8 y)) ->
> // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y)
> // This exposes the sext to the sdivrem lowering, so that it directly extends
> // from AH (which we otherwise need to do contortions to access).
> if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 &&
> - N0.getValueType() == MVT::i8 && VT == MVT::i32) {
> + InVT == MVT::i8 && VT == MVT::i32) {
> SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
> - SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, dl, NodeTys,
> + SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, DL, NodeTys,
> N0.getOperand(0), N0.getOperand(1));
> DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
> return R.getValue(1);
> @@ -23700,14 +23766,57 @@ static SDValue PerformSExtCombine(SDNode
>
> if (!DCI.isBeforeLegalizeOps()) {
> if (N0.getValueType() == MVT::i1) {
> - SDValue Zero = DAG.getConstant(0, dl, VT);
> + SDValue Zero = DAG.getConstant(0, DL, VT);
> SDValue AllOnes =
> - DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl, VT);
> - return DAG.getNode(ISD::SELECT, dl, VT, N0, AllOnes, Zero);
> + DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT);
> + return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero);
> }
> return SDValue();
> }
>
> + if (VT.isVector()) {
> + auto ExtendToVec128 = [&DAG](SDLoc DL, SDValue N) {
> + EVT InVT = N->getValueType(0);
> + EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
> + 128 / InVT.getScalarSizeInBits());
> + SmallVector<SDValue, 8> Opnds(128 / InVT.getSizeInBits(),
> + DAG.getUNDEF(InVT));
> + Opnds[0] = N;
> + return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
> + };
> +
> + // If target-size is 128-bits, then convert to ISD::SIGN_EXTEND_VECTOR_INREG
> + // which ensures lowering to X86ISD::VSEXT (pmovsx*).
> + if (VT.getSizeInBits() == 128 &&
> + (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) &&
> + (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) {
> + SDValue ExOp = ExtendToVec128(DL, N0);
> + return DAG.getSignExtendVectorInReg(ExOp, DL, VT);
> + }
> +
> + // On pre-AVX2 targets, split into 128-bit nodes of
> + // ISD::SIGN_EXTEND_VECTOR_INREG.
> + if (!Subtarget->hasInt256() && !(VT.getSizeInBits() % 128) &&
> + (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) &&
> + (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) {
> + unsigned NumVecs = VT.getSizeInBits() / 128;
> + unsigned NumSubElts = 128 / SVT.getSizeInBits();
> + EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
> + EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
> +
> + SmallVector<SDValue, 8> Opnds;
> + for (unsigned i = 0, Offset = 0; i != NumVecs;
> + ++i, Offset += NumSubElts) {
> + SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
> + DAG.getIntPtrConstant(Offset, DL));
> + SrcVec = ExtendToVec128(DL, SrcVec);
> + SrcVec = DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT);
> + Opnds.push_back(SrcVec);
> + }
> + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
> + }
> + }
> +
> if (!Subtarget->hasFp256())
> return SDValue();
>
>
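
The pre-AVX2 split in PerformSExtCombine aims to end up with per-128-bit pmovsx nodes plus a concat, which is roughly what an AVX1 target wants for an <8 x i16> to <8 x i32> sign extension. A hedged intrinsics sketch of that end result (ad-hoc name; compile with -mavx):

#include <immintrin.h>

// Sign-extend 8 x i16 to 8 x i32 on AVX1: extend each 128-bit half with
// pmovsxwd and recombine with vinsertf128, since 256-bit integer shifts are
// unavailable before AVX2.
static __m256i sext_8i16_to_8i32_avx1(__m128i v) {
  __m128i lo = _mm_cvtepi16_epi32(v);                     // vpmovsxwd low 4
  __m128i hi = _mm_cvtepi16_epi32(_mm_srli_si128(v, 8));  // vpmovsxwd high 4
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
}
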
> Modified: llvm/trunk/test/CodeGen/X86/pr15267.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pr15267.ll?rev=237885&r1=237884&r2=237885&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/pr15267.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/pr15267.ll Thu May 21 05:05:03 2015
> @@ -50,20 +50,22 @@ define <4 x i64> @test3(<4 x i1>* %in) n
> ; CHECK: movq
> ; CHECK: shlq
> ; CHECK: sarq
> -; CHECK: vmovq
> ; CHECK: movq
> ; CHECK: shlq
> ; CHECK: sarq
> -; CHECK: vmovq
> -; CHECK: vpunpcklqdq
> +; CHECK: vmovd
> +; CHECK: vpinsrd
> ; CHECK: movq
> ; CHECK: shlq
> ; CHECK: sarq
> -; CHECK: vmovq
> +; CHECK: vpinsrd
> ; CHECK: shlq
> ; CHECK: sarq
> -; CHECK: vmovq
> -; CHECK: vpunpcklqdq
> +; CHECK: vpinsrd
> +; CHECK: vpmovsxdq
> +; CHECK: vmovd
> +; CHECK: vpinsrd
> +; CHECK: vpmovsxdq
> ; CHECK: vinsertf128
> ; CHECK: ret
>
>
> Modified: llvm/trunk/test/CodeGen/X86/vec_cast2.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_cast2.ll?rev=237885&r1=237884&r2=237885&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/vec_cast2.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/vec_cast2.ll Thu May 21 05:05:03 2015
> @@ -16,13 +16,9 @@ define <8 x float> @foo1_8(<8 x i8> %src
> ;
> ; CHECK-WIDE-LABEL: foo1_8:
> ; CHECK-WIDE: ## BB#0:
> -; CHECK-WIDE-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
> -; CHECK-WIDE-NEXT: vpslld $24, %xmm1, %xmm1
> -; CHECK-WIDE-NEXT: vpsrad $24, %xmm1, %xmm1
> -; CHECK-WIDE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
> -; CHECK-WIDE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
> -; CHECK-WIDE-NEXT: vpslld $24, %xmm0, %xmm0
> -; CHECK-WIDE-NEXT: vpsrad $24, %xmm0, %xmm0
> +; CHECK-WIDE-NEXT: vpmovsxbd %xmm0, %xmm1
> +; CHECK-WIDE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
> +; CHECK-WIDE-NEXT: vpmovsxbd %xmm0, %xmm0
> ; CHECK-WIDE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
> ; CHECK-WIDE-NEXT: vcvtdq2ps %ymm0, %ymm0
> ; CHECK-WIDE-NEXT: retl
> @@ -40,9 +36,7 @@ define <4 x float> @foo1_4(<4 x i8> %src
> ;
> ; CHECK-WIDE-LABEL: foo1_4:
> ; CHECK-WIDE: ## BB#0:
> -; CHECK-WIDE-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
> -; CHECK-WIDE-NEXT: vpslld $24, %xmm0, %xmm0
> -; CHECK-WIDE-NEXT: vpsrad $24, %xmm0, %xmm0
> +; CHECK-WIDE-NEXT: vpmovsxbd %xmm0, %xmm0
> ; CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0
> ; CHECK-WIDE-NEXT: retl
> %res = sitofp <4 x i8> %src to <4 x float>
>
> Modified: llvm/trunk/test/CodeGen/X86/vector-sext.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-sext.ll?rev=237885&r1=237884&r2=237885&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/vector-sext.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/vector-sext.ll Thu May 21 05:05:03 2015
> @@ -10,37 +10,30 @@
> define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
> ; SSE2-LABEL: sext_8i16_to_8i32:
> ; SSE2: # BB#0: # %entry
> -; SSE2-NEXT: movdqa %xmm0, %xmm1
> -; SSE2-NEXT: # kill: XMM0<def> XMM1<kill>
> -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
> -; SSE2-NEXT: pslld $16, %xmm0
> -; SSE2-NEXT: psrad $16, %xmm0
> -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
> -; SSE2-NEXT: pslld $16, %xmm1
> +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
> +; SSE2-NEXT: psrad $16, %xmm2
> +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
> +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
> ; SSE2-NEXT: psrad $16, %xmm1
> +; SSE2-NEXT: movdqa %xmm2, %xmm0
> ; SSE2-NEXT: retq
> ;
> ; SSSE3-LABEL: sext_8i16_to_8i32:
> ; SSSE3: # BB#0: # %entry
> -; SSSE3-NEXT: movdqa %xmm0, %xmm1
> -; SSSE3-NEXT: # kill: XMM0<def> XMM1<kill>
> -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
> -; SSSE3-NEXT: pslld $16, %xmm0
> -; SSSE3-NEXT: psrad $16, %xmm0
> -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
> -; SSSE3-NEXT: pslld $16, %xmm1
> +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
> +; SSSE3-NEXT: psrad $16, %xmm2
> +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
> +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
> ; SSSE3-NEXT: psrad $16, %xmm1
> +; SSSE3-NEXT: movdqa %xmm2, %xmm0
> ; SSSE3-NEXT: retq
> ;
> ; SSE41-LABEL: sext_8i16_to_8i32:
> ; SSE41: # BB#0: # %entry
> -; SSE41-NEXT: movdqa %xmm0, %xmm1
> -; SSE41-NEXT: pmovzxwd %xmm1, %xmm0
> -; SSE41-NEXT: pslld $16, %xmm0
> -; SSE41-NEXT: psrad $16, %xmm0
> -; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
> -; SSE41-NEXT: pslld $16, %xmm1
> -; SSE41-NEXT: psrad $16, %xmm1
> +; SSE41-NEXT: pmovsxwd %xmm0, %xmm2
> +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
> +; SSE41-NEXT: pmovsxwd %xmm0, %xmm1
> +; SSE41-NEXT: movdqa %xmm2, %xmm0
> ; SSE41-NEXT: retq
> ;
> ; AVX1-LABEL: sext_8i16_to_8i32:
> @@ -58,13 +51,10 @@ define <8 x i32> @sext_8i16_to_8i32(<8 x
> ;
> ; X32-SSE41-LABEL: sext_8i16_to_8i32:
> ; X32-SSE41: # BB#0: # %entry
> -; X32-SSE41-NEXT: movdqa %xmm0, %xmm1
> -; X32-SSE41-NEXT: pmovzxwd %xmm1, %xmm0
> -; X32-SSE41-NEXT: pslld $16, %xmm0
> -; X32-SSE41-NEXT: psrad $16, %xmm0
> -; X32-SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
> -; X32-SSE41-NEXT: pslld $16, %xmm1
> -; X32-SSE41-NEXT: psrad $16, %xmm1
> +; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm2
> +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
> +; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm1
> +; X32-SSE41-NEXT: movdqa %xmm2, %xmm0
> ; X32-SSE41-NEXT: retl
> entry:
> %B = sext <8 x i16> %A to <8 x i32>
> @@ -74,68 +64,31 @@ entry:
> define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
> ; SSE2-LABEL: sext_4i32_to_4i64:
> ; SSE2: # BB#0: # %entry
> -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
> -; SSE2-NEXT: movd %xmm1, %rax
> -; SSE2-NEXT: cltq
> -; SSE2-NEXT: movd %rax, %xmm2
> -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
> -; SSE2-NEXT: movd %xmm1, %rax
> -; SSE2-NEXT: cltq
> -; SSE2-NEXT: movd %rax, %xmm1
> -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
> -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
> -; SSE2-NEXT: movd %xmm0, %rax
> -; SSE2-NEXT: cltq
> -; SSE2-NEXT: movd %rax, %xmm1
> -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
> -; SSE2-NEXT: movd %xmm0, %rax
> -; SSE2-NEXT: cltq
> -; SSE2-NEXT: movd %rax, %xmm0
> -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
> -; SSE2-NEXT: movdqa %xmm2, %xmm0
> +; SSE2-NEXT: movdqa %xmm0, %xmm2
> +; SSE2-NEXT: psrad $31, %xmm2
> +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
> +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
> +; SSE2-NEXT: movdqa %xmm1, %xmm2
> +; SSE2-NEXT: psrad $31, %xmm2
> +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
> ; SSE2-NEXT: retq
> ;
> ; SSSE3-LABEL: sext_4i32_to_4i64:
> ; SSSE3: # BB#0: # %entry
> -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
> -; SSSE3-NEXT: movd %xmm1, %rax
> -; SSSE3-NEXT: cltq
> -; SSSE3-NEXT: movd %rax, %xmm2
> -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
> -; SSSE3-NEXT: movd %xmm1, %rax
> -; SSSE3-NEXT: cltq
> -; SSSE3-NEXT: movd %rax, %xmm1
> -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
> -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
> -; SSSE3-NEXT: movd %xmm0, %rax
> -; SSSE3-NEXT: cltq
> -; SSSE3-NEXT: movd %rax, %xmm1
> -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
> -; SSSE3-NEXT: movd %xmm0, %rax
> -; SSSE3-NEXT: cltq
> -; SSSE3-NEXT: movd %rax, %xmm0
> -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
> -; SSSE3-NEXT: movdqa %xmm2, %xmm0
> +; SSSE3-NEXT: movdqa %xmm0, %xmm2
> +; SSSE3-NEXT: psrad $31, %xmm2
> +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
> +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
> +; SSSE3-NEXT: movdqa %xmm1, %xmm2
> +; SSSE3-NEXT: psrad $31, %xmm2
> +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
> ; SSSE3-NEXT: retq
> ;
> ; SSE41-LABEL: sext_4i32_to_4i64:
> ; SSE41: # BB#0: # %entry
> -; SSE41-NEXT: pmovzxdq %xmm0, %xmm1
> -; SSE41-NEXT: pextrq $1, %xmm1, %rax
> -; SSE41-NEXT: cltq
> -; SSE41-NEXT: movd %rax, %xmm3
> -; SSE41-NEXT: movd %xmm1, %rax
> -; SSE41-NEXT: cltq
> -; SSE41-NEXT: movd %rax, %xmm2
> -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
> -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
> -; SSE41-NEXT: pextrq $1, %xmm0, %rax
> -; SSE41-NEXT: cltq
> -; SSE41-NEXT: movd %rax, %xmm3
> -; SSE41-NEXT: movd %xmm0, %rax
> -; SSE41-NEXT: cltq
> -; SSE41-NEXT: movd %rax, %xmm1
> -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
> +; SSE41-NEXT: pmovsxdq %xmm0, %xmm2
> +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
> +; SSE41-NEXT: pmovsxdq %xmm0, %xmm1
> ; SSE41-NEXT: movdqa %xmm2, %xmm0
> ; SSE41-NEXT: retq
> ;
> @@ -154,20 +107,9 @@ define <4 x i64> @sext_4i32_to_4i64(<4 x
> ;
> ; X32-SSE41-LABEL: sext_4i32_to_4i64:
> ; X32-SSE41: # BB#0: # %entry
> -; X32-SSE41-NEXT: pmovzxdq %xmm0, %xmm2
> -; X32-SSE41-NEXT: movd %xmm2, %eax
> -; X32-SSE41-NEXT: sarl $31, %eax
> -; X32-SSE41-NEXT: pextrd $2, %xmm2, %ecx
> -; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm2
> -; X32-SSE41-NEXT: sarl $31, %ecx
> -; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm2
> -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
> -; X32-SSE41-NEXT: movd %xmm1, %eax
> -; X32-SSE41-NEXT: sarl $31, %eax
> -; X32-SSE41-NEXT: pextrd $2, %xmm1, %ecx
> -; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm1
> -; X32-SSE41-NEXT: sarl $31, %ecx
> -; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm1
> +; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm2
> +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
> +; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm1
> ; X32-SSE41-NEXT: movdqa %xmm2, %xmm0
> ; X32-SSE41-NEXT: retl
> entry:
> @@ -252,20 +194,26 @@ entry:
> define <2 x i64> @load_sext_test3(<2 x i8> *%ptr) {
> ; SSE2-LABEL: load_sext_test3:
> ; SSE2: # BB#0: # %entry
> -; SSE2-NEXT: movsbq 1(%rdi), %rax
> -; SSE2-NEXT: movd %rax, %xmm1
> -; SSE2-NEXT: movsbq (%rdi), %rax
> -; SSE2-NEXT: movd %rax, %xmm0
> -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
> +; SSE2-NEXT: movzwl (%rdi), %eax
> +; SSE2-NEXT: movd %eax, %xmm0
> +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
> +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
> +; SSE2-NEXT: movdqa %xmm0, %xmm1
> +; SSE2-NEXT: psrad $31, %xmm1
> +; SSE2-NEXT: psrad $24, %xmm0
> +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
> ; SSE2-NEXT: retq
> ;
> ; SSSE3-LABEL: load_sext_test3:
> ; SSSE3: # BB#0: # %entry
> -; SSSE3-NEXT: movsbq 1(%rdi), %rax
> -; SSSE3-NEXT: movd %rax, %xmm1
> -; SSSE3-NEXT: movsbq (%rdi), %rax
> -; SSSE3-NEXT: movd %rax, %xmm0
> -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
> +; SSSE3-NEXT: movzwl (%rdi), %eax
> +; SSSE3-NEXT: movd %eax, %xmm0
> +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
> +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
> +; SSSE3-NEXT: movdqa %xmm0, %xmm1
> +; SSSE3-NEXT: psrad $31, %xmm1
> +; SSSE3-NEXT: psrad $24, %xmm0
> +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
> ; SSSE3-NEXT: retq
> ;
> ; SSE41-LABEL: load_sext_test3:
> @@ -292,20 +240,22 @@ entry:
> define <2 x i64> @load_sext_test4(<2 x i16> *%ptr) {
> ; SSE2-LABEL: load_sext_test4:
> ; SSE2: # BB#0: # %entry
> -; SSE2-NEXT: movswq 2(%rdi), %rax
> -; SSE2-NEXT: movd %rax, %xmm1
> -; SSE2-NEXT: movswq (%rdi), %rax
> -; SSE2-NEXT: movd %rax, %xmm0
> -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
> +; SSE2-NEXT: movd (%rdi), %xmm0
> +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
> +; SSE2-NEXT: movdqa %xmm0, %xmm1
> +; SSE2-NEXT: psrad $31, %xmm1
> +; SSE2-NEXT: psrad $16, %xmm0
> +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
> ; SSE2-NEXT: retq
> ;
> ; SSSE3-LABEL: load_sext_test4:
> ; SSSE3: # BB#0: # %entry
> -; SSSE3-NEXT: movswq 2(%rdi), %rax
> -; SSSE3-NEXT: movd %rax, %xmm1
> -; SSSE3-NEXT: movswq (%rdi), %rax
> -; SSSE3-NEXT: movd %rax, %xmm0
> -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
> +; SSSE3-NEXT: movd (%rdi), %xmm0
> +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
> +; SSSE3-NEXT: movdqa %xmm0, %xmm1
> +; SSSE3-NEXT: psrad $31, %xmm1
> +; SSSE3-NEXT: psrad $16, %xmm0
> +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
> ; SSSE3-NEXT: retq
> ;
> ; SSE41-LABEL: load_sext_test4:
> @@ -332,20 +282,18 @@ entry:
> define <2 x i64> @load_sext_test5(<2 x i32> *%ptr) {
> ; SSE2-LABEL: load_sext_test5:
> ; SSE2: # BB#0: # %entry
> -; SSE2-NEXT: movslq 4(%rdi), %rax
> -; SSE2-NEXT: movd %rax, %xmm1
> -; SSE2-NEXT: movslq (%rdi), %rax
> -; SSE2-NEXT: movd %rax, %xmm0
> -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
> +; SSE2-NEXT: movq (%rdi), %xmm0
> +; SSE2-NEXT: movdqa %xmm0, %xmm1
> +; SSE2-NEXT: psrad $31, %xmm1
> +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
> ; SSE2-NEXT: retq
> ;
> ; SSSE3-LABEL: load_sext_test5:
> ; SSSE3: # BB#0: # %entry
> -; SSSE3-NEXT: movslq 4(%rdi), %rax
> -; SSSE3-NEXT: movd %rax, %xmm1
> -; SSSE3-NEXT: movslq (%rdi), %rax
> -; SSSE3-NEXT: movd %rax, %xmm0
> -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
> +; SSSE3-NEXT: movq (%rdi), %xmm0
> +; SSSE3-NEXT: movdqa %xmm0, %xmm1
> +; SSSE3-NEXT: psrad $31, %xmm1
> +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
> ; SSSE3-NEXT: retq
> ;
> ; SSE41-LABEL: load_sext_test5:
> @@ -410,72 +358,35 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x
> ; SSE2: # BB#0:
> ; SSE2-NEXT: pslld $31, %xmm0
> ; SSE2-NEXT: psrad $31, %xmm0
> -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
> -; SSE2-NEXT: movd %xmm1, %rax
> -; SSE2-NEXT: cltq
> -; SSE2-NEXT: movd %rax, %xmm2
> -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
> -; SSE2-NEXT: movd %xmm1, %rax
> -; SSE2-NEXT: cltq
> -; SSE2-NEXT: movd %rax, %xmm1
> -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
> -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
> -; SSE2-NEXT: movd %xmm0, %rax
> -; SSE2-NEXT: cltq
> -; SSE2-NEXT: movd %rax, %xmm1
> -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
> -; SSE2-NEXT: movd %xmm0, %rax
> -; SSE2-NEXT: cltq
> -; SSE2-NEXT: movd %rax, %xmm0
> -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
> -; SSE2-NEXT: movdqa %xmm2, %xmm0
> +; SSE2-NEXT: movdqa %xmm0, %xmm2
> +; SSE2-NEXT: psrad $31, %xmm2
> +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
> +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
> +; SSE2-NEXT: movdqa %xmm1, %xmm2
> +; SSE2-NEXT: psrad $31, %xmm2
> +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
> ; SSE2-NEXT: retq
> ;
> ; SSSE3-LABEL: sext_4i1_to_4i64:
> ; SSSE3: # BB#0:
> ; SSSE3-NEXT: pslld $31, %xmm0
> ; SSSE3-NEXT: psrad $31, %xmm0
> -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
> -; SSSE3-NEXT: movd %xmm1, %rax
> -; SSSE3-NEXT: cltq
> -; SSSE3-NEXT: movd %rax, %xmm2
> -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
> -; SSSE3-NEXT: movd %xmm1, %rax
> -; SSSE3-NEXT: cltq
> -; SSSE3-NEXT: movd %rax, %xmm1
> -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
> -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
> -; SSSE3-NEXT: movd %xmm0, %rax
> -; SSSE3-NEXT: cltq
> -; SSSE3-NEXT: movd %rax, %xmm1
> -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
> -; SSSE3-NEXT: movd %xmm0, %rax
> -; SSSE3-NEXT: cltq
> -; SSSE3-NEXT: movd %rax, %xmm0
> -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
> -; SSSE3-NEXT: movdqa %xmm2, %xmm0
> +; SSSE3-NEXT: movdqa %xmm0, %xmm2
> +; SSSE3-NEXT: psrad $31, %xmm2
> +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
> +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
> +; SSSE3-NEXT: movdqa %xmm1, %xmm2
> +; SSSE3-NEXT: psrad $31, %xmm2
> +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
> ; SSSE3-NEXT: retq
> ;
> ; SSE41-LABEL: sext_4i1_to_4i64:
> ; SSE41: # BB#0:
> ; SSE41-NEXT: pslld $31, %xmm0
> ; SSE41-NEXT: psrad $31, %xmm0
> -; SSE41-NEXT: pmovzxdq %xmm0, %xmm1
> -; SSE41-NEXT: pextrq $1, %xmm1, %rax
> -; SSE41-NEXT: cltq
> -; SSE41-NEXT: movd %rax, %xmm3
> -; SSE41-NEXT: movd %xmm1, %rax
> -; SSE41-NEXT: cltq
> -; SSE41-NEXT: movd %rax, %xmm2
> -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
> -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
> -; SSE41-NEXT: pextrq $1, %xmm0, %rax
> -; SSE41-NEXT: cltq
> -; SSE41-NEXT: movd %rax, %xmm3
> -; SSE41-NEXT: movd %xmm0, %rax
> -; SSE41-NEXT: cltq
> -; SSE41-NEXT: movd %rax, %xmm1
> -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
> +; SSE41-NEXT: pmovsxdq %xmm0, %xmm2
> +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
> +; SSE41-NEXT: pmovsxdq %xmm0, %xmm1
> ; SSE41-NEXT: movdqa %xmm2, %xmm0
> ; SSE41-NEXT: retq
> ;
> @@ -500,20 +411,9 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x
> ; X32-SSE41: # BB#0:
> ; X32-SSE41-NEXT: pslld $31, %xmm0
> ; X32-SSE41-NEXT: psrad $31, %xmm0
> -; X32-SSE41-NEXT: pmovzxdq %xmm0, %xmm2
> -; X32-SSE41-NEXT: movd %xmm2, %eax
> -; X32-SSE41-NEXT: sarl $31, %eax
> -; X32-SSE41-NEXT: pextrd $2, %xmm2, %ecx
> -; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm2
> -; X32-SSE41-NEXT: sarl $31, %ecx
> -; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm2
> -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
> -; X32-SSE41-NEXT: movd %xmm1, %eax
> -; X32-SSE41-NEXT: sarl $31, %eax
> -; X32-SSE41-NEXT: pextrd $2, %xmm1, %ecx
> -; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm1
> -; X32-SSE41-NEXT: sarl $31, %ecx
> -; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm1
> +; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm2
> +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
> +; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm1
> ; X32-SSE41-NEXT: movdqa %xmm2, %xmm0
> ; X32-SSE41-NEXT: retl
> %extmask = sext <4 x i1> %mask to <4 x i64>
> @@ -576,72 +476,35 @@ define <4 x i64> @sext_4i8_to_4i64(<4 x
> ; SSE2: # BB#0:
> ; SSE2-NEXT: pslld $24, %xmm0
> ; SSE2-NEXT: psrad $24, %xmm0
> -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
> -; SSE2-NEXT: movd %xmm1, %rax
> -; SSE2-NEXT: cltq
> -; SSE2-NEXT: movd %rax, %xmm2
> -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
> -; SSE2-NEXT: movd %xmm1, %rax
> -; SSE2-NEXT: cltq
> -; SSE2-NEXT: movd %rax, %xmm1
> -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
> -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
> -; SSE2-NEXT: movd %xmm0, %rax
> -; SSE2-NEXT: cltq
> -; SSE2-NEXT: movd %rax, %xmm1
> -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
> -; SSE2-NEXT: movd %xmm0, %rax
> -; SSE2-NEXT: cltq
> -; SSE2-NEXT: movd %rax, %xmm0
> -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
> -; SSE2-NEXT: movdqa %xmm2, %xmm0
> +; SSE2-NEXT: movdqa %xmm0, %xmm2
> +; SSE2-NEXT: psrad $31, %xmm2
> +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
> +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
> +; SSE2-NEXT: movdqa %xmm1, %xmm2
> +; SSE2-NEXT: psrad $31, %xmm2
> +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
> ; SSE2-NEXT: retq
> ;
> ; SSSE3-LABEL: sext_4i8_to_4i64:
> ; SSSE3: # BB#0:
> ; SSSE3-NEXT: pslld $24, %xmm0
> ; SSSE3-NEXT: psrad $24, %xmm0
> -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
> -; SSSE3-NEXT: movd %xmm1, %rax
> -; SSSE3-NEXT: cltq
> -; SSSE3-NEXT: movd %rax, %xmm2
> -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
> -; SSSE3-NEXT: movd %xmm1, %rax
> -; SSSE3-NEXT: cltq
> -; SSSE3-NEXT: movd %rax, %xmm1
> -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
> -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
> -; SSSE3-NEXT: movd %xmm0, %rax
> -; SSSE3-NEXT: cltq
> -; SSSE3-NEXT: movd %rax, %xmm1
> -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
> -; SSSE3-NEXT: movd %xmm0, %rax
> -; SSSE3-NEXT: cltq
> -; SSSE3-NEXT: movd %rax, %xmm0
> -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
> -; SSSE3-NEXT: movdqa %xmm2, %xmm0
> +; SSSE3-NEXT: movdqa %xmm0, %xmm2
> +; SSSE3-NEXT: psrad $31, %xmm2
> +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
> +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
> +; SSSE3-NEXT: movdqa %xmm1, %xmm2
> +; SSSE3-NEXT: psrad $31, %xmm2
> +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
> ; SSSE3-NEXT: retq
> ;
> ; SSE41-LABEL: sext_4i8_to_4i64:
> ; SSE41: # BB#0:
> ; SSE41-NEXT: pslld $24, %xmm0
> ; SSE41-NEXT: psrad $24, %xmm0
> -; SSE41-NEXT: pmovzxdq %xmm0, %xmm1
> -; SSE41-NEXT: pextrq $1, %xmm1, %rax
> -; SSE41-NEXT: cltq
> -; SSE41-NEXT: movd %rax, %xmm3
> -; SSE41-NEXT: movd %xmm1, %rax
> -; SSE41-NEXT: cltq
> -; SSE41-NEXT: movd %rax, %xmm2
> -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
> -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
> -; SSE41-NEXT: pextrq $1, %xmm0, %rax
> -; SSE41-NEXT: cltq
> -; SSE41-NEXT: movd %rax, %xmm3
> -; SSE41-NEXT: movd %xmm0, %rax
> -; SSE41-NEXT: cltq
> -; SSE41-NEXT: movd %rax, %xmm1
> -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
> +; SSE41-NEXT: pmovsxdq %xmm0, %xmm2
> +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
> +; SSE41-NEXT: pmovsxdq %xmm0, %xmm1
> ; SSE41-NEXT: movdqa %xmm2, %xmm0
> ; SSE41-NEXT: retq
> ;
> @@ -666,20 +529,9 @@ define <4 x i64> @sext_4i8_to_4i64(<4 x
> ; X32-SSE41: # BB#0:
> ; X32-SSE41-NEXT: pslld $24, %xmm0
> ; X32-SSE41-NEXT: psrad $24, %xmm0
> -; X32-SSE41-NEXT: pmovzxdq %xmm0, %xmm2
> -; X32-SSE41-NEXT: movd %xmm2, %eax
> -; X32-SSE41-NEXT: sarl $31, %eax
> -; X32-SSE41-NEXT: pextrd $2, %xmm2, %ecx
> -; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm2
> -; X32-SSE41-NEXT: sarl $31, %ecx
> -; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm2
> -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
> -; X32-SSE41-NEXT: movd %xmm1, %eax
> -; X32-SSE41-NEXT: sarl $31, %eax
> -; X32-SSE41-NEXT: pextrd $2, %xmm1, %ecx
> -; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm1
> -; X32-SSE41-NEXT: sarl $31, %ecx
> -; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm1
> +; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm2
> +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
> +; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm1
> ; X32-SSE41-NEXT: movdqa %xmm2, %xmm0
> ; X32-SSE41-NEXT: retl
> %extmask = sext <4 x i8> %mask to <4 x i64>
>
> Modified: llvm/trunk/test/CodeGen/X86/vselect-avx.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vselect-avx.ll?rev=237885&r1=237884&r2=237885&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/vselect-avx.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/vselect-avx.ll Thu May 21 05:05:03 2015
> @@ -14,8 +14,8 @@ target triple = "x86_64-apple-macosx"
> ; <rdar://problem/18675020>
>
> ; CHECK-LABEL: test:
> -; CHECK: vmovdqa {{.*#+}} xmm0 = [65535,0,0,65535]
> -; CHECK: vmovdqa {{.*#+}} xmm2 = [65533,124,125,14807]
> +; CHECK: vmovdqa {{.*#+}} xmm1 = [65533,124,125,14807]
> +; CHECK: vmovdqa {{.*#+}} xmm1 = [65535,0,0,65535]
> ; CHECK: ret
> define void @test(<4 x i16>* %a, <4 x i16>* %b) {
> body:
> @@ -33,13 +33,14 @@ body:
> ; of the condition.
> ;
> ; CHECK-LABEL: test2:
> -; CHECK: vpslld $31, %xmm0, %xmm0
> -; CHECK-NEXT: vpmovsxdq %xmm0, %xmm1
> -; CHECK-NEXT: vpshufd $78, %xmm0, %xmm0 ## xmm0 = xmm0[2,3,0,1]
> -; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0
> -; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, [[MASK:%ymm[0-9]+]]
> -; CHECK: vblendvpd [[MASK]]
> -; CHECK: retq
> +; CHECK: vpslld $31, %xmm0, %xmm0
> +; CHECK-NEXT: vpsrad $31, %xmm0, %xmm0
> +; CHECK-NEXT: vpmovsxdq %xmm0, %xmm1
> +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
> +; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0
> +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, [[MASK:%ymm[0-9]+]]
> +; CHECK: vblendvpd [[MASK]]
> +; CHECK: retq
> define void @test2(double** %call1559, i64 %indvars.iv4198, <4 x i1> %tmp1895) {
> bb:
> %arrayidx1928 = getelementptr inbounds double*, double** %call1559, i64 %indvars.iv4198
>
--
Jeroen Ketema - Senior Software Engineer, Compilers
Codeplay Software Ltd.
45 York Place, Edinburgh, EH1 3HP
Tel: 0131 466 0503
Fax: 0131 557 6600
Website: http://www.codeplay.com
Twitter: https://twitter.com/codeplaysoft