Index: X86ISelLowering.h
===================================================================
--- X86ISelLowering.h	(revision 61100)
+++ X86ISelLowering.h	(working copy)
@@ -600,6 +600,7 @@
     SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG);
     SDValue LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerVectorShift(SDValue Op, SelectionDAG &DAG);
     SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG);
     SDValue LowerREADCYCLECOUNTER(SDValue Op, SelectionDAG &DAG);
Index: X86ISelLowering.cpp
===================================================================
--- X86ISelLowering.cpp	(revision 61100)
+++ X86ISelLowering.cpp	(working copy)
@@ -699,6 +700,23 @@
     setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
     setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
 
+    // Shifts are custom-lowered: we don't directly support
+    // any of these instructions, but there is a simple mapping
+    // in many cases when the RHS is a splat, and there
+    // are some tricks for constant shifts
+    setOperationAction(ISD::SHL, MVT::v16i8, Custom);
+    setOperationAction(ISD::SHL, MVT::v8i16, Custom);
+    setOperationAction(ISD::SHL, MVT::v4i32, Custom);
+    setOperationAction(ISD::SHL, MVT::v2i64, Custom);
+    setOperationAction(ISD::SRL, MVT::v16i8, Custom);
+    setOperationAction(ISD::SRL, MVT::v8i16, Custom);
+    setOperationAction(ISD::SRL, MVT::v4i32, Custom);
+    setOperationAction(ISD::SRL, MVT::v2i64, Custom);
+    setOperationAction(ISD::SRA, MVT::v16i8, Custom);
+    setOperationAction(ISD::SRA, MVT::v8i16, Custom);
+    setOperationAction(ISD::SRA, MVT::v4i32, Custom);
+    setOperationAction(ISD::SRA, MVT::v2i64, Custom);
+
     setOperationAction(ISD::VSETCC, MVT::v2f64, Custom);
     setOperationAction(ISD::VSETCC, MVT::v16i8, Custom);
     setOperationAction(ISD::VSETCC, MVT::v8i16, Custom);
@@ -2795,13 +2813,43 @@
-/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
-/// all the same.
+/// isSplatVector - Returns true if N is a splat vector: a BUILD_VECTOR whose
+/// operands are all the same, a splat VECTOR_SHUFFLE, or a load of a splat
+/// constant-pool vector (possibly behind a BIT_CONVERT).
-static bool isSplatVector(SDNode *N) {
-  if (N->getOpcode() != ISD::BUILD_VECTOR)
+static bool isSplatVector(const SDValue &N) {
+  // First, try to find a splat constant: a (possibly bitconverted)
+  // non-indexed load of a splat constant-pool vector.
+  SDNode *SearchNode = N.getNode();
+  if (N.getOpcode() == ISD::BIT_CONVERT)
+    SearchNode = N.getOperand(0).getNode();
+  if (LoadSDNode *Load = dyn_cast<LoadSDNode>(SearchNode)) {
+    if (!Load->isIndexed() &&
+        Load->getOperand(1).getOpcode() == X86ISD::Wrapper) {
+      SDNode *LoadedNode = Load->getOperand(1).getOperand(0).getNode();
+      if (ConstantPoolSDNode *CPNode =
+            dyn_cast<ConstantPoolSDNode>(LoadedNode)) {
+        if (!CPNode->isMachineConstantPoolEntry()) {
+          Constant *C = CPNode->getConstVal();
+          // Only a constant of the same vector type can be a splat of N.
+          if (C->getType() == N.getValueType().getTypeForMVT()) {
+            if (ConstantVector *V = dyn_cast<ConstantVector>(C)) {
+              return V->getSplatValue() != 0;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // Look at shuffles
+  if (N.getOpcode() == ISD::VECTOR_SHUFFLE) {
+    // Grossly simplified check; this could be made more effective,
+    // but I don't know which cases are common
+    return isSplatMask(N.getOperand(2).getNode());
+  }
+
+  if (N.getOpcode() != ISD::BUILD_VECTOR)
     return false;
 
-  SDValue SplatValue = N->getOperand(0);
-  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
-    if (N->getOperand(i) != SplatValue)
+  SDValue SplatValue = N.getOperand(0);
+  for (unsigned i = 1, e = N.getNumOperands(); i != e; ++i)
+    if (N.getOperand(i) != SplatValue)
       return false;
 
   return true;
 }
@@ -4087,8 +4134,8 @@
   bool Commuted = false;
   // FIXME: This should also accept a bitcast of a splat? Be careful, not
   // 1,1,1,1 -> v8i16 though.
-  V1IsSplat = isSplatVector(V1.getNode());
-  V2IsSplat = isSplatVector(V2.getNode());
+  V1IsSplat = isSplatVector(V1);
+  V2IsSplat = isSplatVector(V2);
 
   // Canonicalize the splat or undef, if present, to be on the RHS.
   if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) {
@@ -6216,6 +6300,84 @@
   return cpOut;
 }
 
+SDValue X86TargetLowering::LowerVectorShift(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op.getValueType();
+  if (isSplatVector(Op.getOperand(1))) {
+    if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16) {
+      // The easy case, with native support
+      // FIXME: This could be made a lot smarter about simplifying
+      // the shift input (for example, using the immediate form
+      // constants)
+      unsigned IntNo = 0;
+      if (Op.getOpcode() == ISD::SHL) {
+        if (VT == MVT::v2i64) IntNo = Intrinsic::x86_sse2_psll_q;
+        if (VT == MVT::v4i32) IntNo = Intrinsic::x86_sse2_psll_d;
+        if (VT == MVT::v8i16) IntNo = Intrinsic::x86_sse2_psll_w;
+      } else if (Op.getOpcode() == ISD::SRL) {
+        if (VT == MVT::v2i64) IntNo = Intrinsic::x86_sse2_psrl_q;
+        if (VT == MVT::v4i32) IntNo = Intrinsic::x86_sse2_psrl_d;
+        if (VT == MVT::v8i16) IntNo = Intrinsic::x86_sse2_psrl_w;
+      } else if (Op.getOpcode() == ISD::SRA) { // no psra.q exists: v2i64 SRA leaves IntNo == 0
+        if (VT == MVT::v4i32) IntNo = Intrinsic::x86_sse2_psra_d;
+        if (VT == MVT::v8i16) IntNo = Intrinsic::x86_sse2_psra_w;
+      }
+
+      // Build the shift-count operand: element 0 of the splat, inserted
+      // into a zero vector so the upper bits of the xmm count are clear.
+      SDValue ShiftAmount;
+      ShiftAmount = DAG.getNode(ISD::EXTRACT_VECTOR_ELT,
+                                VT.getVectorElementType(),
+                                Op.getOperand(1),
+                                DAG.getConstant(0, getPointerTy()));
+      ShiftAmount = DAG.getNode(ISD::INSERT_VECTOR_ELT, VT,
+                                DAG.getConstant(0, VT),
+                                ShiftAmount,
+                                DAG.getConstant(0, getPointerTy()));
+
+      if (IntNo)
+        return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, VT,
+                           DAG.getConstant(IntNo, MVT::i32),
+                           Op.getOperand(0), ShiftAmount);
+    }
+
+    // FIXME: Add v16i8 shifts (not directly supported, but
+    // can be done reasonably cheaply, especially if the
+    // amount is constant).
+  }
+
+  // FIXME: We can support constant v8i16 non-splat shift amounts using
+  // multiplication: left shifts using mul, right shifts
+  // using PMULH(U)W.  (The only tricky bit is right-shifts with
+  // zero components, for which an extra blend is needed.)
+
+  // FIXME: If we have SSE4.1, we can transform constant v4i32
+  // non-splat left shifts into PMULLD.
+
+  // FIXME: If there are only a couple of different shift amounts,
+  // we can do two shifts and blend them together (although
+  // we'll have to measure when this ends up being a speed boost).
+
+  // FIXME: Tricks for <1,1,1...> << vec (not especially fast, but
+  // might be faster than scalarizing):
+  //
+  // We can do <4 x i32> <1,1,1...> << <4 x i32> in only 4
+  // operations using fptosi (i.e. CVTTPS2DQ).  And CVTTPS2DQ is a
+  // surprisingly fast instruction.
+  //
+  // We can use the above plus some reorganization to do
+  // the above for a <8 x i16> in about 8 operations.
+  //
+  // We can do a parallel binary search with pcmp and masking to do
+  // <16 x i8> <1,1,1...> << var
+  // in about 10 operations with about 6 constants.  It's probably
+  // not worthwhile for larger shifts.
+  //
+  // If we have SSSE3, <16 x i8> <1,1,1...> << var can be
+  // done in a single PSHUFB with a constant like <16 x i8> <1,2,4,8,...>.
+  // <8 x i16> <1,1,1...> << var can be done similarly with a few more
+  // operations.
+
+  // No usable mapping found; returning an empty SDValue asks the
+  // legalizer to fall back to its default expansion.
+  return SDValue();
+}
+
 SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
                                                  SelectionDAG &DAG) {
   assert(Subtarget->is64Bit() && "Result not type legalized?");
@@ -6305,6 +6467,9 @@
   case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
   case ISD::CTLZ:        return LowerCTLZ(Op, DAG);
   case ISD::CTTZ:        return LowerCTTZ(Op, DAG);
+  case ISD::SHL:
+  case ISD::SRL:
+  case ISD::SRA:         return LowerVectorShift(Op, DAG);
   case ISD::SADDO:
   case ISD::UADDO:
   case ISD::SSUBO: