[llvm] r348353 - [SelectionDAG] Initial support for FSHL/FSHR funnel shift opcodes (PR39467)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Dec 5 03:12:12 PST 2018
Author: rksimon
Date: Wed Dec 5 03:12:12 2018
New Revision: 348353
URL: http://llvm.org/viewvc/llvm-project?rev=348353&view=rev
Log:
[SelectionDAG] Initial support for FSHL/FSHR funnel shift opcodes (PR39467)
This is an initial patch to add a minimum level of support for funnel shifts to the SelectionDAG and to begin wiring it up to the X86 SHLD/SHRD instructions.
Some partial legalization code has been added to handle the 'SlowSHLD' case, where we want to expand instead, and I've added a few DAG combines so we don't get regressions from the existing DAG builder expansion code.
Differential Revision: https://reviews.llvm.org/D54698
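
For orientation, the semantics being wired up here (matching the llvm.fshl/llvm.fshr intrinsics and the ISD::FSHL/ISD::FSHR formulas added below) can be modelled in scalar C++. This is only an illustrative sketch, not code from the patch:

// Scalar reference model of funnel shifts on a 32-bit element. The shift
// amount is treated as unsigned modulo the element width.
#include <cstdint>

uint32_t fshl32(uint32_t X, uint32_t Y, uint32_t Z) {
  unsigned Amt = Z % 32;
  if (Amt == 0)
    return X; // zero amount returns the first operand; avoids a shift by 32
  return (X << Amt) | (Y >> (32 - Amt));
}

uint32_t fshr32(uint32_t X, uint32_t Y, uint32_t Z) {
  unsigned Amt = Z % 32;
  if (Amt == 0)
    return Y; // zero amount returns the second operand
  return (X << (32 - Amt)) | (Y >> Amt);
}

Viewed as a 'double' shift, fshl returns the high 32 bits of the 64-bit concatenation X:Y shifted left by Z % 32, which is what x86 SHLD computes for a non-zero amount; SHRD is the right-shift counterpart used for fshr.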
Modified:
llvm/trunk/include/llvm/CodeGen/ISDOpcodes.h
llvm/trunk/include/llvm/CodeGen/TargetLowering.h
llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp
llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
llvm/trunk/test/CodeGen/X86/fshl.ll
llvm/trunk/test/CodeGen/X86/fshr.ll
llvm/trunk/test/CodeGen/X86/funnel-shift.ll
Modified: llvm/trunk/include/llvm/CodeGen/ISDOpcodes.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/ISDOpcodes.h?rev=348353&r1=348352&r2=348353&view=diff
==============================================================================
--- llvm/trunk/include/llvm/CodeGen/ISDOpcodes.h (original)
+++ llvm/trunk/include/llvm/CodeGen/ISDOpcodes.h Wed Dec 5 03:12:12 2018
@@ -394,9 +394,13 @@ namespace ISD {
/// When the 1st operand is a vector, the shift amount must be in the same
/// type. (TLI.getShiftAmountTy() will return the same type when the input
/// type is a vector.)
- /// For rotates, the shift amount is treated as an unsigned amount modulo
- /// the element size of the first operand.
- SHL, SRA, SRL, ROTL, ROTR,
+ /// For rotates and funnel shifts, the shift amount is treated as an unsigned
+ /// amount modulo the element size of the first operand.
+ ///
+ /// Funnel 'double' shifts take 3 operands, 2 inputs and the shift amount.
+ /// fshl(X,Y,Z): (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
+ /// fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
+ SHL, SRA, SRL, ROTL, ROTR, FSHL, FSHR,
/// Byte Swap and Counting operators.
BSWAP, CTTZ, CTLZ, CTPOP, BITREVERSE,
Modified: llvm/trunk/include/llvm/CodeGen/TargetLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/TargetLowering.h?rev=348353&r1=348352&r2=348353&view=diff
==============================================================================
--- llvm/trunk/include/llvm/CodeGen/TargetLowering.h (original)
+++ llvm/trunk/include/llvm/CodeGen/TargetLowering.h Wed Dec 5 03:12:12 2018
@@ -3689,6 +3689,12 @@ public:
SDValue LL = SDValue(), SDValue LH = SDValue(),
SDValue RL = SDValue(), SDValue RH = SDValue()) const;
+ /// Expand funnel shift.
+ /// \param N Node to expand
+ /// \param Result output after conversion
+ /// \returns True, if the expansion was successful, false otherwise
+ bool expandFunnelShift(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;
+
/// Expand float(f32) to SINT(i64) conversion
/// \param N Node to expand
/// \param Result output after conversion
Modified: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp?rev=348353&r1=348352&r2=348353&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp Wed Dec 5 03:12:12 2018
@@ -325,6 +325,7 @@ namespace {
SDValue visitSHL(SDNode *N);
SDValue visitSRA(SDNode *N);
SDValue visitSRL(SDNode *N);
+ SDValue visitFunnelShift(SDNode *N);
SDValue visitRotate(SDNode *N);
SDValue visitABS(SDNode *N);
SDValue visitBSWAP(SDNode *N);
@@ -1513,6 +1514,8 @@ SDValue DAGCombiner::visit(SDNode *N) {
case ISD::SRL: return visitSRL(N);
case ISD::ROTR:
case ISD::ROTL: return visitRotate(N);
+ case ISD::FSHL:
+ case ISD::FSHR: return visitFunnelShift(N);
case ISD::ABS: return visitABS(N);
case ISD::BSWAP: return visitBSWAP(N);
case ISD::BITREVERSE: return visitBITREVERSE(N);
@@ -6925,6 +6928,39 @@ SDValue DAGCombiner::visitSRL(SDNode *N)
return SDValue();
}
+
+SDValue DAGCombiner::visitFunnelShift(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ bool IsFSHL = N->getOpcode() == ISD::FSHL;
+ unsigned BitWidth = VT.getScalarSizeInBits();
+
+ // fold (fshl N0, N1, 0) -> N0
+ // fold (fshr N0, N1, 0) -> N1
+ if (DAG.MaskedValueIsZero(N2, APInt::getAllOnesValue(BitWidth)))
+ return IsFSHL ? N0 : N1;
+
+ // fold (fsh* N0, N1, c) -> (fsh* N0, N1, c % BitWidth)
+ if (ConstantSDNode *Cst = isConstOrConstSplat(N2)) {
+ if (Cst->getAPIntValue().uge(BitWidth)) {
+ uint64_t RotAmt = Cst->getAPIntValue().urem(BitWidth);
+ return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N0, N1,
+ DAG.getConstant(RotAmt, SDLoc(N), N2.getValueType()));
+ }
+ }
+
+ // fold (fshl N0, N0, N2) -> (rotl N0, N2)
+ // fold (fshr N0, N0, N2) -> (rotr N0, N2)
+ // TODO: Investigate flipping this rotate if only one is legal, if funnel shift
+ // is legal as well we might be better off avoiding non-constant (BW - N2).
+ unsigned RotOpc = IsFSHL ? ISD::ROTL : ISD::ROTR;
+ if (N0 == N1 && hasOperation(RotOpc, VT))
+ return DAG.getNode(RotOpc, SDLoc(N), VT, N0, N2);
+
+ return SDValue();
+}
SDValue DAGCombiner::visitABS(SDNode *N) {
SDValue N0 = N->getOperand(0);
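
The folds added in visitFunnelShift are easy to sanity-check at the value level; the snippet below is an illustrative scalar i32 check (using the reference model above), not part of the patch:

#include <cassert>
#include <cstdint>

static uint32_t fshl32(uint32_t X, uint32_t Y, uint32_t Z) {
  unsigned A = Z % 32;
  return A ? (X << A) | (Y >> (32 - A)) : X;
}

static uint32_t rotl32(uint32_t X, uint32_t Z) {
  unsigned A = Z % 32;
  return A ? (X << A) | (X >> (32 - A)) : X;
}

int main() {
  uint32_t X = 0x12345678, Y = 0x9ABCDEF0;
  assert(fshl32(X, Y, 0) == X);                // (fshl N0, N1, 0) -> N0
  assert(fshl32(X, Y, 37) == fshl32(X, Y, 5)); // amount folds modulo the bit width
  assert(fshl32(X, X, 13) == rotl32(X, 13));   // (fshl N0, N0, N2) -> (rotl N0, N2)
  return 0;
}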
Modified: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp?rev=348353&r1=348352&r2=348353&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp Wed Dec 5 03:12:12 2018
@@ -1170,6 +1170,8 @@ void SelectionDAGLegalize::LegalizeOp(SD
}
}
break;
+ case ISD::FSHL:
+ case ISD::FSHR:
case ISD::SRL_PARTS:
case ISD::SRA_PARTS:
case ISD::SHL_PARTS: {
@@ -3262,6 +3264,11 @@ bool SelectionDAGLegalize::ExpandNode(SD
}
break;
}
+ case ISD::FSHL:
+ case ISD::FSHR:
+ if (TLI.expandFunnelShift(Node, Tmp1, DAG))
+ Results.push_back(Tmp1);
+ break;
case ISD::SADDSAT:
case ISD::UADDSAT:
case ISD::SSUBSAT:
Modified: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp?rev=348353&r1=348352&r2=348353&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp Wed Dec 5 03:12:12 2018
@@ -129,12 +129,13 @@ class VectorLegalizer {
SDValue ExpandFNEG(SDValue Op);
SDValue ExpandFSUB(SDValue Op);
SDValue ExpandBITREVERSE(SDValue Op);
- SDValue ExpandCTPOP(SDValue Op);
- SDValue ExpandCTLZ(SDValue Op);
- SDValue ExpandCTTZ(SDValue Op);
- SDValue ExpandFMINNUM_FMAXNUM(SDValue Op);
- SDValue ExpandStrictFPOp(SDValue Op);
-
+ SDValue ExpandCTPOP(SDValue Op);
+ SDValue ExpandCTLZ(SDValue Op);
+ SDValue ExpandCTTZ(SDValue Op);
+ SDValue ExpandFunnelShift(SDValue Op);
+ SDValue ExpandFMINNUM_FMAXNUM(SDValue Op);
+ SDValue ExpandStrictFPOp(SDValue Op);
+
/// Implements vector promotion.
///
/// This is essentially just bitcasting the operands to a different type and
@@ -746,12 +747,15 @@ SDValue VectorLegalizer::Expand(SDValue
case ISD::CTLZ:
case ISD::CTLZ_ZERO_UNDEF:
return ExpandCTLZ(Op);
- case ISD::CTTZ:
- case ISD::CTTZ_ZERO_UNDEF:
- return ExpandCTTZ(Op);
- case ISD::FMINNUM:
- case ISD::FMAXNUM:
- return ExpandFMINNUM_FMAXNUM(Op);
+ case ISD::CTTZ:
+ case ISD::CTTZ_ZERO_UNDEF:
+ return ExpandCTTZ(Op);
+ case ISD::FSHL:
+ case ISD::FSHR:
+ return ExpandFunnelShift(Op);
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
+ return ExpandFMINNUM_FMAXNUM(Op);
case ISD::STRICT_FADD:
case ISD::STRICT_FSUB:
case ISD::STRICT_FMUL:
@@ -1123,32 +1127,40 @@ SDValue VectorLegalizer::ExpandFSUB(SDVa
return Op; // Defer to LegalizeDAG
return DAG.UnrollVectorOp(Op.getNode());
-}
-
-SDValue VectorLegalizer::ExpandCTPOP(SDValue Op) {
- SDValue Result;
- if (TLI.expandCTPOP(Op.getNode(), Result, DAG))
- return Result;
-
- return DAG.UnrollVectorOp(Op.getNode());
-}
-
-SDValue VectorLegalizer::ExpandCTLZ(SDValue Op) {
- SDValue Result;
- if (TLI.expandCTLZ(Op.getNode(), Result, DAG))
- return Result;
-
- return DAG.UnrollVectorOp(Op.getNode());
-}
-
-SDValue VectorLegalizer::ExpandCTTZ(SDValue Op) {
- SDValue Result;
- if (TLI.expandCTTZ(Op.getNode(), Result, DAG))
- return Result;
-
- return DAG.UnrollVectorOp(Op.getNode());
-}
-
+}
+
+SDValue VectorLegalizer::ExpandCTPOP(SDValue Op) {
+ SDValue Result;
+ if (TLI.expandCTPOP(Op.getNode(), Result, DAG))
+ return Result;
+
+ return DAG.UnrollVectorOp(Op.getNode());
+}
+
+SDValue VectorLegalizer::ExpandCTLZ(SDValue Op) {
+ SDValue Result;
+ if (TLI.expandCTLZ(Op.getNode(), Result, DAG))
+ return Result;
+
+ return DAG.UnrollVectorOp(Op.getNode());
+}
+
+SDValue VectorLegalizer::ExpandCTTZ(SDValue Op) {
+ SDValue Result;
+ if (TLI.expandCTTZ(Op.getNode(), Result, DAG))
+ return Result;
+
+ return DAG.UnrollVectorOp(Op.getNode());
+}
+
+SDValue VectorLegalizer::ExpandFunnelShift(SDValue Op) {
+ SDValue Result;
+ if (TLI.expandFunnelShift(Op.getNode(), Result, DAG))
+ return Result;
+
+ return DAG.UnrollVectorOp(Op.getNode());
+}
+
SDValue VectorLegalizer::ExpandFMINNUM_FMAXNUM(SDValue Op) {
if (SDValue Expanded = TLI.expandFMINNUM_FMAXNUM(Op.getNode(), DAG))
return Expanded;
Modified: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp?rev=348353&r1=348352&r2=348353&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp Wed Dec 5 03:12:12 2018
@@ -5751,6 +5751,12 @@ SelectionDAGBuilder::visitIntrinsicCall(
SDValue Zero = DAG.getConstant(0, sdl, VT);
SDValue ShAmt = DAG.getNode(ISD::UREM, sdl, VT, Z, BitWidthC);
+ auto FunnelOpcode = IsFSHL ? ISD::FSHL : ISD::FSHR;
+ if (TLI.isOperationLegalOrCustom(FunnelOpcode, VT)) {
+ setValue(&I, DAG.getNode(FunnelOpcode, sdl, VT, X, Y, Z));
+ return nullptr;
+ }
+
// When X == Y, this is rotate. If the data type has a power-of-2 size, we
// avoid the select that is necessary in the general case to filter out
// the 0-shift possibility that leads to UB.
Modified: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp?rev=348353&r1=348352&r2=348353&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp Wed Dec 5 03:12:12 2018
@@ -237,6 +237,8 @@ std::string SDNode::getOperationName(con
case ISD::SRL: return "srl";
case ISD::ROTL: return "rotl";
case ISD::ROTR: return "rotr";
+ case ISD::FSHL: return "fshl";
+ case ISD::FSHR: return "fshr";
case ISD::FADD: return "fadd";
case ISD::STRICT_FADD: return "strict_fadd";
case ISD::FSUB: return "fsub";
Modified: llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp?rev=348353&r1=348352&r2=348353&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp Wed Dec 5 03:12:12 2018
@@ -4114,6 +4114,54 @@ bool TargetLowering::expandMUL(SDNode *N
return Ok;
}
+bool TargetLowering::expandFunnelShift(SDNode *Node, SDValue &Result,
+ SelectionDAG &DAG) const {
+ EVT VT = Node->getValueType(0);
+
+ if (VT.isVector() && (!isOperationLegalOrCustom(ISD::SHL, VT) ||
+ !isOperationLegalOrCustom(ISD::SRL, VT) ||
+ !isOperationLegalOrCustom(ISD::SUB, VT) ||
+ !isOperationLegalOrCustomOrPromote(ISD::OR, VT)))
+ return false;
+
+ // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
+ // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
+ SDValue X = Node->getOperand(0);
+ SDValue Y = Node->getOperand(1);
+ SDValue Z = Node->getOperand(2);
+
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ bool IsFSHL = Node->getOpcode() == ISD::FSHL;
+ SDLoc DL(SDValue(Node, 0));
+
+ EVT ShVT = Z.getValueType();
+ SDValue BitWidthC = DAG.getConstant(EltSizeInBits, DL, ShVT);
+ SDValue Zero = DAG.getConstant(0, DL, ShVT);
+
+ SDValue ShAmt;
+ if (isPowerOf2_32(EltSizeInBits)) {
+ SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, ShVT);
+ ShAmt = DAG.getNode(ISD::AND, DL, ShVT, Z, Mask);
+ } else {
+ ShAmt = DAG.getNode(ISD::UREM, DL, ShVT, Z, BitWidthC);
+ }
+
+ SDValue InvShAmt = DAG.getNode(ISD::SUB, DL, ShVT, BitWidthC, ShAmt);
+ SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, X, IsFSHL ? ShAmt : InvShAmt);
+ SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Y, IsFSHL ? InvShAmt : ShAmt);
+ SDValue Or = DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
+
+ // If (Z % BW == 0), then the opposite direction shift is shift-by-bitwidth,
+ // and that is undefined. We must compare and select to avoid UB.
+ EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), ShVT);
+
+ // For fshl, 0-shift returns the 1st arg (X).
+ // For fshr, 0-shift returns the 2nd arg (Y).
+ SDValue IsZeroShift = DAG.getSetCC(DL, CCVT, ShAmt, Zero, ISD::SETEQ);
+ Result = DAG.getSelect(DL, VT, IsZeroShift, IsFSHL ? X : Y, Or);
+ return true;
+}
+
bool TargetLowering::expandFP_TO_SINT(SDNode *Node, SDValue &Result,
SelectionDAG &DAG) const {
SDValue Src = Node->getOperand(0);
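
The expansion above follows the DAG-builder sequence: reduce the amount modulo the width (an AND when the width is a power of two, UREM otherwise), build the two shifts and OR them, then select the unshifted operand when the amount is zero, because the complementary shift would be by the full bit width. A scalar sketch of the same sequence, for illustration only:

#include <cstdint>

// Illustrative scalar equivalent of the fshl expansion (not part of the
// patch). The width is a power of two here, so the modulo is an AND.
uint32_t expand_fshl32(uint32_t X, uint32_t Y, uint32_t Z) {
  uint32_t ShAmt = Z & 31;
  uint32_t InvShAmt = 32 - ShAmt;
  uint32_t ShX = X << ShAmt;
  // Widen to 64 bits so a shift by 32 is defined here in C++; the DAG
  // expansion instead relies on the select below to discard that case.
  uint32_t ShY = (uint32_t)((uint64_t)Y >> InvShAmt);
  uint32_t Or = ShX | ShY;
  // If Z % 32 == 0 the opposite-direction shift would be by the full width,
  // so select the first operand (for fshr it would be the second).
  return ShAmt == 0 ? X : Or;
}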
Modified: llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp?rev=348353&r1=348352&r2=348353&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp (original)
+++ llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp Wed Dec 5 03:12:12 2018
@@ -610,6 +610,8 @@ void TargetLoweringBase::initActions() {
setOperationAction(ISD::UMIN, VT, Expand);
setOperationAction(ISD::UMAX, VT, Expand);
setOperationAction(ISD::ABS, VT, Expand);
+ setOperationAction(ISD::FSHL, VT, Expand);
+ setOperationAction(ISD::FSHR, VT, Expand);
setOperationAction(ISD::SADDSAT, VT, Expand);
setOperationAction(ISD::UADDSAT, VT, Expand);
setOperationAction(ISD::SSUBSAT, VT, Expand);
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=348353&r1=348352&r2=348353&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Wed Dec 5 03:12:12 2018
@@ -195,6 +195,14 @@ X86TargetLowering::X86TargetLowering(con
setOperationAction(ISD::ABS , MVT::i64 , Custom);
}
+ // Funnel shifts.
+ for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
+ setOperationAction(ShiftOp , MVT::i16 , Custom);
+ setOperationAction(ShiftOp , MVT::i32 , Custom);
+ if (Subtarget.is64Bit())
+ setOperationAction(ShiftOp , MVT::i64 , Custom);
+ }
+
// Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
// operation.
setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
@@ -16972,6 +16980,7 @@ X86TargetLowering::LowerGlobalTLSAddress
/// Lower SRA_PARTS and friends, which return two i32 values
/// and take a 2 x i32 value to shift plus a shift amount.
+/// TODO: Can this be moved to general expansion code?
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
assert(Op.getNumOperands() == 3 && "Not a double-shift!");
MVT VT = Op.getSimpleValueType();
@@ -16981,8 +16990,8 @@ static SDValue LowerShiftParts(SDValue O
SDValue ShOpLo = Op.getOperand(0);
SDValue ShOpHi = Op.getOperand(1);
SDValue ShAmt = Op.getOperand(2);
- // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
- // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
+ // ISD::FSHL and ISD::FSHR have defined overflow behavior but ISD::SHL and
+ // ISD::SRA/L nodes haven't. Insert an AND to be safe, it's optimized away
// during isel.
SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
DAG.getConstant(VTBits - 1, dl, MVT::i8));
@@ -16992,10 +17001,10 @@ static SDValue LowerShiftParts(SDValue O
SDValue Tmp2, Tmp3;
if (Op.getOpcode() == ISD::SHL_PARTS) {
- Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
+ Tmp2 = DAG.getNode(ISD::FSHL, dl, VT, ShOpHi, ShOpLo, ShAmt);
Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
} else {
- Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
+ Tmp2 = DAG.getNode(ISD::FSHR, dl, VT, ShOpHi, ShOpLo, ShAmt);
Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
}
@@ -17019,6 +17028,37 @@ static SDValue LowerShiftParts(SDValue O
return DAG.getMergeValues({ Lo, Hi }, dl);
}
+static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
+ "Unexpected funnel shift opcode!");
+ assert((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
+ "Unexpected funnel shift type!");
+
+ SDLoc DL(Op);
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ SDValue Amt = Op.getOperand(2);
+
+ // Expand slow SHLD/SHRD cases.
+ // TODO - can we be more selective here: OptSize/RMW etc.?
+ if (Subtarget.isSHLDSlow())
+ return SDValue();
+
+ bool IsFSHR = Op.getOpcode() == ISD::FSHR;
+ if (IsFSHR)
+ std::swap(Op0, Op1);
+
+ // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
+ if (VT == MVT::i16)
+ Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
+ DAG.getConstant(15, DL, Amt.getValueType()));
+
+ unsigned SHDOp = (IsFSHR ? X86ISD::SHRD : X86ISD::SHLD);
+ return DAG.getNode(SHDOp, DL, VT, Op0, Op1, Amt);
+}
+
// Try to use a packed vector operation to handle i64 on 32-bit targets when
// AVX512DQ is enabled.
static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
@@ -26115,6 +26155,8 @@ SDValue X86TargetLowering::LowerOperatio
case ISD::SHL_PARTS:
case ISD::SRA_PARTS:
case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
+ case ISD::FSHL:
+ case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
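
On the fast-SHLD path, LowerFunnelShift maps the node directly onto SHLD/SHRD: the hardware already performs the 'double' shift of the operand pair, and the i32/i64 forms mask the count implicitly, so only i16 needs the explicit AND with 15. A scalar illustration of the mapping, not code from the patch:

#include <cstdint>

// Illustrative model of the SHLD mapping: shld computes
// (Hi << C) | (Lo >> (32 - C)) for C in 1..31 and leaves Hi unchanged for
// C == 0, i.e. fshl with the amount already reduced modulo 32. i32/i64
// counts are masked implicitly by the instruction; i16 gets the explicit
// AND with 15 seen above.
uint32_t shld32_model(uint32_t Hi, uint32_t Lo, uint32_t C) {
  C &= 31;
  return C ? (Hi << C) | (Lo >> (32 - C)) : Hi;
}

uint16_t fshl16_via_shld(uint16_t X, uint16_t Y, uint16_t Z) {
  unsigned C = Z & 15; // the explicit i16 mask
  return C ? (uint16_t)((X << C) | (Y >> (16 - C))) : X;
}

For fshr the operands are swapped and SHRD is used, so the second input is the one shifted right and the first provides the bits funnelled in from the left.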
Modified: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp?rev=348353&r1=348352&r2=348353&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp Wed Dec 5 03:12:12 2018
@@ -2022,7 +2022,7 @@ int X86TTIImpl::getIntrinsicInstrCost(In
static const CostTblEntry X64CostTbl[] = { // 64-bit targets
{ ISD::ROTL, MVT::i64, 1 },
{ ISD::ROTR, MVT::i64, 1 },
- { X86ISD::SHLD, MVT::i64, 4 }
+ { ISD::FSHL, MVT::i64, 4 }
};
static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
{ ISD::ROTL, MVT::i32, 1 },
@@ -2031,9 +2031,9 @@ int X86TTIImpl::getIntrinsicInstrCost(In
{ ISD::ROTR, MVT::i32, 1 },
{ ISD::ROTR, MVT::i16, 1 },
{ ISD::ROTR, MVT::i8, 1 },
- { X86ISD::SHLD, MVT::i32, 4 },
- { X86ISD::SHLD, MVT::i16, 4 },
- { X86ISD::SHLD, MVT::i8, 4 }
+ { ISD::FSHL, MVT::i32, 4 },
+ { ISD::FSHL, MVT::i16, 4 },
+ { ISD::FSHL, MVT::i8, 4 }
};
unsigned ISD = ISD::DELETED_NODE;
@@ -2041,13 +2041,13 @@ int X86TTIImpl::getIntrinsicInstrCost(In
default:
break;
case Intrinsic::fshl:
- ISD = X86ISD::SHLD;
+ ISD = ISD::FSHL;
if (Args[0] == Args[1])
ISD = ISD::ROTL;
break;
case Intrinsic::fshr:
- // SHRD has same costs so don't duplicate.
- ISD = X86ISD::SHLD;
+ // FSHR has same costs so don't duplicate.
+ ISD = ISD::FSHL;
if (Args[0] == Args[1])
ISD = ISD::ROTR;
break;
Modified: llvm/trunk/test/CodeGen/X86/fshl.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/fshl.ll?rev=348353&r1=348352&r2=348353&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/fshl.ll (original)
+++ llvm/trunk/test/CodeGen/X86/fshl.ll Wed Dec 5 03:12:12 2018
@@ -58,20 +58,11 @@ define i8 @var_shift_i8(i8 %x, i8 %y, i8
define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
; X86-FAST-LABEL: var_shift_i16:
; X86-FAST: # %bb.0:
-; X86-FAST-NEXT: pushl %esi
-; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-FAST-NEXT: movzwl {{[0-9]+}}(%esp), %esi
-; X86-FAST-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
-; X86-FAST-NEXT: andl $15, %ecx
-; X86-FAST-NEXT: movl %eax, %edx
-; X86-FAST-NEXT: shldw %cl, %si, %dx
-; X86-FAST-NEXT: testw %cx, %cx
-; X86-FAST-NEXT: je .LBB1_2
-; X86-FAST-NEXT: # %bb.1:
-; X86-FAST-NEXT: movl %edx, %eax
-; X86-FAST-NEXT: .LBB1_2:
-; X86-FAST-NEXT: # kill: def $ax killed $ax killed $eax
-; X86-FAST-NEXT: popl %esi
+; X86-FAST-NEXT: movzwl {{[0-9]+}}(%esp), %edx
+; X86-FAST-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-FAST-NEXT: andb $15, %cl
+; X86-FAST-NEXT: shldw %cl, %dx, %ax
; X86-FAST-NEXT: retl
;
; X86-SLOW-LABEL: var_shift_i16:
@@ -79,17 +70,16 @@ define i16 @var_shift_i16(i16 %x, i16 %y
; X86-SLOW-NEXT: pushl %edi
; X86-SLOW-NEXT: pushl %esi
; X86-SLOW-NEXT: movzwl {{[0-9]+}}(%esp), %esi
-; X86-SLOW-NEXT: movzwl {{[0-9]+}}(%esp), %edx
-; X86-SLOW-NEXT: andl $15, %edx
+; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %dl
+; X86-SLOW-NEXT: andb $15, %dl
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLOW-NEXT: movl %eax, %edi
; X86-SLOW-NEXT: movl %edx, %ecx
; X86-SLOW-NEXT: shll %cl, %edi
-; X86-SLOW-NEXT: movl $16, %ecx
-; X86-SLOW-NEXT: subl %edx, %ecx
-; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SLOW-NEXT: movb $16, %cl
+; X86-SLOW-NEXT: subb %dl, %cl
; X86-SLOW-NEXT: shrl %cl, %esi
-; X86-SLOW-NEXT: testw %dx, %dx
+; X86-SLOW-NEXT: testb %dl, %dl
; X86-SLOW-NEXT: je .LBB1_2
; X86-SLOW-NEXT: # %bb.1:
; X86-SLOW-NEXT: orl %esi, %edi
@@ -103,27 +93,25 @@ define i16 @var_shift_i16(i16 %x, i16 %y
; X64-FAST-LABEL: var_shift_i16:
; X64-FAST: # %bb.0:
; X64-FAST-NEXT: movl %edx, %ecx
-; X64-FAST-NEXT: andl $15, %ecx
; X64-FAST-NEXT: movl %edi, %eax
+; X64-FAST-NEXT: andb $15, %cl
+; X64-FAST-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-FAST-NEXT: shldw %cl, %si, %ax
-; X64-FAST-NEXT: testw %cx, %cx
-; X64-FAST-NEXT: cmovel %edi, %eax
; X64-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; X64-FAST-NEXT: retq
;
; X64-SLOW-LABEL: var_shift_i16:
; X64-SLOW: # %bb.0:
; X64-SLOW-NEXT: movzwl %si, %eax
-; X64-SLOW-NEXT: andl $15, %edx
+; X64-SLOW-NEXT: andb $15, %dl
; X64-SLOW-NEXT: movl %edi, %esi
; X64-SLOW-NEXT: movl %edx, %ecx
; X64-SLOW-NEXT: shll %cl, %esi
-; X64-SLOW-NEXT: movl $16, %ecx
-; X64-SLOW-NEXT: subl %edx, %ecx
-; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-SLOW-NEXT: movb $16, %cl
+; X64-SLOW-NEXT: subb %dl, %cl
; X64-SLOW-NEXT: shrl %cl, %eax
; X64-SLOW-NEXT: orl %esi, %eax
-; X64-SLOW-NEXT: testw %dx, %dx
+; X64-SLOW-NEXT: testb %dl, %dl
; X64-SLOW-NEXT: cmovel %edi, %eax
; X64-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; X64-SLOW-NEXT: retq
@@ -134,19 +122,10 @@ define i16 @var_shift_i16(i16 %x, i16 %y
define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind {
; X86-FAST-LABEL: var_shift_i32:
; X86-FAST: # %bb.0:
-; X86-FAST-NEXT: pushl %esi
-; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-FAST-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-FAST-NEXT: andl $31, %ecx
-; X86-FAST-NEXT: movl %eax, %edx
-; X86-FAST-NEXT: shldl %cl, %esi, %edx
-; X86-FAST-NEXT: testl %ecx, %ecx
-; X86-FAST-NEXT: je .LBB2_2
-; X86-FAST-NEXT: # %bb.1:
-; X86-FAST-NEXT: movl %edx, %eax
-; X86-FAST-NEXT: .LBB2_2:
-; X86-FAST-NEXT: popl %esi
+; X86-FAST-NEXT: shldl %cl, %edx, %eax
; X86-FAST-NEXT: retl
;
; X86-SLOW-LABEL: var_shift_i32:
@@ -154,17 +133,16 @@ define i32 @var_shift_i32(i32 %x, i32 %y
; X86-SLOW-NEXT: pushl %edi
; X86-SLOW-NEXT: pushl %esi
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %dl
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SLOW-NEXT: andl $31, %edx
; X86-SLOW-NEXT: movl %eax, %edi
; X86-SLOW-NEXT: movl %edx, %ecx
; X86-SLOW-NEXT: shll %cl, %edi
+; X86-SLOW-NEXT: andb $31, %dl
; X86-SLOW-NEXT: movl %edx, %ecx
-; X86-SLOW-NEXT: negl %ecx
-; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SLOW-NEXT: negb %cl
; X86-SLOW-NEXT: shrl %cl, %esi
-; X86-SLOW-NEXT: testl %edx, %edx
+; X86-SLOW-NEXT: testb %dl, %dl
; X86-SLOW-NEXT: je .LBB2_2
; X86-SLOW-NEXT: # %bb.1:
; X86-SLOW-NEXT: orl %esi, %edi
@@ -177,26 +155,23 @@ define i32 @var_shift_i32(i32 %x, i32 %y
; X64-FAST-LABEL: var_shift_i32:
; X64-FAST: # %bb.0:
; X64-FAST-NEXT: movl %edx, %ecx
-; X64-FAST-NEXT: andl $31, %ecx
; X64-FAST-NEXT: movl %edi, %eax
+; X64-FAST-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-FAST-NEXT: shldl %cl, %esi, %eax
-; X64-FAST-NEXT: testl %ecx, %ecx
-; X64-FAST-NEXT: cmovel %edi, %eax
; X64-FAST-NEXT: retq
;
; X64-SLOW-LABEL: var_shift_i32:
; X64-SLOW: # %bb.0:
; X64-SLOW-NEXT: movl %esi, %eax
-; X64-SLOW-NEXT: andl $31, %edx
; X64-SLOW-NEXT: movl %edi, %esi
; X64-SLOW-NEXT: movl %edx, %ecx
; X64-SLOW-NEXT: shll %cl, %esi
+; X64-SLOW-NEXT: andb $31, %dl
; X64-SLOW-NEXT: movl %edx, %ecx
-; X64-SLOW-NEXT: negl %ecx
-; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-SLOW-NEXT: negb %cl
; X64-SLOW-NEXT: shrl %cl, %eax
; X64-SLOW-NEXT: orl %esi, %eax
-; X64-SLOW-NEXT: testl %edx, %edx
+; X64-SLOW-NEXT: testb %dl, %dl
; X64-SLOW-NEXT: cmovel %edi, %eax
; X64-SLOW-NEXT: retq
%tmp = tail call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
@@ -204,85 +179,166 @@ define i32 @var_shift_i32(i32 %x, i32 %y
}
define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
-; X86-LABEL: var_shift_i64:
-; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: pushl %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: andl $63, %ebx
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: movl %ebx, %ecx
-; X86-NEXT: shll %cl, %edi
-; X86-NEXT: shldl %cl, %eax, %ebp
-; X86-NEXT: testb $32, %bl
-; X86-NEXT: je .LBB3_2
-; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %edi, %ebp
-; X86-NEXT: xorl %edi, %edi
-; X86-NEXT: .LBB3_2:
-; X86-NEXT: movb $64, %cl
-; X86-NEXT: subb %bl, %cl
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: shrl %cl, %esi
-; X86-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
-; X86-NEXT: testb $32, %cl
-; X86-NEXT: jne .LBB3_3
-; X86-NEXT: # %bb.4:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT: testl %ebx, %ebx
-; X86-NEXT: jne .LBB3_6
-; X86-NEXT: jmp .LBB3_7
-; X86-NEXT: .LBB3_3:
-; X86-NEXT: movl %esi, %ecx
-; X86-NEXT: xorl %esi, %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: testl %ebx, %ebx
-; X86-NEXT: je .LBB3_7
-; X86-NEXT: .LBB3_6:
-; X86-NEXT: orl %esi, %ebp
-; X86-NEXT: orl %ecx, %edi
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: movl %ebp, %edx
-; X86-NEXT: .LBB3_7:
-; X86-NEXT: addl $4, %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
-; X86-NEXT: retl
+; X86-FAST-LABEL: var_shift_i64:
+; X86-FAST: # %bb.0:
+; X86-FAST-NEXT: pushl %ebp
+; X86-FAST-NEXT: pushl %ebx
+; X86-FAST-NEXT: pushl %edi
+; X86-FAST-NEXT: pushl %esi
+; X86-FAST-NEXT: pushl %eax
+; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-FAST-NEXT: andl $63, %ebx
+; X86-FAST-NEXT: movl %eax, %edi
+; X86-FAST-NEXT: movl %ebx, %ecx
+; X86-FAST-NEXT: shll %cl, %edi
+; X86-FAST-NEXT: shldl %cl, %eax, %ebp
+; X86-FAST-NEXT: testb $32, %bl
+; X86-FAST-NEXT: je .LBB3_2
+; X86-FAST-NEXT: # %bb.1:
+; X86-FAST-NEXT: movl %edi, %ebp
+; X86-FAST-NEXT: xorl %edi, %edi
+; X86-FAST-NEXT: .LBB3_2:
+; X86-FAST-NEXT: movb $64, %cl
+; X86-FAST-NEXT: subb %bl, %cl
+; X86-FAST-NEXT: movl %edx, %esi
+; X86-FAST-NEXT: shrl %cl, %esi
+; X86-FAST-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
+; X86-FAST-NEXT: testb $32, %cl
+; X86-FAST-NEXT: jne .LBB3_3
+; X86-FAST-NEXT: # %bb.4:
+; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-FAST-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-FAST-NEXT: testl %ebx, %ebx
+; X86-FAST-NEXT: jne .LBB3_6
+; X86-FAST-NEXT: jmp .LBB3_7
+; X86-FAST-NEXT: .LBB3_3:
+; X86-FAST-NEXT: movl %esi, %ecx
+; X86-FAST-NEXT: xorl %esi, %esi
+; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-FAST-NEXT: testl %ebx, %ebx
+; X86-FAST-NEXT: je .LBB3_7
+; X86-FAST-NEXT: .LBB3_6:
+; X86-FAST-NEXT: orl %esi, %ebp
+; X86-FAST-NEXT: orl %ecx, %edi
+; X86-FAST-NEXT: movl %edi, %eax
+; X86-FAST-NEXT: movl %ebp, %edx
+; X86-FAST-NEXT: .LBB3_7:
+; X86-FAST-NEXT: addl $4, %esp
+; X86-FAST-NEXT: popl %esi
+; X86-FAST-NEXT: popl %edi
+; X86-FAST-NEXT: popl %ebx
+; X86-FAST-NEXT: popl %ebp
+; X86-FAST-NEXT: retl
+;
+; X86-SLOW-LABEL: var_shift_i64:
+; X86-SLOW: # %bb.0:
+; X86-SLOW-NEXT: pushl %ebp
+; X86-SLOW-NEXT: pushl %ebx
+; X86-SLOW-NEXT: pushl %edi
+; X86-SLOW-NEXT: pushl %esi
+; X86-SLOW-NEXT: subl $8, %esp
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-SLOW-NEXT: andl $63, %ebx
+; X86-SLOW-NEXT: movb $64, %dh
+; X86-SLOW-NEXT: subb %bl, %dh
+; X86-SLOW-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-SLOW-NEXT: movb %dh, %cl
+; X86-SLOW-NEXT: shrl %cl, %eax
+; X86-SLOW-NEXT: movb %dh, %dl
+; X86-SLOW-NEXT: andb $31, %dl
+; X86-SLOW-NEXT: movl %edx, %ecx
+; X86-SLOW-NEXT: negb %cl
+; X86-SLOW-NEXT: movl %esi, %ebp
+; X86-SLOW-NEXT: shll %cl, %ebp
+; X86-SLOW-NEXT: testb %dl, %dl
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SLOW-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT: je .LBB3_2
+; X86-SLOW-NEXT: # %bb.1:
+; X86-SLOW-NEXT: orl %eax, %ebp
+; X86-SLOW-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; X86-SLOW-NEXT: .LBB3_2:
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-SLOW-NEXT: movl %ebp, %eax
+; X86-SLOW-NEXT: movl %ebx, %ecx
+; X86-SLOW-NEXT: shll %cl, %eax
+; X86-SLOW-NEXT: movb %bl, %ch
+; X86-SLOW-NEXT: andb $31, %ch
+; X86-SLOW-NEXT: movb %ch, %cl
+; X86-SLOW-NEXT: negb %cl
+; X86-SLOW-NEXT: shrl %cl, %edi
+; X86-SLOW-NEXT: testb %ch, %ch
+; X86-SLOW-NEXT: je .LBB3_4
+; X86-SLOW-NEXT: # %bb.3:
+; X86-SLOW-NEXT: orl %edi, %eax
+; X86-SLOW-NEXT: movl %eax, %ebp
+; X86-SLOW-NEXT: .LBB3_4:
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SLOW-NEXT: movl %eax, %edi
+; X86-SLOW-NEXT: movl %ebx, %ecx
+; X86-SLOW-NEXT: shll %cl, %edi
+; X86-SLOW-NEXT: testb $32, %bl
+; X86-SLOW-NEXT: je .LBB3_6
+; X86-SLOW-NEXT: # %bb.5:
+; X86-SLOW-NEXT: movl %edi, %ebp
+; X86-SLOW-NEXT: xorl %edi, %edi
+; X86-SLOW-NEXT: .LBB3_6:
+; X86-SLOW-NEXT: movb %dh, %cl
+; X86-SLOW-NEXT: shrl %cl, %esi
+; X86-SLOW-NEXT: testb $32, %dh
+; X86-SLOW-NEXT: jne .LBB3_7
+; X86-SLOW-NEXT: # %bb.8:
+; X86-SLOW-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-SLOW-NEXT: testl %ebx, %ebx
+; X86-SLOW-NEXT: jne .LBB3_10
+; X86-SLOW-NEXT: jmp .LBB3_11
+; X86-SLOW-NEXT: .LBB3_7:
+; X86-SLOW-NEXT: movl %esi, %ecx
+; X86-SLOW-NEXT: xorl %esi, %esi
+; X86-SLOW-NEXT: testl %ebx, %ebx
+; X86-SLOW-NEXT: je .LBB3_11
+; X86-SLOW-NEXT: .LBB3_10:
+; X86-SLOW-NEXT: orl %esi, %ebp
+; X86-SLOW-NEXT: orl %ecx, %edi
+; X86-SLOW-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT: movl %edi, %eax
+; X86-SLOW-NEXT: .LBB3_11:
+; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-SLOW-NEXT: addl $8, %esp
+; X86-SLOW-NEXT: popl %esi
+; X86-SLOW-NEXT: popl %edi
+; X86-SLOW-NEXT: popl %ebx
+; X86-SLOW-NEXT: popl %ebp
+; X86-SLOW-NEXT: retl
;
; X64-FAST-LABEL: var_shift_i64:
; X64-FAST: # %bb.0:
; X64-FAST-NEXT: movq %rdx, %rcx
-; X64-FAST-NEXT: andl $63, %ecx
; X64-FAST-NEXT: movq %rdi, %rax
+; X64-FAST-NEXT: # kill: def $cl killed $cl killed $rcx
; X64-FAST-NEXT: shldq %cl, %rsi, %rax
-; X64-FAST-NEXT: testq %rcx, %rcx
-; X64-FAST-NEXT: cmoveq %rdi, %rax
; X64-FAST-NEXT: retq
;
; X64-SLOW-LABEL: var_shift_i64:
; X64-SLOW: # %bb.0:
; X64-SLOW-NEXT: movq %rsi, %rax
-; X64-SLOW-NEXT: andl $63, %edx
; X64-SLOW-NEXT: movq %rdi, %rsi
; X64-SLOW-NEXT: movl %edx, %ecx
; X64-SLOW-NEXT: shlq %cl, %rsi
+; X64-SLOW-NEXT: andb $63, %dl
; X64-SLOW-NEXT: movl %edx, %ecx
-; X64-SLOW-NEXT: negl %ecx
-; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-SLOW-NEXT: negb %cl
; X64-SLOW-NEXT: shrq %cl, %rax
; X64-SLOW-NEXT: orq %rsi, %rax
-; X64-SLOW-NEXT: testq %rdx, %rdx
+; X64-SLOW-NEXT: testb %dl, %dl
; X64-SLOW-NEXT: cmoveq %rdi, %rax
; X64-SLOW-NEXT: retq
%tmp = tail call i64 @llvm.fshl.i64(i64 %x, i64 %y, i64 %z)
Modified: llvm/trunk/test/CodeGen/X86/fshr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/fshr.ll?rev=348353&r1=348352&r2=348353&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/fshr.ll (original)
+++ llvm/trunk/test/CodeGen/X86/fshr.ll Wed Dec 5 03:12:12 2018
@@ -58,20 +58,11 @@ define i8 @var_shift_i8(i8 %x, i8 %y, i8
define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
; X86-FAST-LABEL: var_shift_i16:
; X86-FAST: # %bb.0:
-; X86-FAST-NEXT: pushl %esi
-; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-FAST-NEXT: movzwl {{[0-9]+}}(%esp), %esi
-; X86-FAST-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
-; X86-FAST-NEXT: andl $15, %ecx
-; X86-FAST-NEXT: movl %eax, %edx
-; X86-FAST-NEXT: shrdw %cl, %si, %dx
-; X86-FAST-NEXT: testw %cx, %cx
-; X86-FAST-NEXT: je .LBB1_2
-; X86-FAST-NEXT: # %bb.1:
-; X86-FAST-NEXT: movl %edx, %eax
-; X86-FAST-NEXT: .LBB1_2:
-; X86-FAST-NEXT: # kill: def $ax killed $ax killed $eax
-; X86-FAST-NEXT: popl %esi
+; X86-FAST-NEXT: movzwl {{[0-9]+}}(%esp), %edx
+; X86-FAST-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-FAST-NEXT: andb $15, %cl
+; X86-FAST-NEXT: shrdw %cl, %dx, %ax
; X86-FAST-NEXT: retl
;
; X86-SLOW-LABEL: var_shift_i16:
@@ -79,17 +70,16 @@ define i16 @var_shift_i16(i16 %x, i16 %y
; X86-SLOW-NEXT: pushl %edi
; X86-SLOW-NEXT: pushl %esi
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-SLOW-NEXT: movzwl {{[0-9]+}}(%esp), %edx
-; X86-SLOW-NEXT: andl $15, %edx
+; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %dl
+; X86-SLOW-NEXT: andb $15, %dl
; X86-SLOW-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-SLOW-NEXT: movl %eax, %edi
; X86-SLOW-NEXT: movl %edx, %ecx
; X86-SLOW-NEXT: shrl %cl, %edi
-; X86-SLOW-NEXT: movl $16, %ecx
-; X86-SLOW-NEXT: subl %edx, %ecx
-; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SLOW-NEXT: movb $16, %cl
+; X86-SLOW-NEXT: subb %dl, %cl
; X86-SLOW-NEXT: shll %cl, %esi
-; X86-SLOW-NEXT: testw %dx, %dx
+; X86-SLOW-NEXT: testb %dl, %dl
; X86-SLOW-NEXT: je .LBB1_2
; X86-SLOW-NEXT: # %bb.1:
; X86-SLOW-NEXT: orl %edi, %esi
@@ -103,26 +93,24 @@ define i16 @var_shift_i16(i16 %x, i16 %y
; X64-FAST-LABEL: var_shift_i16:
; X64-FAST: # %bb.0:
; X64-FAST-NEXT: movl %edx, %ecx
-; X64-FAST-NEXT: andl $15, %ecx
; X64-FAST-NEXT: movl %esi, %eax
+; X64-FAST-NEXT: andb $15, %cl
+; X64-FAST-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-FAST-NEXT: shrdw %cl, %di, %ax
-; X64-FAST-NEXT: testw %cx, %cx
-; X64-FAST-NEXT: cmovel %esi, %eax
; X64-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; X64-FAST-NEXT: retq
;
; X64-SLOW-LABEL: var_shift_i16:
; X64-SLOW: # %bb.0:
; X64-SLOW-NEXT: movzwl %si, %eax
-; X64-SLOW-NEXT: andl $15, %edx
+; X64-SLOW-NEXT: andb $15, %dl
; X64-SLOW-NEXT: movl %edx, %ecx
; X64-SLOW-NEXT: shrl %cl, %eax
-; X64-SLOW-NEXT: movl $16, %ecx
-; X64-SLOW-NEXT: subl %edx, %ecx
-; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-SLOW-NEXT: movb $16, %cl
+; X64-SLOW-NEXT: subb %dl, %cl
; X64-SLOW-NEXT: shll %cl, %edi
; X64-SLOW-NEXT: orl %edi, %eax
-; X64-SLOW-NEXT: testw %dx, %dx
+; X64-SLOW-NEXT: testb %dl, %dl
; X64-SLOW-NEXT: cmovel %esi, %eax
; X64-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; X64-SLOW-NEXT: retq
@@ -133,19 +121,10 @@ define i16 @var_shift_i16(i16 %x, i16 %y
define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind {
; X86-FAST-LABEL: var_shift_i32:
; X86-FAST: # %bb.0:
-; X86-FAST-NEXT: pushl %esi
-; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-FAST-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-FAST-NEXT: andl $31, %ecx
-; X86-FAST-NEXT: movl %eax, %edx
-; X86-FAST-NEXT: shrdl %cl, %esi, %edx
-; X86-FAST-NEXT: testl %ecx, %ecx
-; X86-FAST-NEXT: je .LBB2_2
-; X86-FAST-NEXT: # %bb.1:
-; X86-FAST-NEXT: movl %edx, %eax
-; X86-FAST-NEXT: .LBB2_2:
-; X86-FAST-NEXT: popl %esi
+; X86-FAST-NEXT: shrdl %cl, %edx, %eax
; X86-FAST-NEXT: retl
;
; X86-SLOW-LABEL: var_shift_i32:
@@ -153,17 +132,16 @@ define i32 @var_shift_i32(i32 %x, i32 %y
; X86-SLOW-NEXT: pushl %edi
; X86-SLOW-NEXT: pushl %esi
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %dl
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SLOW-NEXT: andl $31, %edx
; X86-SLOW-NEXT: movl %eax, %edi
; X86-SLOW-NEXT: movl %edx, %ecx
; X86-SLOW-NEXT: shrl %cl, %edi
+; X86-SLOW-NEXT: andb $31, %dl
; X86-SLOW-NEXT: movl %edx, %ecx
-; X86-SLOW-NEXT: negl %ecx
-; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SLOW-NEXT: negb %cl
; X86-SLOW-NEXT: shll %cl, %esi
-; X86-SLOW-NEXT: testl %edx, %edx
+; X86-SLOW-NEXT: testb %dl, %dl
; X86-SLOW-NEXT: je .LBB2_2
; X86-SLOW-NEXT: # %bb.1:
; X86-SLOW-NEXT: orl %edi, %esi
@@ -176,26 +154,23 @@ define i32 @var_shift_i32(i32 %x, i32 %y
; X64-FAST-LABEL: var_shift_i32:
; X64-FAST: # %bb.0:
; X64-FAST-NEXT: movl %edx, %ecx
-; X64-FAST-NEXT: andl $31, %ecx
; X64-FAST-NEXT: movl %esi, %eax
+; X64-FAST-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-FAST-NEXT: shrdl %cl, %edi, %eax
-; X64-FAST-NEXT: testl %ecx, %ecx
-; X64-FAST-NEXT: cmovel %esi, %eax
; X64-FAST-NEXT: retq
;
; X64-SLOW-LABEL: var_shift_i32:
; X64-SLOW: # %bb.0:
; X64-SLOW-NEXT: movl %edi, %eax
-; X64-SLOW-NEXT: andl $31, %edx
; X64-SLOW-NEXT: movl %esi, %edi
; X64-SLOW-NEXT: movl %edx, %ecx
; X64-SLOW-NEXT: shrl %cl, %edi
+; X64-SLOW-NEXT: andb $31, %dl
; X64-SLOW-NEXT: movl %edx, %ecx
-; X64-SLOW-NEXT: negl %ecx
-; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-SLOW-NEXT: negb %cl
; X64-SLOW-NEXT: shll %cl, %eax
; X64-SLOW-NEXT: orl %edi, %eax
-; X64-SLOW-NEXT: testl %edx, %edx
+; X64-SLOW-NEXT: testb %dl, %dl
; X64-SLOW-NEXT: cmovel %esi, %eax
; X64-SLOW-NEXT: retq
%tmp = tail call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
@@ -203,81 +178,164 @@ define i32 @var_shift_i32(i32 %x, i32 %y
}
define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
-; X86-LABEL: var_shift_i64:
-; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: pushl %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: andl $63, %ebx
-; X86-NEXT: movb $64, %cl
-; X86-NEXT: subb %bl, %cl
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: shll %cl, %edi
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: testb $32, %cl
-; X86-NEXT: je .LBB3_2
-; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %edi, %esi
-; X86-NEXT: xorl %edi, %edi
-; X86-NEXT: .LBB3_2:
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: movl %ebx, %ecx
-; X86-NEXT: shrl %cl, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shrdl %cl, %edx, %eax
-; X86-NEXT: testb $32, %bl
-; X86-NEXT: je .LBB3_4
-; X86-NEXT: # %bb.3:
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: xorl %ebp, %ebp
-; X86-NEXT: .LBB3_4:
-; X86-NEXT: testl %ebx, %ebx
-; X86-NEXT: je .LBB3_6
-; X86-NEXT: # %bb.5:
-; X86-NEXT: orl %ebp, %esi
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: movl %edi, (%esp) # 4-byte Spill
-; X86-NEXT: movl %esi, %edx
-; X86-NEXT: .LBB3_6:
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NEXT: addl $4, %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
-; X86-NEXT: retl
+; X86-FAST-LABEL: var_shift_i64:
+; X86-FAST: # %bb.0:
+; X86-FAST-NEXT: pushl %ebp
+; X86-FAST-NEXT: pushl %ebx
+; X86-FAST-NEXT: pushl %edi
+; X86-FAST-NEXT: pushl %esi
+; X86-FAST-NEXT: pushl %eax
+; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-FAST-NEXT: andl $63, %ebx
+; X86-FAST-NEXT: movb $64, %cl
+; X86-FAST-NEXT: subb %bl, %cl
+; X86-FAST-NEXT: movl %eax, %edi
+; X86-FAST-NEXT: shll %cl, %edi
+; X86-FAST-NEXT: shldl %cl, %eax, %esi
+; X86-FAST-NEXT: testb $32, %cl
+; X86-FAST-NEXT: je .LBB3_2
+; X86-FAST-NEXT: # %bb.1:
+; X86-FAST-NEXT: movl %edi, %esi
+; X86-FAST-NEXT: xorl %edi, %edi
+; X86-FAST-NEXT: .LBB3_2:
+; X86-FAST-NEXT: movl %edx, %ebp
+; X86-FAST-NEXT: movl %ebx, %ecx
+; X86-FAST-NEXT: shrl %cl, %ebp
+; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT: shrdl %cl, %edx, %eax
+; X86-FAST-NEXT: testb $32, %bl
+; X86-FAST-NEXT: je .LBB3_4
+; X86-FAST-NEXT: # %bb.3:
+; X86-FAST-NEXT: movl %ebp, %eax
+; X86-FAST-NEXT: xorl %ebp, %ebp
+; X86-FAST-NEXT: .LBB3_4:
+; X86-FAST-NEXT: testl %ebx, %ebx
+; X86-FAST-NEXT: je .LBB3_6
+; X86-FAST-NEXT: # %bb.5:
+; X86-FAST-NEXT: orl %ebp, %esi
+; X86-FAST-NEXT: orl %eax, %edi
+; X86-FAST-NEXT: movl %edi, (%esp) # 4-byte Spill
+; X86-FAST-NEXT: movl %esi, %edx
+; X86-FAST-NEXT: .LBB3_6:
+; X86-FAST-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-FAST-NEXT: addl $4, %esp
+; X86-FAST-NEXT: popl %esi
+; X86-FAST-NEXT: popl %edi
+; X86-FAST-NEXT: popl %ebx
+; X86-FAST-NEXT: popl %ebp
+; X86-FAST-NEXT: retl
+;
+; X86-SLOW-LABEL: var_shift_i64:
+; X86-SLOW: # %bb.0:
+; X86-SLOW-NEXT: pushl %ebp
+; X86-SLOW-NEXT: pushl %ebx
+; X86-SLOW-NEXT: pushl %edi
+; X86-SLOW-NEXT: pushl %esi
+; X86-SLOW-NEXT: subl $8, %esp
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SLOW-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-SLOW-NEXT: andl $63, %ebx
+; X86-SLOW-NEXT: movb $64, %al
+; X86-SLOW-NEXT: subb %bl, %al
+; X86-SLOW-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-SLOW-NEXT: movl %eax, %ecx
+; X86-SLOW-NEXT: shll %cl, %edx
+; X86-SLOW-NEXT: movb %al, %ch
+; X86-SLOW-NEXT: andb $31, %ch
+; X86-SLOW-NEXT: movb %ch, %cl
+; X86-SLOW-NEXT: negb %cl
+; X86-SLOW-NEXT: movl %esi, %edi
+; X86-SLOW-NEXT: shrl %cl, %edi
+; X86-SLOW-NEXT: testb %ch, %ch
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-SLOW-NEXT: je .LBB3_2
+; X86-SLOW-NEXT: # %bb.1:
+; X86-SLOW-NEXT: orl %edi, %edx
+; X86-SLOW-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-SLOW-NEXT: .LBB3_2:
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SLOW-NEXT: movl %ecx, %edx
+; X86-SLOW-NEXT: movl %ebx, %ecx
+; X86-SLOW-NEXT: shrl %cl, %edx
+; X86-SLOW-NEXT: movb %bl, %ah
+; X86-SLOW-NEXT: andb $31, %ah
+; X86-SLOW-NEXT: movb %ah, %cl
+; X86-SLOW-NEXT: negb %cl
+; X86-SLOW-NEXT: movl %ebp, %edi
+; X86-SLOW-NEXT: shll %cl, %edi
+; X86-SLOW-NEXT: testb %ah, %ah
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-SLOW-NEXT: je .LBB3_4
+; X86-SLOW-NEXT: # %bb.3:
+; X86-SLOW-NEXT: orl %edx, %edi
+; X86-SLOW-NEXT: movl %edi, %ebp
+; X86-SLOW-NEXT: .LBB3_4:
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-SLOW-NEXT: movl %ebx, %ecx
+; X86-SLOW-NEXT: shrl %cl, %edi
+; X86-SLOW-NEXT: testb $32, %bl
+; X86-SLOW-NEXT: je .LBB3_6
+; X86-SLOW-NEXT: # %bb.5:
+; X86-SLOW-NEXT: movl %edi, %ebp
+; X86-SLOW-NEXT: xorl %edi, %edi
+; X86-SLOW-NEXT: .LBB3_6:
+; X86-SLOW-NEXT: movl %eax, %ecx
+; X86-SLOW-NEXT: shll %cl, %esi
+; X86-SLOW-NEXT: testb $32, %al
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SLOW-NEXT: jne .LBB3_7
+; X86-SLOW-NEXT: # %bb.8:
+; X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-SLOW-NEXT: testl %ebx, %ebx
+; X86-SLOW-NEXT: jne .LBB3_10
+; X86-SLOW-NEXT: jmp .LBB3_11
+; X86-SLOW-NEXT: .LBB3_7:
+; X86-SLOW-NEXT: movl %esi, %eax
+; X86-SLOW-NEXT: xorl %esi, %esi
+; X86-SLOW-NEXT: testl %ebx, %ebx
+; X86-SLOW-NEXT: je .LBB3_11
+; X86-SLOW-NEXT: .LBB3_10:
+; X86-SLOW-NEXT: orl %ebp, %esi
+; X86-SLOW-NEXT: orl %edi, %eax
+; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT: movl %eax, %edx
+; X86-SLOW-NEXT: .LBB3_11:
+; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SLOW-NEXT: addl $8, %esp
+; X86-SLOW-NEXT: popl %esi
+; X86-SLOW-NEXT: popl %edi
+; X86-SLOW-NEXT: popl %ebx
+; X86-SLOW-NEXT: popl %ebp
+; X86-SLOW-NEXT: retl
;
; X64-FAST-LABEL: var_shift_i64:
; X64-FAST: # %bb.0:
; X64-FAST-NEXT: movq %rdx, %rcx
-; X64-FAST-NEXT: andl $63, %ecx
; X64-FAST-NEXT: movq %rsi, %rax
+; X64-FAST-NEXT: # kill: def $cl killed $cl killed $rcx
; X64-FAST-NEXT: shrdq %cl, %rdi, %rax
-; X64-FAST-NEXT: testq %rcx, %rcx
-; X64-FAST-NEXT: cmoveq %rsi, %rax
; X64-FAST-NEXT: retq
;
; X64-SLOW-LABEL: var_shift_i64:
; X64-SLOW: # %bb.0:
; X64-SLOW-NEXT: movq %rdi, %rax
-; X64-SLOW-NEXT: andl $63, %edx
; X64-SLOW-NEXT: movq %rsi, %rdi
; X64-SLOW-NEXT: movl %edx, %ecx
; X64-SLOW-NEXT: shrq %cl, %rdi
+; X64-SLOW-NEXT: andb $63, %dl
; X64-SLOW-NEXT: movl %edx, %ecx
-; X64-SLOW-NEXT: negl %ecx
-; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-SLOW-NEXT: negb %cl
; X64-SLOW-NEXT: shlq %cl, %rax
; X64-SLOW-NEXT: orq %rdi, %rax
-; X64-SLOW-NEXT: testq %rdx, %rdx
+; X64-SLOW-NEXT: testb %dl, %dl
; X64-SLOW-NEXT: cmoveq %rsi, %rax
; X64-SLOW-NEXT: retq
%tmp = tail call i64 @llvm.fshr.i64(i64 %x, i64 %y, i64 %z)
@@ -315,7 +373,7 @@ define i16 @const_shift_i16(i16 %x, i16
; X86-FAST: # %bb.0:
; X86-FAST-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-FAST-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-FAST-NEXT: shldw $9, %cx, %ax
+; X86-FAST-NEXT: shrdw $7, %cx, %ax
; X86-FAST-NEXT: retl
;
; X86-SLOW-LABEL: const_shift_i16:
@@ -330,8 +388,8 @@ define i16 @const_shift_i16(i16 %x, i16
;
; X64-FAST-LABEL: const_shift_i16:
; X64-FAST: # %bb.0:
-; X64-FAST-NEXT: movl %edi, %eax
-; X64-FAST-NEXT: shldw $9, %si, %ax
+; X64-FAST-NEXT: movl %esi, %eax
+; X64-FAST-NEXT: shrdw $7, %di, %ax
; X64-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; X64-FAST-NEXT: retq
;
@@ -352,7 +410,7 @@ define i32 @const_shift_i32(i32 %x, i32
; X86-FAST: # %bb.0:
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-FAST-NEXT: shldl $25, %ecx, %eax
+; X86-FAST-NEXT: shrdl $7, %ecx, %eax
; X86-FAST-NEXT: retl
;
; X86-SLOW-LABEL: const_shift_i32:
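
The constant-shift changes are the complementary-amount identity at work: a funnel shift right by a constant C equals a funnel shift left by BW - C, so the i16 case that previously lowered through shldw $9 now selects shrdw $7 directly. A quick illustrative check of that identity (not part of the patch):

#include <cassert>
#include <cstdint>

static uint16_t fshl16(uint16_t X, uint16_t Y, unsigned Z) {
  unsigned A = Z % 16;
  return A ? (uint16_t)((X << A) | (Y >> (16 - A))) : X;
}

static uint16_t fshr16(uint16_t X, uint16_t Y, unsigned Z) {
  unsigned A = Z % 16;
  return A ? (uint16_t)((X << (16 - A)) | (Y >> A)) : Y;
}

int main() {
  uint16_t X = 0x1234, Y = 0xABCD;
  // fshr by 7 == fshl by 16 - 7 == 9: the new shrdw $7 vs. the old shldw $9.
  assert(fshr16(X, Y, 7) == fshl16(X, Y, 9));
  return 0;
}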
Modified: llvm/trunk/test/CodeGen/X86/funnel-shift.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/funnel-shift.ll?rev=348353&r1=348352&r2=348353&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/funnel-shift.ll (original)
+++ llvm/trunk/test/CodeGen/X86/funnel-shift.ll Wed Dec 5 03:12:12 2018
@@ -14,31 +14,23 @@ declare i32 @llvm.fshr.i32(i32, i32, i32
declare i64 @llvm.fshr.i64(i64, i64, i64)
declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
-; General case - all operands can be variables - x86 has shld, but the mask and cmov are not needed?
+; General case - all operands can be variables
define i32 @fshl_i32(i32 %x, i32 %y, i32 %z) nounwind {
; X32-SSE2-LABEL: fshl_i32:
; X32-SSE2: # %bb.0:
-; X32-SSE2-NEXT: pushl %esi
+; X32-SSE2-NEXT: movb {{[0-9]+}}(%esp), %cl
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-SSE2-NEXT: andl $31, %ecx
-; X32-SSE2-NEXT: movl %esi, %eax
+; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT: shldl %cl, %edx, %eax
-; X32-SSE2-NEXT: testl %ecx, %ecx
-; X32-SSE2-NEXT: cmovel %esi, %eax
-; X32-SSE2-NEXT: popl %esi
; X32-SSE2-NEXT: retl
;
; X64-AVX2-LABEL: fshl_i32:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: movl %edx, %ecx
-; X64-AVX2-NEXT: andl $31, %ecx
; X64-AVX2-NEXT: movl %edi, %eax
+; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-AVX2-NEXT: shldl %cl, %esi, %eax
-; X64-AVX2-NEXT: testl %ecx, %ecx
-; X64-AVX2-NEXT: cmovel %edi, %eax
; X64-AVX2-NEXT: retq
%f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
ret i32 %f
@@ -212,31 +204,23 @@ define i8 @fshl_i8_const_fold() nounwind
; Repeat everything for funnel shift right.
-; General case - all operands can be variables - x86 has 'shrd', but the mask and cmov are not needed?
+; General case - all operands can be variables
define i32 @fshr_i32(i32 %x, i32 %y, i32 %z) nounwind {
; X32-SSE2-LABEL: fshr_i32:
; X32-SSE2: # %bb.0:
-; X32-SSE2-NEXT: pushl %esi
+; X32-SSE2-NEXT: movb {{[0-9]+}}(%esp), %cl
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-SSE2-NEXT: andl $31, %ecx
-; X32-SSE2-NEXT: movl %esi, %eax
+; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT: shrdl %cl, %edx, %eax
-; X32-SSE2-NEXT: testl %ecx, %ecx
-; X32-SSE2-NEXT: cmovel %esi, %eax
-; X32-SSE2-NEXT: popl %esi
; X32-SSE2-NEXT: retl
;
; X64-AVX2-LABEL: fshr_i32:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: movl %edx, %ecx
-; X64-AVX2-NEXT: andl $31, %ecx
; X64-AVX2-NEXT: movl %esi, %eax
+; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-AVX2-NEXT: shrdl %cl, %edi, %eax
-; X64-AVX2-NEXT: testl %ecx, %ecx
-; X64-AVX2-NEXT: cmovel %esi, %eax
; X64-AVX2-NEXT: retq
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
ret i32 %f
@@ -341,7 +325,7 @@ define i32 @fshr_i32_const_shift(i32 %x,
; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT: shldl $23, %ecx, %eax
+; X32-SSE2-NEXT: shrdl $9, %ecx, %eax
; X32-SSE2-NEXT: retl
;
; X64-AVX2-LABEL: fshr_i32_const_shift:
@@ -353,14 +337,14 @@ define i32 @fshr_i32_const_shift(i32 %x,
ret i32 %f
}
-; Check modulo math on shift amount. 41-32=9, but right-shift became left, so 32-9=23.
+; Check modulo math on shift amount. 41-32=9, but right-shift may become left, so 32-9=23.
define i32 @fshr_i32_const_overshift(i32 %x, i32 %y) nounwind {
; X32-SSE2-LABEL: fshr_i32_const_overshift:
; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT: shldl $23, %ecx, %eax
+; X32-SSE2-NEXT: shrdl $9, %ecx, %eax
; X32-SSE2-NEXT: retl
;
; X64-AVX2-LABEL: fshr_i32_const_overshift:
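
As a final illustration of the modulo math these overshift tests exercise: an i32 amount of 41 reduces to 41 % 32 = 9, so fshr by 41 is fshr by 9 (the new shrdl $9), which in turn equals fshl by 32 - 9 = 23 (the old shldl $23). An illustrative check, not part of the patch:

#include <cassert>
#include <cstdint>

static uint32_t fshl32(uint32_t X, uint32_t Y, unsigned Z) {
  unsigned A = Z % 32;
  return A ? (X << A) | (Y >> (32 - A)) : X;
}

static uint32_t fshr32(uint32_t X, uint32_t Y, unsigned Z) {
  unsigned A = Z % 32;
  return A ? (X << (32 - A)) | (Y >> A) : Y;
}

int main() {
  uint32_t X = 0x12345678, Y = 0x9ABCDEF0;
  assert(fshr32(X, Y, 41) == fshr32(X, Y, 9)); // 41 % 32 == 9
  assert(fshr32(X, Y, 9) == fshl32(X, Y, 23)); // 32 - 9 == 23
  return 0;
}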