[llvm] 6e561d1 - [Intrinsic] Add fixed point saturating division intrinsics.
Bevin Hansson via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 24 01:56:01 PST 2020
Author: Bevin Hansson
Date: 2020-02-24T10:50:52+01:00
New Revision: 6e561d1c94edc2ecaab7b79f6b3f1a06f515d531
URL: https://github.com/llvm/llvm-project/commit/6e561d1c94edc2ecaab7b79f6b3f1a06f515d531
DIFF: https://github.com/llvm/llvm-project/commit/6e561d1c94edc2ecaab7b79f6b3f1a06f515d531.diff
LOG: [Intrinsic] Add fixed point saturating division intrinsics.
Summary:
This patch adds intrinsics and ISelDAG nodes for signed
and unsigned saturating fixed-point division:
```
llvm.sdiv.fix.sat.*
llvm.udiv.fix.sat.*
```
These intrinsics perform scaled, saturating division
on two integers or vectors of integers. They are
required for the implementation of the Embedded-C
fixed-point arithmetic in Clang.
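For reference, a minimal model of the specified semantics in plain C++
(an illustrative sketch only, not code from this patch; the helper name
is made up, and it leans on the Clang/GCC __int128 extension for
headroom):
```
#include <cassert>
#include <cstdint>

// Reference model of llvm.sdiv.fix.sat.i32. Both operands and the result
// share the same scale, so the exact quotient is (a * 2^scale) / b,
// clamped to the range of the result type. C++'s truncating division is
// one of the rounding directions the intrinsic is permitted to use.
int32_t sdiv_fix_sat_i32(int32_t a, int32_t b, unsigned scale) {
  assert(b != 0 && "division by zero is undefined behavior");
  assert(scale < 32 && "signed operations require scale < bit width");
  // Compute in 128 bits so the upscale and the division cannot overflow.
  __int128 q =
      static_cast<__int128>(a) * (static_cast<__int128>(1) << scale) / b;
  if (q > INT32_MAX) return INT32_MAX;  // clamp to the signed maximum
  if (q < INT32_MIN) return INT32_MIN;  // clamp to the signed minimum
  return static_cast<int32_t>(q);
}
```
With scale 0, sdiv_fix_sat_i32(INT32_MIN, -1, 0) clamps to INT32_MAX,
the 32-bit analogue of the -8 / -1 => 7 i4 example in the LangRef
changes below.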
Reviewers: bjope, leonardchan, craig.topper
Subscribers: hiraditya, jdoerfert, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D71550
Added:
llvm/test/CodeGen/X86/sdiv_fix_sat.ll
llvm/test/CodeGen/X86/udiv_fix_sat.ll
Modified:
llvm/docs/LangRef.rst
llvm/include/llvm/CodeGen/ISDOpcodes.h
llvm/include/llvm/CodeGen/TargetLowering.h
llvm/include/llvm/IR/Intrinsics.td
llvm/include/llvm/Target/TargetSelectionDAG.td
llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
llvm/lib/CodeGen/TargetLoweringBase.cpp
llvm/lib/IR/Verifier.cpp
Removed:
################################################################################
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index bb02c4b02cf4..43dbc70d2fd1 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -14331,6 +14331,136 @@ Examples
%res = call i4 @llvm.udiv.fix.i4(i4 3, i4 4, i32 1) ; %res = 2 (or 1) (1.5 / 2 = 0.75)
+'``llvm.sdiv.fix.sat.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.sdiv.fix.sat``
+on any integer bit width or vectors of integers.
+
+::
+
+ declare i16 @llvm.sdiv.fix.sat.i16(i16 %a, i16 %b, i32 %scale)
+ declare i32 @llvm.sdiv.fix.sat.i32(i32 %a, i32 %b, i32 %scale)
+ declare i64 @llvm.sdiv.fix.sat.i64(i64 %a, i64 %b, i32 %scale)
+ declare <4 x i32> @llvm.sdiv.fix.sat.v4i32(<4 x i32> %a, <4 x i32> %b, i32 %scale)
+
+Overview
+"""""""""
+
+The '``llvm.sdiv.fix.sat``' family of intrinsic functions perform signed
+fixed point saturating division on 2 arguments of the same scale.
+
+Arguments
+""""""""""
+
+The arguments (%a and %b) and the result may be of integer types of any bit
+width, but they must have the same bit width. ``%a`` and ``%b`` are the two
+values that will undergo signed fixed point division. The argument
+``%scale`` represents the scale of both operands, and must be a constant
+integer.
+
+Semantics:
+""""""""""
+
+This operation performs fixed point division on the 2 arguments of a
+specified scale. The result will also be returned in the same scale specified
+in the third argument.
+
+If the result value cannot be precisely represented in the given scale, the
+value is rounded up or down to the closest representable value. The rounding
+direction is unspecified.
+
+The maximum value this operation can clamp to is the largest signed value
+representable by the bit width of the first 2 arguments. The minimum value is the
+smallest signed value representable by this bit width.
+
+It is undefined behavior if the second argument is zero.
+
+
+Examples
+"""""""""
+
+.. code-block:: llvm
+
+ %res = call i4 @llvm.sdiv.fix.sat.i4(i4 6, i4 2, i32 0) ; %res = 3 (6 / 2 = 3)
+ %res = call i4 @llvm.sdiv.fix.sat.i4(i4 6, i4 4, i32 1) ; %res = 3 (3 / 2 = 1.5)
+ %res = call i4 @llvm.sdiv.fix.sat.i4(i4 3, i4 -2, i32 1) ; %res = -3 (1.5 / -1 = -1.5)
+
+ ; The result in the following could be rounded up to 1 or down to 0.5
+ %res = call i4 @llvm.sdiv.fix.sat.i4(i4 3, i4 4, i32 1) ; %res = 2 (or 1) (1.5 / 2 = 0.75)
+
+ ; Saturation
+ %res = call i4 @llvm.sdiv.fix.sat.i4(i4 -8, i4 -1, i32 0) ; %res = 7 (-8 / -1 = 8 => 7)
+ %res = call i4 @llvm.sdiv.fix.sat.i4(i4 4, i4 2, i32 2) ; %res = 7 (1 / 0.5 = 2 => 1.75)
+ %res = call i4 @llvm.sdiv.fix.sat.i4(i4 -4, i4 1, i32 2) ; %res = -8 (-1 / 0.25 = -4 => -2)
+
+
+'``llvm.udiv.fix.sat.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.udiv.fix.sat``
+on any integer bit width or vectors of integers.
+
+::
+
+ declare i16 @llvm.udiv.fix.sat.i16(i16 %a, i16 %b, i32 %scale)
+ declare i32 @llvm.udiv.fix.sat.i32(i32 %a, i32 %b, i32 %scale)
+ declare i64 @llvm.udiv.fix.sat.i64(i64 %a, i64 %b, i32 %scale)
+ declare <4 x i32> @llvm.udiv.fix.sat.v4i32(<4 x i32> %a, <4 x i32> %b, i32 %scale)
+
+Overview
+"""""""""
+
+The '``llvm.udiv.fix.sat``' family of intrinsic functions perform unsigned
+fixed point saturating division on 2 arguments of the same scale.
+
+Arguments
+""""""""""
+
+The arguments (%a and %b) and the result may be of integer types of any bit
+width, but they must have the same bit width. ``%a`` and ``%b`` are the two
+values that will undergo unsigned fixed point division. The argument
+``%scale`` represents the scale of both operands, and must be a constant
+integer.
+
+Semantics:
+""""""""""
+
+This operation performs fixed point division on the 2 arguments of a
+specified scale. The result will also be returned in the same scale specified
+in the third argument.
+
+If the result value cannot be precisely represented in the given scale, the
+value is rounded up or down to the closest representable value. The rounding
+direction is unspecified.
+
+The maximum value this operation can clamp to is the largest unsigned value
+representable by the bit width of the first 2 arguments. The minimum value is the
+smallest unsigned value representable by this bit width (zero).
+
+It is undefined behavior if the second argument is zero.
+
+Examples
+"""""""""
+
+.. code-block:: llvm
+
+ %res = call i4 @llvm.udiv.fix.sat.i4(i4 6, i4 2, i32 0) ; %res = 3 (6 / 2 = 3)
+ %res = call i4 @llvm.udiv.fix.sat.i4(i4 6, i4 4, i32 1) ; %res = 3 (3 / 2 = 1.5)
+
+ ; The result in the following could be rounded down to 0.5 or up to 1
+ %res = call i4 @llvm.udiv.fix.sat.i4(i4 3, i4 4, i32 1) ; %res = 1 (or 2) (1.5 / 2 = 0.75)
+
+ ; Saturation
+ %res = call i4 @llvm.udiv.fix.sat.i4(i4 8, i4 2, i32 2) ; %res = 15 (2 / 0.5 = 4 => 3.75)
+
+
Specialised Arithmetic Intrinsics
---------------------------------
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index afeaf5e2c26f..c5e4a375a635 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -291,6 +291,11 @@ namespace ISD {
/// constant integer.
SDIVFIX, UDIVFIX,
+ /// Same as the corresponding unsaturated fixed point instructions, but the
+ /// result is clamped between the min and max values representable by the
+ /// bits of the first 2 operands.
+ SDIVFIXSAT, UDIVFIXSAT,
+
/// Simple binary floating point operators.
FADD, FSUB, FMUL, FDIV, FREM,
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 0d6c3ab97018..9583e2b718e5 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1043,7 +1043,9 @@ class TargetLoweringBase {
case ISD::UMULFIX:
case ISD::UMULFIXSAT:
case ISD::SDIVFIX:
+ case ISD::SDIVFIXSAT:
case ISD::UDIVFIX:
+ case ISD::UDIVFIXSAT:
Supported = isSupportedFixedPointOperation(Op, VT, Scale);
break;
}
@@ -4269,7 +4271,7 @@ class TargetLowering : public TargetLoweringBase {
/// method accepts integers as its arguments.
SDValue expandFixedPointMul(SDNode *Node, SelectionDAG &DAG) const;
- /// Method for building the DAG expansion of ISD::[US]DIVFIX. This
+ /// Method for building the DAG expansion of ISD::[US]DIVFIX[SAT]. This
/// method accepts integers as its arguments.
/// Note: This method may fail if the division could not be performed
/// within the type. Clients must retry with a wider type if this happens.
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 563f8b0e4e7a..797d7b1765c3 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -969,6 +969,14 @@ def int_umul_fix_sat : Intrinsic<[llvm_anyint_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty],
[IntrNoMem, IntrSpeculatable, IntrWillReturn, Commutative, ImmArg<2>]>;
+def int_sdiv_fix_sat : Intrinsic<[llvm_anyint_ty],
+ [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty],
+ [IntrNoMem, ImmArg<2>]>;
+
+def int_udiv_fix_sat : Intrinsic<[llvm_anyint_ty],
+ [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty],
+ [IntrNoMem, ImmArg<2>]>;
+
//===------------------------- Memory Use Markers -------------------------===//
//
def int_lifetime_start : Intrinsic<[],
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index 7d8b97069de7..25b77baed2e1 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -402,7 +402,9 @@ def smulfixsat : SDNode<"ISD::SMULFIXSAT", SDTIntScaledBinOp, [SDNPCommutative]>
def umulfix : SDNode<"ISD::UMULFIX" , SDTIntScaledBinOp, [SDNPCommutative]>;
def umulfixsat : SDNode<"ISD::UMULFIXSAT", SDTIntScaledBinOp, [SDNPCommutative]>;
def sdivfix : SDNode<"ISD::SDIVFIX" , SDTIntScaledBinOp>;
+def sdivfixsat : SDNode<"ISD::SDIVFIXSAT", SDTIntScaledBinOp>;
def udivfix : SDNode<"ISD::UDIVFIX" , SDTIntScaledBinOp>;
+def udivfixsat : SDNode<"ISD::UDIVFIXSAT", SDTIntScaledBinOp>;
def sext_inreg : SDNode<"ISD::SIGN_EXTEND_INREG", SDTExtInreg>;
def sext_invec : SDNode<"ISD::SIGN_EXTEND_VECTOR_INREG", SDTExtInvec>;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 6fd048a911b4..fefca4bac201 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1132,7 +1132,9 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
case ISD::UMULFIX:
case ISD::UMULFIXSAT:
case ISD::SDIVFIX:
- case ISD::UDIVFIX: {
+ case ISD::SDIVFIXSAT:
+ case ISD::UDIVFIX:
+ case ISD::UDIVFIXSAT: {
unsigned Scale = Node->getConstantOperandVal(2);
Action = TLI.getFixedPointOperationAction(Node->getOpcode(),
Node->getValueType(0), Scale);
@@ -3489,7 +3491,9 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
Results.push_back(TLI.expandFixedPointMul(Node, DAG));
break;
case ISD::SDIVFIX:
+ case ISD::SDIVFIXSAT:
case ISD::UDIVFIX:
+ case ISD::UDIVFIXSAT:
if (SDValue V = TLI.expandFixedPointDiv(Node->getOpcode(), SDLoc(Node),
Node->getOperand(0),
Node->getOperand(1),
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 8e55eaa3e3ba..36675d9459fb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -162,7 +162,9 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::UMULFIXSAT: Res = PromoteIntRes_MULFIX(N); break;
case ISD::SDIVFIX:
- case ISD::UDIVFIX: Res = PromoteIntRes_DIVFIX(N); break;
+ case ISD::SDIVFIXSAT:
+ case ISD::UDIVFIX:
+ case ISD::UDIVFIXSAT: Res = PromoteIntRes_DIVFIX(N); break;
case ISD::ABS: Res = PromoteIntRes_ABS(N); break;
@@ -784,22 +786,51 @@ SDValue DAGTypeLegalizer::PromoteIntRes_MULFIX(SDNode *N) {
N->getOperand(2));
}
+static SDValue SaturateWidenedDIVFIX(SDValue V, SDLoc &dl,
+ unsigned SatW, bool Signed,
+ const TargetLowering &TLI,
+ SelectionDAG &DAG) {
+ EVT VT = V.getValueType();
+ unsigned VTW = VT.getScalarSizeInBits();
+
+ if (!Signed) {
+ // Saturate to the unsigned maximum by getting the minimum of V and the
+ // maximum.
+ return DAG.getNode(ISD::UMIN, dl, VT, V,
+ DAG.getConstant(APInt::getLowBitsSet(VTW, SatW),
+ dl, VT));
+ }
+
+ // Saturate to the signed maximum (the low SatW - 1 bits) by taking the
+ // signed minimum of it and V.
+ V = DAG.getNode(ISD::SMIN, dl, VT, V,
+ DAG.getConstant(APInt::getLowBitsSet(VTW, SatW - 1),
+ dl, VT));
+  // Saturate to the signed minimum (the high VTW - SatW + 1 bits) by taking
+  // the signed maximum of it and V.
+ V = DAG.getNode(ISD::SMAX, dl, VT, V,
+ DAG.getConstant(APInt::getHighBitsSet(VTW, VTW - SatW + 1),
+ dl, VT));
+ return V;
+}
+
static SDValue earlyExpandDIVFIX(SDNode *N, SDValue LHS, SDValue RHS,
- unsigned Scale, const TargetLowering &TLI,
- SelectionDAG &DAG) {
+ unsigned Scale, const TargetLowering &TLI,
+ SelectionDAG &DAG, unsigned SatW = 0) {
EVT VT = LHS.getValueType();
- bool Signed = N->getOpcode() == ISD::SDIVFIX;
+ unsigned VTSize = VT.getScalarSizeInBits();
+ bool Signed = N->getOpcode() == ISD::SDIVFIX ||
+ N->getOpcode() == ISD::SDIVFIXSAT;
+ bool Saturating = N->getOpcode() == ISD::SDIVFIXSAT ||
+ N->getOpcode() == ISD::UDIVFIXSAT;
SDLoc dl(N);
- // See if we can perform the division in this type without widening.
- if (SDValue V = TLI.expandFixedPointDiv(N->getOpcode(), dl, LHS, RHS, Scale,
- DAG))
- return V;
-
- // If that didn't work, double the type width and try again. That must work,
- // or something is wrong.
- EVT WideVT = EVT::getIntegerVT(*DAG.getContext(),
- VT.getScalarSizeInBits() * 2);
+ // Widen the types by a factor of two. This is guaranteed to expand, since it
+ // will always have enough high bits in the LHS to shift into.
+ EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), VTSize * 2);
+ if (VT.isVector())
+ WideVT = EVT::getVectorVT(*DAG.getContext(), WideVT,
+ VT.getVectorElementCount());
if (Signed) {
LHS = DAG.getSExtOrTrunc(LHS, dl, WideVT);
RHS = DAG.getSExtOrTrunc(RHS, dl, WideVT);
@@ -808,18 +839,28 @@ static SDValue earlyExpandDIVFIX(SDNode *N, SDValue LHS, SDValue RHS,
RHS = DAG.getZExtOrTrunc(RHS, dl, WideVT);
}
- // TODO: Saturation.
-
SDValue Res = TLI.expandFixedPointDiv(N->getOpcode(), dl, LHS, RHS, Scale,
DAG);
assert(Res && "Expanding DIVFIX with wide type failed?");
+ if (Saturating) {
+ // If the caller has told us to saturate at something less, use that width
+ // instead of the type before doubling. However, it cannot be more than
+ // what we just widened!
+ assert(SatW <= VTSize &&
+ "Tried to saturate to more than the original type?");
+ Res = SaturateWidenedDIVFIX(Res, dl, SatW == 0 ? VTSize : SatW, Signed,
+ TLI, DAG);
+ }
return DAG.getZExtOrTrunc(Res, dl, VT);
}
SDValue DAGTypeLegalizer::PromoteIntRes_DIVFIX(SDNode *N) {
SDLoc dl(N);
SDValue Op1Promoted, Op2Promoted;
- bool Signed = N->getOpcode() == ISD::SDIVFIX;
+ bool Signed = N->getOpcode() == ISD::SDIVFIX ||
+ N->getOpcode() == ISD::SDIVFIXSAT;
+ bool Saturating = N->getOpcode() == ISD::SDIVFIXSAT ||
+ N->getOpcode() == ISD::UDIVFIXSAT;
if (Signed) {
Op1Promoted = SExtPromotedInteger(N->getOperand(0));
Op2Promoted = SExtPromotedInteger(N->getOperand(1));
@@ -830,23 +871,41 @@ SDValue DAGTypeLegalizer::PromoteIntRes_DIVFIX(SDNode *N) {
EVT PromotedType = Op1Promoted.getValueType();
unsigned Scale = N->getConstantOperandVal(2);
- SDValue Res;
// If the type is already legal and the operation is legal in that type, we
// should not early expand.
if (TLI.isTypeLegal(PromotedType)) {
TargetLowering::LegalizeAction Action =
TLI.getFixedPointOperationAction(N->getOpcode(), PromotedType, Scale);
- if (Action == TargetLowering::Legal || Action == TargetLowering::Custom)
- Res = DAG.getNode(N->getOpcode(), dl, PromotedType, Op1Promoted,
- Op2Promoted, N->getOperand(2));
+ if (Action == TargetLowering::Legal || Action == TargetLowering::Custom) {
+ EVT ShiftTy = TLI.getShiftAmountTy(PromotedType, DAG.getDataLayout());
+ unsigned Diff = PromotedType.getScalarSizeInBits() -
+ N->getValueType(0).getScalarSizeInBits();
+ if (Saturating)
+ Op1Promoted = DAG.getNode(ISD::SHL, dl, PromotedType, Op1Promoted,
+ DAG.getConstant(Diff, dl, ShiftTy));
+ SDValue Res = DAG.getNode(N->getOpcode(), dl, PromotedType, Op1Promoted,
+ Op2Promoted, N->getOperand(2));
+ if (Saturating)
+ Res = DAG.getNode(Signed ? ISD::SRA : ISD::SRL, dl, PromotedType, Res,
+ DAG.getConstant(Diff, dl, ShiftTy));
+ return Res;
+ }
}
- if (!Res)
- Res = earlyExpandDIVFIX(N, Op1Promoted, Op2Promoted, Scale, TLI, DAG);
-
- // TODO: Saturation.
-
- return Res;
+ // See if we can perform the division in this type without expanding.
+ if (SDValue Res = TLI.expandFixedPointDiv(N->getOpcode(), dl, Op1Promoted,
+ Op2Promoted, Scale, DAG)) {
+ if (Saturating)
+ Res = SaturateWidenedDIVFIX(Res, dl,
+ N->getValueType(0).getScalarSizeInBits(),
+ Signed, TLI, DAG);
+ return Res;
+ }
+ // If we cannot, expand it to twice the type width. If we are saturating, give
+ // it the original width as a saturating width so we don't need to emit
+ // two saturations.
+ return earlyExpandDIVFIX(N, Op1Promoted, Op2Promoted, Scale, TLI, DAG,
+ N->getValueType(0).getScalarSizeInBits());
}
SDValue DAGTypeLegalizer::PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo) {
@@ -1315,7 +1374,9 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
case ISD::UMULFIX:
case ISD::UMULFIXSAT:
case ISD::SDIVFIX:
- case ISD::UDIVFIX: Res = PromoteIntOp_FIX(N); break;
+ case ISD::SDIVFIXSAT:
+ case ISD::UDIVFIX:
+ case ISD::UDIVFIXSAT: Res = PromoteIntOp_FIX(N); break;
case ISD::FPOWI: Res = PromoteIntOp_FPOWI(N); break;
@@ -1923,7 +1984,9 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::UMULFIXSAT: ExpandIntRes_MULFIX(N, Lo, Hi); break;
case ISD::SDIVFIX:
- case ISD::UDIVFIX: ExpandIntRes_DIVFIX(N, Lo, Hi); break;
+ case ISD::SDIVFIXSAT:
+ case ISD::UDIVFIX:
+ case ISD::UDIVFIXSAT: ExpandIntRes_DIVFIX(N, Lo, Hi); break;
case ISD::VECREDUCE_ADD:
case ISD::VECREDUCE_MUL:
@@ -3253,8 +3316,15 @@ void DAGTypeLegalizer::ExpandIntRes_MULFIX(SDNode *N, SDValue &Lo,
void DAGTypeLegalizer::ExpandIntRes_DIVFIX(SDNode *N, SDValue &Lo,
SDValue &Hi) {
- SDValue Res = earlyExpandDIVFIX(N, N->getOperand(0), N->getOperand(1),
- N->getConstantOperandVal(2), TLI, DAG);
+ SDLoc dl(N);
+ // Try expanding in the existing type first.
+ SDValue Res = TLI.expandFixedPointDiv(N->getOpcode(), dl, N->getOperand(0),
+ N->getOperand(1),
+ N->getConstantOperandVal(2), DAG);
+
+ if (!Res)
+ Res = earlyExpandDIVFIX(N, N->getOperand(0), N->getOperand(1),
+ N->getConstantOperandVal(2), TLI, DAG);
SplitInteger(Res, Lo, Hi);
}
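To make the mask construction in SaturateWidenedDIVFIX above concrete: for
a quotient computed in a 32-bit type that must saturate to 16-bit signed
bounds (VTW = 32, SatW = 16), the clamp constants are the low SatW - 1 bits
and the high VTW - SatW + 1 bits. A standalone scalar sketch, not patch
code:
```
#include <algorithm>
#include <cstdint>
#include <cstdio>

// Scalar model of SaturateWidenedDIVFIX for VTW = 32, SatW = 16, signed.
// APInt::getLowBitsSet(32, 15)  -> 0x00007fff, the largest 16-bit value.
// APInt::getHighBitsSet(32, 17) -> 0xffff8000, the smallest, sign-extended.
int32_t saturate_widened(int32_t v) {
  const int32_t max = 0x00007fff;                        // low SatW - 1 bits
  const int32_t min = static_cast<int32_t>(0xffff8000u); // high VTW - SatW + 1
  return std::max(min, std::min(v, max));                // SMAX(SMIN(v, max))
}

int main() {
  std::printf("%d %d\n", saturate_widened(40000),   // prints 32767
                         saturate_widened(-40000)); // prints -32768
}
```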
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index c3ca69f826ac..a624228dac0d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -142,7 +142,7 @@ class VectorLegalizer {
void ExpandUADDSUBO(SDNode *Node, SmallVectorImpl<SDValue> &Results);
void ExpandSADDSUBO(SDNode *Node, SmallVectorImpl<SDValue> &Results);
void ExpandMULO(SDNode *Node, SmallVectorImpl<SDValue> &Results);
- SDValue ExpandFixedPointDiv(SDNode *Node);
+ void ExpandFixedPointDiv(SDNode *Node, SmallVectorImpl<SDValue> &Results);
SDValue ExpandStrictFPOp(SDNode *Node);
void ExpandStrictFPOp(SDNode *Node, SmallVectorImpl<SDValue> &Results);
@@ -463,7 +463,9 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
case ISD::UMULFIX:
case ISD::UMULFIXSAT:
case ISD::SDIVFIX:
- case ISD::UDIVFIX: {
+ case ISD::SDIVFIXSAT:
+ case ISD::UDIVFIX:
+ case ISD::UDIVFIXSAT: {
unsigned Scale = Node->getConstantOperandVal(2);
Action = TLI.getFixedPointOperationAction(Node->getOpcode(),
Node->getValueType(0), Scale);
@@ -968,8 +970,11 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
break;
case ISD::SDIVFIX:
case ISD::UDIVFIX:
- Results.push_back(ExpandFixedPointDiv(Node));
+ ExpandFixedPointDiv(Node, Results);
return;
+ case ISD::SDIVFIXSAT:
+ case ISD::UDIVFIXSAT:
+ break;
#define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \
case ISD::STRICT_##DAGN:
#include "llvm/IR/ConstrainedOps.def"
@@ -1454,12 +1459,12 @@ void VectorLegalizer::ExpandMULO(SDNode *Node,
Results.push_back(Overflow);
}
-SDValue VectorLegalizer::ExpandFixedPointDiv(SDNode *Node) {
+void VectorLegalizer::ExpandFixedPointDiv(SDNode *Node,
+ SmallVectorImpl<SDValue> &Results) {
SDNode *N = Node;
if (SDValue Expanded = TLI.expandFixedPointDiv(N->getOpcode(), SDLoc(N),
N->getOperand(0), N->getOperand(1), N->getConstantOperandVal(2), DAG))
- return Expanded;
- return DAG.UnrollVectorOp(N);
+ Results.push_back(Expanded);
}
void VectorLegalizer::ExpandStrictFPOp(SDNode *Node,
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 862b11c54a76..5e4c35263b7f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -166,7 +166,9 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
case ISD::UMULFIX:
case ISD::UMULFIXSAT:
case ISD::SDIVFIX:
+ case ISD::SDIVFIXSAT:
case ISD::UDIVFIX:
+ case ISD::UDIVFIXSAT:
R = ScalarizeVecRes_FIX(N);
break;
}
@@ -956,7 +958,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
case ISD::UMULFIX:
case ISD::UMULFIXSAT:
case ISD::SDIVFIX:
+ case ISD::SDIVFIXSAT:
case ISD::UDIVFIX:
+ case ISD::UDIVFIXSAT:
SplitVecRes_FIX(N, Lo, Hi);
break;
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 7cf3e43088f2..002610ea24f2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5451,7 +5451,8 @@ static SDValue expandDivFix(unsigned Opcode, const SDLoc &DL,
SDValue LHS, SDValue RHS, SDValue Scale,
SelectionDAG &DAG, const TargetLowering &TLI) {
EVT VT = LHS.getValueType();
- bool Signed = Opcode == ISD::SDIVFIX;
+ bool Signed = Opcode == ISD::SDIVFIX || Opcode == ISD::SDIVFIXSAT;
+ bool Saturating = Opcode == ISD::SDIVFIXSAT || Opcode == ISD::UDIVFIXSAT;
LLVMContext &Ctx = *DAG.getContext();
// If the type is legal but the operation isn't, this node might survive all
@@ -5463,14 +5464,16 @@ static SDValue expandDivFix(unsigned Opcode, const SDLoc &DL,
// by bumping the size by one bit. This will force it to Promote, enabling the
// early expansion and avoiding the need to expand later.
- // We don't have to do this if Scale is 0; that can always be expanded.
+ // We don't have to do this if Scale is 0; that can always be expanded, unless
+ // it's a saturating signed operation. Those can experience true integer
+ // division overflow, a case which we must avoid.
// FIXME: We wouldn't have to do this (or any of the early
// expansion/promotion) if it was possible to expand a libcall of an
// illegal type during operation legalization. But it's not, so things
// get a bit hacky.
unsigned ScaleInt = cast<ConstantSDNode>(Scale)->getZExtValue();
- if (ScaleInt > 0 &&
+ if ((ScaleInt > 0 || (Saturating && Signed)) &&
(TLI.isTypeLegal(VT) ||
(VT.isVector() && TLI.isTypeLegal(VT.getVectorElementType())))) {
TargetLowering::LegalizeAction Action = TLI.getFixedPointOperationAction(
@@ -5492,8 +5495,16 @@ static SDValue expandDivFix(unsigned Opcode, const SDLoc &DL,
LHS = DAG.getZExtOrTrunc(LHS, DL, PromVT);
RHS = DAG.getZExtOrTrunc(RHS, DL, PromVT);
}
- // TODO: Saturation.
+ EVT ShiftTy = TLI.getShiftAmountTy(PromVT, DAG.getDataLayout());
+ // For saturating operations, we need to shift up the LHS to get the
+ // proper saturation width, and then shift down again afterwards.
+ if (Saturating)
+ LHS = DAG.getNode(ISD::SHL, DL, PromVT, LHS,
+ DAG.getConstant(1, DL, ShiftTy));
SDValue Res = DAG.getNode(Opcode, DL, PromVT, LHS, RHS, Scale);
+ if (Saturating)
+ Res = DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, PromVT, Res,
+ DAG.getConstant(1, DL, ShiftTy));
return DAG.getZExtOrTrunc(Res, DL, VT);
}
}
@@ -5757,6 +5768,10 @@ static unsigned FixedPointIntrinsicToOpcode(unsigned Intrinsic) {
return ISD::SDIVFIX;
case Intrinsic::udiv_fix:
return ISD::UDIVFIX;
+ case Intrinsic::sdiv_fix_sat:
+ return ISD::SDIVFIXSAT;
+ case Intrinsic::udiv_fix_sat:
+ return ISD::UDIVFIXSAT;
default:
llvm_unreachable("Unhandled fixed point intrinsic");
}
@@ -6460,7 +6475,9 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
return;
}
case Intrinsic::sdiv_fix:
- case Intrinsic::udiv_fix: {
+ case Intrinsic::udiv_fix:
+ case Intrinsic::sdiv_fix_sat:
+ case Intrinsic::udiv_fix_sat: {
SDValue Op1 = getValue(I.getArgOperand(0));
SDValue Op2 = getValue(I.getArgOperand(1));
SDValue Op3 = getValue(I.getArgOperand(2));
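The shift-up/shift-down pattern used in expandDivFix above (and in
PromoteIntRes_DIVFIX) works because arithmetic-shifting the wide type's
saturation bounds right by the promotion amount yields exactly the narrow
type's bounds, so saturating the upscaled quotient at the wide width and
shifting back is the same as saturating at the original width. A
standalone numeric check of that identity, not patch code:
```
#include <cstdint>
#include <cstdio>

int main() {
  // Promote i16 to i32, so Diff = 16.
  const int Diff = 16;
  const int32_t WideMax = INT32_MAX; // i32 signed maximum, 2^31 - 1
  const int32_t WideMin = INT32_MIN; // i32 signed minimum, -2^31
  // Arithmetic right shift by Diff recovers the i16 bounds exactly.
  std::printf("%d %d\n", WideMax >> Diff,  // 32767  == i16 max
                         WideMin >> Diff); // -32768 == i16 min
}
```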
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index f57852ced660..0fd132f03af8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -314,7 +314,9 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::UMULFIXSAT: return "umulfixsat";
case ISD::SDIVFIX: return "sdivfix";
+ case ISD::SDIVFIXSAT: return "sdivfixsat";
case ISD::UDIVFIX: return "udivfix";
+ case ISD::UDIVFIXSAT: return "udivfixsat";
// Conversion operators.
case ISD::SIGN_EXTEND: return "sign_extend";
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 1e345caa1e17..f3f9c6dd7003 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -7332,12 +7332,13 @@ SDValue
TargetLowering::expandFixedPointDiv(unsigned Opcode, const SDLoc &dl,
SDValue LHS, SDValue RHS,
unsigned Scale, SelectionDAG &DAG) const {
- assert((Opcode == ISD::SDIVFIX ||
- Opcode == ISD::UDIVFIX) &&
+ assert((Opcode == ISD::SDIVFIX || Opcode == ISD::SDIVFIXSAT ||
+ Opcode == ISD::UDIVFIX || Opcode == ISD::UDIVFIXSAT) &&
"Expected a fixed point division opcode");
EVT VT = LHS.getValueType();
- bool Signed = Opcode == ISD::SDIVFIX;
+ bool Signed = Opcode == ISD::SDIVFIX || Opcode == ISD::SDIVFIXSAT;
+ bool Saturating = Opcode == ISD::SDIVFIXSAT || Opcode == ISD::UDIVFIXSAT;
EVT BoolVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
// If there is enough room in the type to upscale the LHS or downscale the
@@ -7349,7 +7350,15 @@ TargetLowering::expandFixedPointDiv(unsigned Opcode, const SDLoc &dl,
: DAG.computeKnownBits(LHS).countMinLeadingZeros();
unsigned RHSTrail = DAG.computeKnownBits(RHS).countMinTrailingZeros();
- if (LHSLead + RHSTrail < Scale)
+ // For signed saturating operations, we need to be able to detect true integer
+  // division overflow; that is, when you have MIN / -EPS. However, this is
+  // undefined behavior for the underlying division, and if we emit divisions
+  // that could take such values it may cause undesired behavior (arithmetic
+  // exceptions on x86, for example).
+ // Avoid this by requiring an extra bit so that we never get this case.
+ // FIXME: This is a bit unfortunate as it means that for an 8-bit 7-scale
+ // signed saturating division, we need to emit a whopping 32-bit division.
+ if (LHSLead + RHSTrail < Scale + (unsigned)(Saturating && Signed))
return SDValue();
unsigned LHSShift = std::min(LHSLead, Scale);
@@ -7403,8 +7412,6 @@ TargetLowering::expandFixedPointDiv(unsigned Opcode, const SDLoc &dl,
Quot = DAG.getNode(ISD::UDIV, dl, VT,
LHS, RHS);
- // TODO: Saturation.
-
return Quot;
}
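The overflow case that the extra required bit rules out is easy to show in
isolation (a sketch assuming a mainstream x86 toolchain, not patch code):
once the dividend has at least one more bit of headroom than the scale
needs, it can never equal the wide type's minimum, so the single
overflowing case of signed division disappears.
```
#include <cstdint>
#include <cstdio>

int main() {
  // INT32_MIN / -1 overflows i32: the true quotient 2^31 is not
  // representable, and on x86 the idiv instruction raises #DE.
  // int32_t bad = INT32_MIN / -1;   // undefined behavior -- do not enable
  // One extra bit of headroom removes the case: a sign-extended 32-bit
  // dividend in a 64-bit division can never be INT64_MIN.
  long long ok = static_cast<long long>(INT32_MIN) / -1;
  std::printf("%lld\n", ok); // prints 2147483648
}
```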
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index bd717a8585ee..95c63d09718c 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -660,7 +660,9 @@ void TargetLoweringBase::initActions() {
setOperationAction(ISD::UMULFIX, VT, Expand);
setOperationAction(ISD::UMULFIXSAT, VT, Expand);
setOperationAction(ISD::SDIVFIX, VT, Expand);
+ setOperationAction(ISD::SDIVFIXSAT, VT, Expand);
setOperationAction(ISD::UDIVFIX, VT, Expand);
+ setOperationAction(ISD::UDIVFIXSAT, VT, Expand);
// Overflow operations default to expand
setOperationAction(ISD::SADDO, VT, Expand);
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 6af581b178dc..fefb3a4751c3 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -4727,7 +4727,9 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
case Intrinsic::umul_fix:
case Intrinsic::umul_fix_sat:
case Intrinsic::sdiv_fix:
- case Intrinsic::udiv_fix: {
+ case Intrinsic::sdiv_fix_sat:
+ case Intrinsic::udiv_fix:
+ case Intrinsic::udiv_fix_sat: {
Value *Op1 = Call.getArgOperand(0);
Value *Op2 = Call.getArgOperand(1);
Assert(Op1->getType()->isIntOrIntVectorTy(),
@@ -4742,7 +4744,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
"third argument of [us][mul|div]_fix[_sat] must fit within 32 bits");
if (ID == Intrinsic::smul_fix || ID == Intrinsic::smul_fix_sat ||
- ID == Intrinsic::sdiv_fix) {
+ ID == Intrinsic::sdiv_fix || ID == Intrinsic::sdiv_fix_sat) {
Assert(
Op3->getZExtValue() < Op1->getType()->getScalarSizeInBits(),
"the scale of s[mul|div]_fix[_sat] must be less than the width of "
diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
new file mode 100644
index 000000000000..e35130dcb535
--- /dev/null
+++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
@@ -0,0 +1,1411 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686 -mattr=cmov | FileCheck %s --check-prefix=X86
+
+declare i4 @llvm.sdiv.fix.sat.i4 (i4, i4, i32)
+declare i15 @llvm.sdiv.fix.sat.i15 (i15, i15, i32)
+declare i16 @llvm.sdiv.fix.sat.i16 (i16, i16, i32)
+declare i18 @llvm.sdiv.fix.sat.i18 (i18, i18, i32)
+declare i64 @llvm.sdiv.fix.sat.i64 (i64, i64, i32)
+declare <4 x i32> @llvm.sdiv.fix.sat.v4i32(<4 x i32>, <4 x i32>, i32)
+
+define i16 @func(i16 %x, i16 %y) nounwind {
+;
+; X64-LABEL: func:
+; X64: # %bb.0:
+; X64-NEXT: movswl %si, %esi
+; X64-NEXT: movswl %di, %ecx
+; X64-NEXT: shll $8, %ecx
+; X64-NEXT: movl %ecx, %eax
+; X64-NEXT: cltd
+; X64-NEXT: idivl %esi
+; X64-NEXT: # kill: def $eax killed $eax def $rax
+; X64-NEXT: leal -1(%rax), %edi
+; X64-NEXT: testl %esi, %esi
+; X64-NEXT: sets %sil
+; X64-NEXT: testl %ecx, %ecx
+; X64-NEXT: sets %cl
+; X64-NEXT: xorb %sil, %cl
+; X64-NEXT: testl %edx, %edx
+; X64-NEXT: setne %dl
+; X64-NEXT: testb %cl, %dl
+; X64-NEXT: cmovel %eax, %edi
+; X64-NEXT: cmpl $65535, %edi # imm = 0xFFFF
+; X64-NEXT: movl $65535, %ecx # imm = 0xFFFF
+; X64-NEXT: cmovll %edi, %ecx
+; X64-NEXT: cmpl $-65536, %ecx # imm = 0xFFFF0000
+; X64-NEXT: movl $-65536, %eax # imm = 0xFFFF0000
+; X64-NEXT: cmovgl %ecx, %eax
+; X64-NEXT: shrl %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
+;
+; X86-LABEL: func:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movswl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movswl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: shll $8, %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: cltd
+; X86-NEXT: idivl %esi
+; X86-NEXT: leal -1(%eax), %edi
+; X86-NEXT: testl %esi, %esi
+; X86-NEXT: sets %bl
+; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: sets %cl
+; X86-NEXT: xorb %bl, %cl
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: setne %dl
+; X86-NEXT: testb %cl, %dl
+; X86-NEXT: cmovel %eax, %edi
+; X86-NEXT: cmpl $65535, %edi # imm = 0xFFFF
+; X86-NEXT: movl $65535, %ecx # imm = 0xFFFF
+; X86-NEXT: cmovll %edi, %ecx
+; X86-NEXT: cmpl $-65536, %ecx # imm = 0xFFFF0000
+; X86-NEXT: movl $-65536, %eax # imm = 0xFFFF0000
+; X86-NEXT: cmovgl %ecx, %eax
+; X86-NEXT: shrl %eax
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: retl
+ %tmp = call i16 @llvm.sdiv.fix.sat.i16(i16 %x, i16 %y, i32 7)
+ ret i16 %tmp
+}
+
+define i16 @func2(i8 %x, i8 %y) nounwind {
+;
+; X64-LABEL: func2:
+; X64: # %bb.0:
+; X64-NEXT: movsbl %dil, %eax
+; X64-NEXT: movsbl %sil, %ecx
+; X64-NEXT: movswl %cx, %esi
+; X64-NEXT: movswl %ax, %ecx
+; X64-NEXT: shll $14, %ecx
+; X64-NEXT: movl %ecx, %eax
+; X64-NEXT: cltd
+; X64-NEXT: idivl %esi
+; X64-NEXT: # kill: def $eax killed $eax def $rax
+; X64-NEXT: leal -1(%rax), %edi
+; X64-NEXT: testl %esi, %esi
+; X64-NEXT: sets %sil
+; X64-NEXT: testl %ecx, %ecx
+; X64-NEXT: sets %cl
+; X64-NEXT: xorb %sil, %cl
+; X64-NEXT: testl %edx, %edx
+; X64-NEXT: setne %dl
+; X64-NEXT: testb %cl, %dl
+; X64-NEXT: cmovel %eax, %edi
+; X64-NEXT: cmpl $16383, %edi # imm = 0x3FFF
+; X64-NEXT: movl $16383, %ecx # imm = 0x3FFF
+; X64-NEXT: cmovll %edi, %ecx
+; X64-NEXT: cmpl $-16384, %ecx # imm = 0xC000
+; X64-NEXT: movl $-16384, %eax # imm = 0xC000
+; X64-NEXT: cmovgl %ecx, %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
+;
+; X86-LABEL: func2:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movsbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movsbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: shll $14, %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: cltd
+; X86-NEXT: idivl %esi
+; X86-NEXT: leal -1(%eax), %edi
+; X86-NEXT: testl %esi, %esi
+; X86-NEXT: sets %bl
+; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: sets %cl
+; X86-NEXT: xorb %bl, %cl
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: setne %dl
+; X86-NEXT: testb %cl, %dl
+; X86-NEXT: cmovel %eax, %edi
+; X86-NEXT: cmpl $16383, %edi # imm = 0x3FFF
+; X86-NEXT: movl $16383, %ecx # imm = 0x3FFF
+; X86-NEXT: cmovll %edi, %ecx
+; X86-NEXT: cmpl $-16384, %ecx # imm = 0xC000
+; X86-NEXT: movl $-16384, %eax # imm = 0xC000
+; X86-NEXT: cmovgl %ecx, %eax
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: retl
+ %x2 = sext i8 %x to i15
+ %y2 = sext i8 %y to i15
+ %tmp = call i15 @llvm.sdiv.fix.sat.i15(i15 %x2, i15 %y2, i32 14)
+ %tmp2 = sext i15 %tmp to i16
+ ret i16 %tmp2
+}
+
+define i16 @func3(i15 %x, i8 %y) nounwind {
+;
+; X64-LABEL: func3:
+; X64: # %bb.0:
+; X64-NEXT: shll $8, %esi
+; X64-NEXT: movswl %si, %ecx
+; X64-NEXT: addl %edi, %edi
+; X64-NEXT: shrl $4, %ecx
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: cwtd
+; X64-NEXT: idivw %cx
+; X64-NEXT: # kill: def $ax killed $ax def $rax
+; X64-NEXT: leal -1(%rax), %esi
+; X64-NEXT: testw %di, %di
+; X64-NEXT: sets %dil
+; X64-NEXT: testw %cx, %cx
+; X64-NEXT: sets %cl
+; X64-NEXT: xorb %dil, %cl
+; X64-NEXT: testw %dx, %dx
+; X64-NEXT: setne %dl
+; X64-NEXT: testb %cl, %dl
+; X64-NEXT: cmovel %eax, %esi
+; X64-NEXT: movswl %si, %eax
+; X64-NEXT: cmpl $16383, %eax # imm = 0x3FFF
+; X64-NEXT: movl $16383, %ecx # imm = 0x3FFF
+; X64-NEXT: cmovll %esi, %ecx
+; X64-NEXT: movswl %cx, %eax
+; X64-NEXT: cmpl $-16384, %eax # imm = 0xC000
+; X64-NEXT: movl $49152, %eax # imm = 0xC000
+; X64-NEXT: cmovgl %ecx, %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
+;
+; X86-LABEL: func3:
+; X86: # %bb.0:
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shll $8, %eax
+; X86-NEXT: movswl %ax, %esi
+; X86-NEXT: addl %ecx, %ecx
+; X86-NEXT: shrl $4, %esi
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: cwtd
+; X86-NEXT: idivw %si
+; X86-NEXT: # kill: def $ax killed $ax def $eax
+; X86-NEXT: leal -1(%eax), %edi
+; X86-NEXT: testw %cx, %cx
+; X86-NEXT: sets %cl
+; X86-NEXT: testw %si, %si
+; X86-NEXT: sets %ch
+; X86-NEXT: xorb %cl, %ch
+; X86-NEXT: testw %dx, %dx
+; X86-NEXT: setne %cl
+; X86-NEXT: testb %ch, %cl
+; X86-NEXT: cmovel %eax, %edi
+; X86-NEXT: movswl %di, %eax
+; X86-NEXT: cmpl $16383, %eax # imm = 0x3FFF
+; X86-NEXT: movl $16383, %ecx # imm = 0x3FFF
+; X86-NEXT: cmovll %edi, %ecx
+; X86-NEXT: movswl %cx, %eax
+; X86-NEXT: cmpl $-16384, %eax # imm = 0xC000
+; X86-NEXT: movl $49152, %eax # imm = 0xC000
+; X86-NEXT: cmovgl %ecx, %eax
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl
+ %y2 = sext i8 %y to i15
+ %y3 = shl i15 %y2, 7
+ %tmp = call i15 @llvm.sdiv.fix.sat.i15(i15 %x, i15 %y3, i32 4)
+ %tmp2 = sext i15 %tmp to i16
+ ret i16 %tmp2
+}
+
+define i4 @func4(i4 %x, i4 %y) nounwind {
+;
+; X64-LABEL: func4:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rbx
+; X64-NEXT: shlb $4, %sil
+; X64-NEXT: sarb $4, %sil
+; X64-NEXT: shlb $4, %dil
+; X64-NEXT: sarb $4, %dil
+; X64-NEXT: shlb $2, %dil
+; X64-NEXT: movsbl %dil, %ecx
+; X64-NEXT: movl %ecx, %eax
+; X64-NEXT: idivb %sil
+; X64-NEXT: movsbl %ah, %ebx
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: leal -1(%rax), %edi
+; X64-NEXT: movzbl %dil, %edi
+; X64-NEXT: testb %sil, %sil
+; X64-NEXT: sets %dl
+; X64-NEXT: testb %cl, %cl
+; X64-NEXT: sets %cl
+; X64-NEXT: xorb %dl, %cl
+; X64-NEXT: testb %bl, %bl
+; X64-NEXT: setne %dl
+; X64-NEXT: testb %cl, %dl
+; X64-NEXT: cmovel %eax, %edi
+; X64-NEXT: cmpb $7, %dil
+; X64-NEXT: movl $7, %ecx
+; X64-NEXT: cmovll %edi, %ecx
+; X64-NEXT: cmpb $-8, %cl
+; X64-NEXT: movl $248, %eax
+; X64-NEXT: cmovgl %ecx, %eax
+; X64-NEXT: # kill: def $al killed $al killed $eax
+; X64-NEXT: popq %rbx
+; X64-NEXT: retq
+;
+; X86-LABEL: func4:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movb {{[0-9]+}}(%esp), %dl
+; X86-NEXT: shlb $4, %dl
+; X86-NEXT: sarb $4, %dl
+; X86-NEXT: movb {{[0-9]+}}(%esp), %dh
+; X86-NEXT: shlb $4, %dh
+; X86-NEXT: sarb $4, %dh
+; X86-NEXT: shlb $2, %dh
+; X86-NEXT: movsbl %dh, %eax
+; X86-NEXT: idivb %dl
+; X86-NEXT: movsbl %ah, %ecx
+; X86-NEXT: movzbl %al, %esi
+; X86-NEXT: decb %al
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: testb %dl, %dl
+; X86-NEXT: sets %dl
+; X86-NEXT: testb %dh, %dh
+; X86-NEXT: sets %dh
+; X86-NEXT: xorb %dl, %dh
+; X86-NEXT: testb %cl, %cl
+; X86-NEXT: setne %cl
+; X86-NEXT: testb %dh, %cl
+; X86-NEXT: cmovel %esi, %eax
+; X86-NEXT: cmpb $7, %al
+; X86-NEXT: movl $7, %ecx
+; X86-NEXT: cmovll %eax, %ecx
+; X86-NEXT: cmpb $-8, %cl
+; X86-NEXT: movl $248, %eax
+; X86-NEXT: cmovgl %ecx, %eax
+; X86-NEXT: # kill: def $al killed $al killed $eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+ %tmp = call i4 @llvm.sdiv.fix.sat.i4(i4 %x, i4 %y, i32 2)
+ ret i4 %tmp
+}
+
+define i64 @func5(i64 %x, i64 %y) nounwind {
+;
+; X64-LABEL: func5:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rbp
+; X64-NEXT: pushq %r15
+; X64-NEXT: pushq %r14
+; X64-NEXT: pushq %r13
+; X64-NEXT: pushq %r12
+; X64-NEXT: pushq %rbx
+; X64-NEXT: subq $40, %rsp
+; X64-NEXT: movq %rdi, %r15
+; X64-NEXT: leaq (%rdi,%rdi), %rax
+; X64-NEXT: shrq $33, %rax
+; X64-NEXT: movq %rdi, %r12
+; X64-NEXT: sarq $63, %r12
+; X64-NEXT: shlq $31, %r12
+; X64-NEXT: orq %rax, %r12
+; X64-NEXT: sets {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; X64-NEXT: shlq $32, %r15
+; X64-NEXT: movq %rsi, %rdx
+; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rsi, %r13
+; X64-NEXT: sarq $63, %r13
+; X64-NEXT: movq %r15, %rdi
+; X64-NEXT: movq %r12, %rsi
+; X64-NEXT: movq %r13, %rcx
+; X64-NEXT: callq __divti3
+; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rdx, %rbp
+; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: subq $1, %rbx
+; X64-NEXT: sbbq $0, %rbp
+; X64-NEXT: testq %r13, %r13
+; X64-NEXT: sets %r14b
+; X64-NEXT: xorb {{[-0-9]+}}(%r{{[sb]}}p), %r14b # 1-byte Folded Reload
+; X64-NEXT: movq %r15, %rdi
+; X64-NEXT: movq %r12, %rsi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT: movq %r13, %rcx
+; X64-NEXT: callq __modti3
+; X64-NEXT: orq %rax, %rdx
+; X64-NEXT: setne %al
+; X64-NEXT: testb %r14b, %al
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; X64-NEXT: cmpq $-1, %rbx
+; X64-NEXT: movq $-1, %rax
+; X64-NEXT: movq $-1, %rcx
+; X64-NEXT: cmovbq %rbx, %rcx
+; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: testq %rbp, %rbp
+; X64-NEXT: cmovnsq %rax, %rbx
+; X64-NEXT: cmoveq %rcx, %rbx
+; X64-NEXT: cmovnsq %rdx, %rbp
+; X64-NEXT: testq %rbx, %rbx
+; X64-NEXT: movl $0, %ecx
+; X64-NEXT: cmovaq %rbx, %rcx
+; X64-NEXT: testq %rbp, %rbp
+; X64-NEXT: cmovnsq %rbp, %rax
+; X64-NEXT: cmovsq %rdx, %rbx
+; X64-NEXT: cmpq $-1, %rbp
+; X64-NEXT: cmoveq %rcx, %rbx
+; X64-NEXT: shrdq $1, %rax, %rbx
+; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: addq $40, %rsp
+; X64-NEXT: popq %rbx
+; X64-NEXT: popq %r12
+; X64-NEXT: popq %r13
+; X64-NEXT: popq %r14
+; X64-NEXT: popq %r15
+; X64-NEXT: popq %rbp
+; X64-NEXT: retq
+;
+; X86-LABEL: func5:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-8, %esp
+; X86-NEXT: subl $88, %esp
+; X86-NEXT: movl 8(%ebp), %ecx
+; X86-NEXT: movl 12(%ebp), %eax
+; X86-NEXT: movl 20(%ebp), %ebx
+; X86-NEXT: sarl $31, %ebx
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: sarl $31, %edi
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $31, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $31, %ecx, %eax
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll $31, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl 20(%ebp)
+; X86-NEXT: pushl 16(%ebp)
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %edx
+; X86-NEXT: pushl %esi
+; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll __divti3
+; X86-NEXT: addl $32, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: subl $1, %esi
+; X86-NEXT: sbbl $0, %edi
+; X86-NEXT: sbbl $0, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl $0, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: sets %al
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: sets %ah
+; X86-NEXT: xorb %al, %ah
+; X86-NEXT: movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pushl %edx
+; X86-NEXT: pushl %edx
+; X86-NEXT: pushl 20(%ebp)
+; X86-NEXT: pushl 16(%ebp)
+; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll __modti3
+; X86-NEXT: addl $32, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: setne %al
+; X86-NEXT: testb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: cmovsl %ebx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $-1, %ecx
+; X86-NEXT: cmovsl %esi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X86-NEXT: cmovsl %edi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: sarl $31, %edx
+; X86-NEXT: andl %eax, %edx
+; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: cmovel %ebx, %edx
+; X86-NEXT: cmpl $-1, %esi
+; X86-NEXT: movl $-1, %eax
+; X86-NEXT: cmovbl %esi, %eax
+; X86-NEXT: cmpl $2147483647, %edi # imm = 0x7FFFFFFF
+; X86-NEXT: movl $-1, %ecx
+; X86-NEXT: cmovael %ecx, %esi
+; X86-NEXT: cmovel %eax, %esi
+; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT: cmovael %eax, %edi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: testl %esi, %esi
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: cmoval %esi, %eax
+; X86-NEXT: cmpl $-2147483648, %edi # imm = 0x80000000
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: cmoval %esi, %ecx
+; X86-NEXT: cmovel %eax, %ecx
+; X86-NEXT: movl $-2147483648, %eax # imm = 0x80000000
+; X86-NEXT: cmoval %edi, %eax
+; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: movl $-2147483648, %ebx # imm = 0x80000000
+; X86-NEXT: cmovsl %ebx, %edi
+; X86-NEXT: movl $0, %ebx
+; X86-NEXT: cmovsl %ebx, %esi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: cmpl $-1, %edx
+; X86-NEXT: cmovel %ecx, %esi
+; X86-NEXT: cmovel %eax, %edi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl
+ %tmp = call i64 @llvm.sdiv.fix.sat.i64(i64 %x, i64 %y, i32 31)
+ ret i64 %tmp
+}
+
+define i18 @func6(i16 %x, i16 %y) nounwind {
+;
+; X64-LABEL: func6:
+; X64: # %bb.0:
+; X64-NEXT: movswl %di, %ecx
+; X64-NEXT: movswl %si, %esi
+; X64-NEXT: shll $7, %ecx
+; X64-NEXT: movl %ecx, %eax
+; X64-NEXT: cltd
+; X64-NEXT: idivl %esi
+; X64-NEXT: # kill: def $eax killed $eax def $rax
+; X64-NEXT: leal -1(%rax), %edi
+; X64-NEXT: testl %esi, %esi
+; X64-NEXT: sets %sil
+; X64-NEXT: testl %ecx, %ecx
+; X64-NEXT: sets %cl
+; X64-NEXT: xorb %sil, %cl
+; X64-NEXT: testl %edx, %edx
+; X64-NEXT: setne %dl
+; X64-NEXT: testb %cl, %dl
+; X64-NEXT: cmovel %eax, %edi
+; X64-NEXT: cmpl $131071, %edi # imm = 0x1FFFF
+; X64-NEXT: movl $131071, %ecx # imm = 0x1FFFF
+; X64-NEXT: cmovll %edi, %ecx
+; X64-NEXT: cmpl $-131072, %ecx # imm = 0xFFFE0000
+; X64-NEXT: movl $-131072, %eax # imm = 0xFFFE0000
+; X64-NEXT: cmovgl %ecx, %eax
+; X64-NEXT: retq
+;
+; X86-LABEL: func6:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movswl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movswl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: shll $7, %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: cltd
+; X86-NEXT: idivl %esi
+; X86-NEXT: leal -1(%eax), %edi
+; X86-NEXT: testl %esi, %esi
+; X86-NEXT: sets %bl
+; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: sets %cl
+; X86-NEXT: xorb %bl, %cl
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: setne %dl
+; X86-NEXT: testb %cl, %dl
+; X86-NEXT: cmovel %eax, %edi
+; X86-NEXT: cmpl $131071, %edi # imm = 0x1FFFF
+; X86-NEXT: movl $131071, %ecx # imm = 0x1FFFF
+; X86-NEXT: cmovll %edi, %ecx
+; X86-NEXT: cmpl $-131072, %ecx # imm = 0xFFFE0000
+; X86-NEXT: movl $-131072, %eax # imm = 0xFFFE0000
+; X86-NEXT: cmovgl %ecx, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: retl
+ %x2 = sext i16 %x to i18
+ %y2 = sext i16 %y to i18
+ %tmp = call i18 @llvm.sdiv.fix.sat.i18(i18 %x2, i18 %y2, i32 7)
+ ret i18 %tmp
+}
+
+define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
+;
+; X64-LABEL: vec:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rbp
+; X64-NEXT: pushq %r15
+; X64-NEXT: pushq %r14
+; X64-NEXT: pushq %r13
+; X64-NEXT: pushq %r12
+; X64-NEXT: pushq %rbx
+; X64-NEXT: subq $104, %rsp
+; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-NEXT: pxor %xmm2, %xmm2
+; X64-NEXT: pcmpgtd %xmm0, %xmm2
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-NEXT: paddq %xmm0, %xmm0
+; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-NEXT: movq %xmm0, %rbp
+; X64-NEXT: movq %rbp, %r12
+; X64-NEXT: shrq $33, %r12
+; X64-NEXT: movq %rbp, %r14
+; X64-NEXT: sarq $63, %r14
+; X64-NEXT: shlq $31, %r14
+; X64-NEXT: orq %r14, %r12
+; X64-NEXT: pxor %xmm0, %xmm0
+; X64-NEXT: pcmpgtd %xmm1, %xmm0
+; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-NEXT: movq %xmm1, %rdx
+; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: sarq $63, %rbx
+; X64-NEXT: shlq $31, %rbp
+; X64-NEXT: movq %rbp, %rdi
+; X64-NEXT: movq %r12, %rsi
+; X64-NEXT: movq %rbx, %rcx
+; X64-NEXT: callq __divti3
+; X64-NEXT: movq %rax, %r13
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: subq $1, %r13
+; X64-NEXT: sbbq $0, %r15
+; X64-NEXT: shrq $63, %r14
+; X64-NEXT: xorl %ebx, %r14d
+; X64-NEXT: movq %rbp, %rdi
+; X64-NEXT: movq %r12, %rsi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT: movq %rbx, %rcx
+; X64-NEXT: callq __modti3
+; X64-NEXT: orq %rax, %rdx
+; X64-NEXT: setne %al
+; X64-NEXT: testb %r14b, %al
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; X64-NEXT: movl $4294967295, %edx # imm = 0xFFFFFFFF
+; X64-NEXT: cmpq %rdx, %r13
+; X64-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF
+; X64-NEXT: cmovbq %r13, %rax
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: testq %r15, %r15
+; X64-NEXT: cmovnsq %rdx, %r13
+; X64-NEXT: cmoveq %rax, %r13
+; X64-NEXT: cmovnsq %rcx, %r15
+; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
+; X64-NEXT: cmpq %rcx, %r13
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: cmovaq %r13, %rax
+; X64-NEXT: testq %r15, %r15
+; X64-NEXT: cmovsq %rcx, %r13
+; X64-NEXT: cmpq $-1, %r15
+; X64-NEXT: cmoveq %rax, %r13
+; X64-NEXT: movq %r13, %xmm0
+; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-NEXT: pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; X64-NEXT: # xmm0 = mem[2,3,0,1]
+; X64-NEXT: movq %xmm0, %r13
+; X64-NEXT: movq %r13, %rbx
+; X64-NEXT: shrq $33, %rbx
+; X64-NEXT: movq %r13, %r14
+; X64-NEXT: sarq $63, %r14
+; X64-NEXT: shlq $31, %r14
+; X64-NEXT: orq %r14, %rbx
+; X64-NEXT: pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; X64-NEXT: # xmm0 = mem[2,3,0,1]
+; X64-NEXT: movq %xmm0, %rdx
+; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rdx, %rbp
+; X64-NEXT: sarq $63, %rbp
+; X64-NEXT: shlq $31, %r13
+; X64-NEXT: movq %r13, %rdi
+; X64-NEXT: movq %rbx, %rsi
+; X64-NEXT: movq %rbp, %rcx
+; X64-NEXT: callq __divti3
+; X64-NEXT: movq %rax, %r12
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: subq $1, %r12
+; X64-NEXT: sbbq $0, %r15
+; X64-NEXT: shrq $63, %r14
+; X64-NEXT: xorl %ebp, %r14d
+; X64-NEXT: movq %r13, %rdi
+; X64-NEXT: movq %rbx, %rsi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT: movq %rbp, %rcx
+; X64-NEXT: callq __modti3
+; X64-NEXT: orq %rax, %rdx
+; X64-NEXT: setne %al
+; X64-NEXT: testb %r14b, %al
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF
+; X64-NEXT: cmpq %rcx, %r12
+; X64-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF
+; X64-NEXT: cmovbq %r12, %rax
+; X64-NEXT: testq %r15, %r15
+; X64-NEXT: cmovnsq %rcx, %r12
+; X64-NEXT: cmoveq %rax, %r12
+; X64-NEXT: movl $0, %eax
+; X64-NEXT: cmovnsq %rax, %r15
+; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
+; X64-NEXT: cmpq %rcx, %r12
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: cmovaq %r12, %rax
+; X64-NEXT: testq %r15, %r15
+; X64-NEXT: cmovsq %rcx, %r12
+; X64-NEXT: cmpq $-1, %r15
+; X64-NEXT: cmoveq %rax, %r12
+; X64-NEXT: movq %r12, %xmm0
+; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; X64-NEXT: psrlq $1, %xmm1
+; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-NEXT: pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; X64-NEXT: # xmm1 = mem[2,3,0,1]
+; X64-NEXT: pxor %xmm0, %xmm0
+; X64-NEXT: pcmpgtd %xmm1, %xmm0
+; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X64-NEXT: paddq %xmm1, %xmm1
+; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-NEXT: movq %xmm1, %r12
+; X64-NEXT: movq %r12, %rbx
+; X64-NEXT: shrq $33, %rbx
+; X64-NEXT: movq %r12, %r14
+; X64-NEXT: sarq $63, %r14
+; X64-NEXT: shlq $31, %r14
+; X64-NEXT: orq %r14, %rbx
+; X64-NEXT: pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; X64-NEXT: # xmm1 = mem[2,3,0,1]
+; X64-NEXT: pxor %xmm0, %xmm0
+; X64-NEXT: pcmpgtd %xmm1, %xmm0
+; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-NEXT: movq %xmm1, %rdx
+; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rdx, %rbp
+; X64-NEXT: sarq $63, %rbp
+; X64-NEXT: shlq $31, %r12
+; X64-NEXT: movq %r12, %rdi
+; X64-NEXT: movq %rbx, %rsi
+; X64-NEXT: movq %rbp, %rcx
+; X64-NEXT: callq __divti3
+; X64-NEXT: movq %rax, %r13
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: subq $1, %r13
+; X64-NEXT: sbbq $0, %r15
+; X64-NEXT: shrq $63, %r14
+; X64-NEXT: xorl %ebp, %r14d
+; X64-NEXT: movq %r12, %rdi
+; X64-NEXT: movq %rbx, %rsi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT: movq %rbp, %rcx
+; X64-NEXT: callq __modti3
+; X64-NEXT: orq %rax, %rdx
+; X64-NEXT: setne %al
+; X64-NEXT: testb %r14b, %al
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF
+; X64-NEXT: cmpq %rcx, %r13
+; X64-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF
+; X64-NEXT: cmovbq %r13, %rax
+; X64-NEXT: testq %r15, %r15
+; X64-NEXT: cmovnsq %rcx, %r13
+; X64-NEXT: cmoveq %rax, %r13
+; X64-NEXT: movl $0, %eax
+; X64-NEXT: cmovnsq %rax, %r15
+; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
+; X64-NEXT: cmpq %rcx, %r13
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: cmovaq %r13, %rax
+; X64-NEXT: testq %r15, %r15
+; X64-NEXT: cmovsq %rcx, %r13
+; X64-NEXT: cmpq $-1, %r15
+; X64-NEXT: cmoveq %rax, %r13
+; X64-NEXT: movq %r13, %xmm0
+; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-NEXT: pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; X64-NEXT: # xmm0 = mem[2,3,0,1]
+; X64-NEXT: movq %xmm0, %r13
+; X64-NEXT: movq %r13, %rbx
+; X64-NEXT: shrq $33, %rbx
+; X64-NEXT: movq %r13, %r14
+; X64-NEXT: sarq $63, %r14
+; X64-NEXT: shlq $31, %r14
+; X64-NEXT: orq %r14, %rbx
+; X64-NEXT: pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; X64-NEXT: # xmm0 = mem[2,3,0,1]
+; X64-NEXT: movq %xmm0, %rdx
+; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rdx, %rbp
+; X64-NEXT: sarq $63, %rbp
+; X64-NEXT: shlq $31, %r13
+; X64-NEXT: movq %r13, %rdi
+; X64-NEXT: movq %rbx, %rsi
+; X64-NEXT: movq %rbp, %rcx
+; X64-NEXT: callq __divti3
+; X64-NEXT: movq %rax, %r12
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: subq $1, %r12
+; X64-NEXT: sbbq $0, %r15
+; X64-NEXT: shrq $63, %r14
+; X64-NEXT: xorl %ebp, %r14d
+; X64-NEXT: movq %r13, %rdi
+; X64-NEXT: movq %rbx, %rsi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT: movq %rbp, %rcx
+; X64-NEXT: callq __modti3
+; X64-NEXT: orq %rax, %rdx
+; X64-NEXT: setne %al
+; X64-NEXT: testb %r14b, %al
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF
+; X64-NEXT: cmpq %rcx, %r12
+; X64-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF
+; X64-NEXT: cmovbq %r12, %rax
+; X64-NEXT: testq %r15, %r15
+; X64-NEXT: cmovnsq %rcx, %r12
+; X64-NEXT: cmoveq %rax, %r12
+; X64-NEXT: movl $0, %eax
+; X64-NEXT: cmovnsq %rax, %r15
+; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
+; X64-NEXT: cmpq %rcx, %r12
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: cmovaq %r12, %rax
+; X64-NEXT: testq %r15, %r15
+; X64-NEXT: cmovsq %rcx, %r12
+; X64-NEXT: cmpq $-1, %r15
+; X64-NEXT: cmoveq %rax, %r12
+; X64-NEXT: movq %r12, %xmm0
+; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; X64-NEXT: psrlq $1, %xmm1
+; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X64-NEXT: addq $104, %rsp
+; X64-NEXT: popq %rbx
+; X64-NEXT: popq %r12
+; X64-NEXT: popq %r13
+; X64-NEXT: popq %r14
+; X64-NEXT: popq %r15
+; X64-NEXT: popq %rbp
+; X64-NEXT: retq
+;
+; X86-LABEL: vec:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-8, %esp
+; X86-NEXT: subl $256, %esp # imm = 0x100
+; X86-NEXT: movl 24(%ebp), %ecx
+; X86-NEXT: movl 40(%ebp), %ebx
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: sarl $31, %edx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: sarl $31, %eax
+; X86-NEXT: addl %ecx, %ecx
+; X86-NEXT: adcl %eax, %eax
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: shll $31, %eax
+; X86-NEXT: shrl %ecx
+; X86-NEXT: subl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shll $31, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: negl %edi
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: pushl %edx
+; X86-NEXT: pushl %edx
+; X86-NEXT: pushl %edx
+; X86-NEXT: pushl %ebx
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl %esi
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll __modti3
+; X86-NEXT: addl $32, %esp
+; X86-NEXT: movl 36(%ebp), %edi
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: sarl $31, %edx
+; X86-NEXT: movl 20(%ebp), %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: sarl $31, %eax
+; X86-NEXT: addl %ecx, %ecx
+; X86-NEXT: adcl %eax, %eax
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: shll $31, %eax
+; X86-NEXT: shrl %ecx
+; X86-NEXT: subl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shll $31, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: negl %ebx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: pushl %edx
+; X86-NEXT: pushl %edx
+; X86-NEXT: pushl %edx
+; X86-NEXT: pushl %edi
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl %esi
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll __modti3
+; X86-NEXT: addl $32, %esp
+; X86-NEXT: movl 28(%ebp), %edi
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: sarl $31, %ebx
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: sarl $31, %eax
+; X86-NEXT: addl %ecx, %ecx
+; X86-NEXT: adcl %eax, %eax
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: shll $31, %eax
+; X86-NEXT: shrl %ecx
+; X86-NEXT: subl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shll $31, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: negl %esi
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: pushl %esi
+; X86-NEXT: pushl %esi
+; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl %edx
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll __divti3
+; X86-NEXT: addl $32, %esp
+; X86-NEXT: movl 32(%ebp), %edx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: sarl $31, %edi
+; X86-NEXT: movl 16(%ebp), %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: sarl $31, %eax
+; X86-NEXT: addl %ecx, %ecx
+; X86-NEXT: adcl %eax, %eax
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: shll $31, %eax
+; X86-NEXT: shrl %ecx
+; X86-NEXT: subl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shll $31, %ebx
+; X86-NEXT: negl %esi
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %edx
+; X86-NEXT: pushl %esi
+; X86-NEXT: pushl %esi
+; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll __modti3
+; X86-NEXT: addl $32, %esp
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl 32(%ebp)
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: pushl %esi
+; X86-NEXT: pushl %esi
+; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll __divti3
+; X86-NEXT: addl $32, %esp
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl 40(%ebp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll __divti3
+; X86-NEXT: addl $32, %esp
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl 36(%ebp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl %ecx
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll __divti3
+; X86-NEXT: addl $32, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: subl $1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %esi, %ecx
+; X86-NEXT: sbbl $0, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl $0, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl $0, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: sets %bl
+; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: sets %bh
+; X86-NEXT: xorb %bl, %bh
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: orl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: setne %al
+; X86-NEXT: testb %bh, %al
+; X86-NEXT: cmovel %esi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: subl $1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: sbbl $0, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl $0, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: sbbl $0, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: sets %bl
+; X86-NEXT: testl %edi, %edi
+; X86-NEXT: sets %bh
+; X86-NEXT: xorb %bl, %bh
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: orl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: setne %al
+; X86-NEXT: testb %bh, %al
+; X86-NEXT: cmovel %edx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: cmovel %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: subl $1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: sbbl $0, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl $0, %eax
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl $0, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: sets %al
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: sets %bl
+; X86-NEXT: xorb %al, %bl
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl 28(%ebp)
+; X86-NEXT: pushl %edx
+; X86-NEXT: pushl %edx
+; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll __modti3
+; X86-NEXT: addl $32, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: setne %al
+; X86-NEXT: testb %bl, %al
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: cmovel %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: subl $1, %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: sbbl $0, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl $0, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl $0, %ecx
+; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: sets %bl
+; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: sets %bh
+; X86-NEXT: xorb %bl, %bh
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: orl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: setne %al
+; X86-NEXT: testb %bh, %al
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: cmovel %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: cmovsl %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $-1, %eax
+; X86-NEXT: cmovsl %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: sarl $31, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: cmovel %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: cmovsl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $-1, %eax
+; X86-NEXT: cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: sarl $31, %ebx
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: cmovel %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: cmovsl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $-1, %eax
+; X86-NEXT: cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: sarl $31, %esi
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: cmovel %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: cmovsl %eax, %edi
+; X86-NEXT: movl $-1, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: cmovsl %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: sarl $31, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: cmovel %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: cmpl $-1, %edx
+; X86-NEXT: movl $-1, %eax
+; X86-NEXT: cmovael %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: cmpl $1, %edx
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: notl %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: cmovbl %edx, %ecx
+; X86-NEXT: andl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: cmovel %ecx, %esi
+; X86-NEXT: cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: cmoval %eax, %ecx
+; X86-NEXT: cmpl $-1, %esi
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: cmovnel %edx, %ecx
+; X86-NEXT: testl %edi, %edi
+; X86-NEXT: movl $-1, %edx
+; X86-NEXT: cmovsl %edx, %esi
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: cmovsl %edx, %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: cmpl $-1, %edi
+; X86-NEXT: cmovel %ecx, %eax
+; X86-NEXT: cmovnel %esi, %edi
+; X86-NEXT: shldl $31, %eax, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: cmpl $-1, %eax
+; X86-NEXT: movl $-1, %ecx
+; X86-NEXT: cmovael %ecx, %eax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: cmpl $1, %esi
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: notl %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: testl %esi, %esi
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: cmovbl %esi, %ecx
+; X86-NEXT: andl %esi, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: cmovel %ecx, %ebx
+; X86-NEXT: cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: cmoval %eax, %ecx
+; X86-NEXT: cmpl $-1, %ebx
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: cmovnel %edi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: testl %esi, %esi
+; X86-NEXT: movl $-1, %edx
+; X86-NEXT: cmovsl %edx, %ebx
+; X86-NEXT: cmovsl %edi, %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: cmpl $-1, %esi
+; X86-NEXT: cmovel %ecx, %eax
+; X86-NEXT: cmovnel %ebx, %esi
+; X86-NEXT: shldl $31, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: cmpl $-1, %eax
+; X86-NEXT: cmovael %edx, %eax
+; X86-NEXT: movl $-1, %ebx
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: cmpl $1, %edx
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: notl %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: cmovbl %edx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: andl %edx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: cmovel %ecx, %edi
+; X86-NEXT: cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: cmoval %eax, %ecx
+; X86-NEXT: cmpl $-1, %edi
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: cmovnel %edx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: testl %esi, %esi
+; X86-NEXT: cmovsl %ebx, %edi
+; X86-NEXT: cmovsl %edx, %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: cmpl $-1, %esi
+; X86-NEXT: cmovel %ecx, %eax
+; X86-NEXT: cmovnel %edi, %esi
+; X86-NEXT: shldl $31, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: cmpl $-1, %eax
+; X86-NEXT: cmovael %ebx, %eax
+; X86-NEXT: movl $-1, %esi
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: cmpl $1, %edx
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: notl %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: cmovbl %edx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: andl %edx, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: cmovel %ecx, %ebx
+; X86-NEXT: cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: cmoval %eax, %ecx
+; X86-NEXT: cmpl $-1, %ebx
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: cmovnel %edi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: cmovsl %esi, %ebx
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: cmovsl %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: andl %edx, %ebx
+; X86-NEXT: cmpl $-1, %ebx
+; X86-NEXT: cmovel %ecx, %eax
+; X86-NEXT: cmovnel %esi, %ebx
+; X86-NEXT: shldl $31, %eax, %ebx
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %ebx, 12(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 8(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 4(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+ %tmp = call <4 x i32> @llvm.sdiv.fix.sat.v4i32(<4 x i32> %x, <4 x i32> %y, i32 31)
+ ret <4 x i32> %tmp
+}
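(A note on the long X64 sequence above: the legalized i128 path divides with `__divti3`, then calls `__modti3` and subtracts one from the quotient when the remainder is nonzero and the operand signs differ — that is, it rounds the scaled quotient toward negative infinity before clamping to the saturation bounds.)

For reference, a minimal worked example of the signed semantics these checks pin down. The operand values below are hypothetical, chosen only to illustrate scale-7 (Q8.7-style) arithmetic; they do not appear in the test itself:

```llvm
; -3.0 / 2.0 at scale 7: (-384 << 7) / 256 = -192, i.e. -1.5 at scale 7.
%q = call i16 @llvm.sdiv.fix.sat.i16(i16 -384, i16 256, i32 7)
; 128.0 / 0.5 at scale 7: (16384 << 7) / 64 = 32768, which exceeds the
; signed i16 maximum, so the result clamps to 32767 (~255.99 at scale 7).
%s = call i16 @llvm.sdiv.fix.sat.i16(i16 16384, i16 64, i32 7)
```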
diff --git a/llvm/test/CodeGen/X86/udiv_fix_sat.ll b/llvm/test/CodeGen/X86/udiv_fix_sat.ll
new file mode 100644
index 000000000000..5c15bde7cc6a
--- /dev/null
+++ b/llvm/test/CodeGen/X86/udiv_fix_sat.ll
@@ -0,0 +1,528 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686 -mattr=cmov | FileCheck %s --check-prefix=X86
+
+declare i4 @llvm.udiv.fix.sat.i4 (i4, i4, i32)
+declare i15 @llvm.udiv.fix.sat.i15 (i15, i15, i32)
+declare i16 @llvm.udiv.fix.sat.i16 (i16, i16, i32)
+declare i18 @llvm.udiv.fix.sat.i18 (i18, i18, i32)
+declare i64 @llvm.udiv.fix.sat.i64 (i64, i64, i32)
+declare <4 x i32> @llvm.udiv.fix.sat.v4i32(<4 x i32>, <4 x i32>, i32)
+
+define i16 @func(i16 %x, i16 %y) nounwind {
+; X64-LABEL: func:
+; X64: # %bb.0:
+; X64-NEXT: movzwl %si, %ecx
+; X64-NEXT: movzwl %di, %eax
+; X64-NEXT: shll $8, %eax
+; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: divl %ecx
+; X64-NEXT: cmpl $131071, %eax # imm = 0x1FFFF
+; X64-NEXT: movl $131071, %ecx # imm = 0x1FFFF
+; X64-NEXT: cmovael %ecx, %eax
+; X64-NEXT: shrl %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
+;
+; X86-LABEL: func:
+; X86: # %bb.0:
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movzwl %ax, %eax
+; X86-NEXT: shll $8, %eax
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: divl %ecx
+; X86-NEXT: cmpl $131071, %eax # imm = 0x1FFFF
+; X86-NEXT: movl $131071, %ecx # imm = 0x1FFFF
+; X86-NEXT: cmovael %ecx, %eax
+; X86-NEXT: shrl %eax
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: retl
+ %tmp = call i16 @llvm.udiv.fix.sat.i16(i16 %x, i16 %y, i32 7)
+ ret i16 %tmp
+}
+
+define i16 @func2(i8 %x, i8 %y) nounwind {
+; X64-LABEL: func2:
+; X64: # %bb.0:
+; X64-NEXT: movsbl %dil, %eax
+; X64-NEXT: andl $32767, %eax # imm = 0x7FFF
+; X64-NEXT: movsbl %sil, %ecx
+; X64-NEXT: andl $32767, %ecx # imm = 0x7FFF
+; X64-NEXT: shll $14, %eax
+; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: divl %ecx
+; X64-NEXT: cmpl $32767, %eax # imm = 0x7FFF
+; X64-NEXT: movl $32767, %ecx # imm = 0x7FFF
+; X64-NEXT: cmovbl %eax, %ecx
+; X64-NEXT: addl %ecx, %ecx
+; X64-NEXT: movswl %cx, %eax
+; X64-NEXT: shrl %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
+;
+; X86-LABEL: func2:
+; X86: # %bb.0:
+; X86-NEXT: movsbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andl $32767, %ecx # imm = 0x7FFF
+; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: andl $32767, %eax # imm = 0x7FFF
+; X86-NEXT: shll $14, %eax
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: divl %ecx
+; X86-NEXT: cmpl $32767, %eax # imm = 0x7FFF
+; X86-NEXT: movl $32767, %ecx # imm = 0x7FFF
+; X86-NEXT: cmovbl %eax, %ecx
+; X86-NEXT: addl %ecx, %ecx
+; X86-NEXT: movswl %cx, %eax
+; X86-NEXT: shrl %eax
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: retl
+ %x2 = sext i8 %x to i15
+ %y2 = sext i8 %y to i15
+ %tmp = call i15 @llvm.udiv.fix.sat.i15(i15 %x2, i15 %y2, i32 14)
+ %tmp2 = sext i15 %tmp to i16
+ ret i16 %tmp2
+}
+
+define i16 @func3(i15 %x, i8 %y) nounwind {
+; X64-LABEL: func3:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-NEXT: leal (%rdi,%rdi), %eax
+; X64-NEXT: movzbl %sil, %ecx
+; X64-NEXT: shll $4, %ecx
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: divw %cx
+; X64-NEXT: # kill: def $ax killed $ax def $eax
+; X64-NEXT: movzwl %ax, %ecx
+; X64-NEXT: cmpl $32767, %ecx # imm = 0x7FFF
+; X64-NEXT: movl $32767, %ecx # imm = 0x7FFF
+; X64-NEXT: cmovbl %eax, %ecx
+; X64-NEXT: addl %ecx, %ecx
+; X64-NEXT: movswl %cx, %eax
+; X64-NEXT: shrl %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
+;
+; X86-LABEL: func3:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: addl %eax, %eax
+; X86-NEXT: movzbl %cl, %ecx
+; X86-NEXT: shll $4, %ecx
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: divw %cx
+; X86-NEXT: # kill: def $ax killed $ax def $eax
+; X86-NEXT: movzwl %ax, %ecx
+; X86-NEXT: cmpl $32767, %ecx # imm = 0x7FFF
+; X86-NEXT: movl $32767, %ecx # imm = 0x7FFF
+; X86-NEXT: cmovbl %eax, %ecx
+; X86-NEXT: addl %ecx, %ecx
+; X86-NEXT: movswl %cx, %eax
+; X86-NEXT: shrl %eax
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: retl
+ %y2 = sext i8 %y to i15
+ %y3 = shl i15 %y2, 7
+ %tmp = call i15 @llvm.udiv.fix.sat.i15(i15 %x, i15 %y3, i32 4)
+ %tmp2 = sext i15 %tmp to i16
+ ret i16 %tmp2
+}
+
+define i4 @func4(i4 %x, i4 %y) nounwind {
+; X64-LABEL: func4:
+; X64: # %bb.0:
+; X64-NEXT: andb $15, %sil
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: shlb $2, %dil
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: divb %sil
+; X64-NEXT: movzbl %al, %ecx
+; X64-NEXT: cmpb $15, %cl
+; X64-NEXT: movl $15, %eax
+; X64-NEXT: cmovbl %ecx, %eax
+; X64-NEXT: # kill: def $al killed $al killed $eax
+; X64-NEXT: retq
+;
+; X86-LABEL: func4:
+; X86: # %bb.0:
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: andb $15, %cl
+; X86-NEXT: movb {{[0-9]+}}(%esp), %al
+; X86-NEXT: andb $15, %al
+; X86-NEXT: shlb $2, %al
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: divb %cl
+; X86-NEXT: movzbl %al, %ecx
+; X86-NEXT: cmpb $15, %al
+; X86-NEXT: movl $15, %eax
+; X86-NEXT: cmovbl %ecx, %eax
+; X86-NEXT: # kill: def $al killed $al killed $eax
+; X86-NEXT: retl
+ %tmp = call i4 @llvm.udiv.fix.sat.i4(i4 %x, i4 %y, i32 2)
+ ret i4 %tmp
+}
+
+define i64 @func5(i64 %x, i64 %y) nounwind {
+; X64-LABEL: func5:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rbx
+; X64-NEXT: movq %rsi, %rdx
+; X64-NEXT: leaq (%rdi,%rdi), %rsi
+; X64-NEXT: shrq $33, %rsi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: shrq $32, %rax
+; X64-NEXT: andl $-2147483648, %eax # imm = 0x80000000
+; X64-NEXT: orq %rax, %rsi
+; X64-NEXT: shlq $32, %rdi
+; X64-NEXT: xorl %ebx, %ebx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: callq __udivti3
+; X64-NEXT: cmpq $-1, %rax
+; X64-NEXT: movq $-1, %rcx
+; X64-NEXT: cmovbq %rax, %rcx
+; X64-NEXT: cmpq $1, %rdx
+; X64-NEXT: movl $1, %esi
+; X64-NEXT: cmovbq %rdx, %rsi
+; X64-NEXT: sbbq %rbx, %rbx
+; X64-NEXT: notq %rbx
+; X64-NEXT: orq %rax, %rbx
+; X64-NEXT: cmpq $1, %rdx
+; X64-NEXT: cmoveq %rcx, %rbx
+; X64-NEXT: shrdq $1, %rsi, %rbx
+; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: popq %rbx
+; X64-NEXT: retq
+;
+; X86-LABEL: func5:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-8, %esp
+; X86-NEXT: subl $24, %esp
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: shrl %edx
+; X86-NEXT: shldl $31, %eax, %ecx
+; X86-NEXT: shll $31, %eax
+; X86-NEXT: movl %esp, %esi
+; X86-NEXT: pushl $0
+; X86-NEXT: pushl $0
+; X86-NEXT: pushl 20(%ebp)
+; X86-NEXT: pushl 16(%ebp)
+; X86-NEXT: pushl $0
+; X86-NEXT: pushl %edx
+; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl %eax
+; X86-NEXT: pushl %esi
+; X86-NEXT: calll __udivti3
+; X86-NEXT: addl $32, %esp
+; X86-NEXT: movl (%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: cmpl $-1, %eax
+; X86-NEXT: movl $-1, %ecx
+; X86-NEXT: movl $-1, %esi
+; X86-NEXT: cmovbl %eax, %esi
+; X86-NEXT: cmpl $-1, %edx
+; X86-NEXT: cmovel %edx, %eax
+; X86-NEXT: cmovel %esi, %eax
+; X86-NEXT: cmovael %ecx, %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: orl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: cmovnel %ecx, %edx
+; X86-NEXT: cmovnel %ecx, %eax
+; X86-NEXT: leal -4(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl
+ %tmp = call i64 @llvm.udiv.fix.sat.i64(i64 %x, i64 %y, i32 31)
+ ret i64 %tmp
+}
+
+define i18 @func6(i16 %x, i16 %y) nounwind {
+; X64-LABEL: func6:
+; X64: # %bb.0:
+; X64-NEXT: movswl %di, %eax
+; X64-NEXT: andl $262143, %eax # imm = 0x3FFFF
+; X64-NEXT: movswl %si, %ecx
+; X64-NEXT: andl $262143, %ecx # imm = 0x3FFFF
+; X64-NEXT: shll $7, %eax
+; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: divl %ecx
+; X64-NEXT: cmpl $262143, %eax # imm = 0x3FFFF
+; X64-NEXT: movl $262143, %ecx # imm = 0x3FFFF
+; X64-NEXT: cmovael %ecx, %eax
+; X64-NEXT: retq
+;
+; X86-LABEL: func6:
+; X86: # %bb.0:
+; X86-NEXT: movswl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andl $262143, %ecx # imm = 0x3FFFF
+; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: andl $262143, %eax # imm = 0x3FFFF
+; X86-NEXT: shll $7, %eax
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: divl %ecx
+; X86-NEXT: cmpl $262143, %eax # imm = 0x3FFFF
+; X86-NEXT: movl $262143, %ecx # imm = 0x3FFFF
+; X86-NEXT: cmovael %ecx, %eax
+; X86-NEXT: retl
+ %x2 = sext i16 %x to i18
+ %y2 = sext i16 %y to i18
+ %tmp = call i18 @llvm.udiv.fix.sat.i18(i18 %x2, i18 %y2, i32 7)
+ ret i18 %tmp
+}
+
+define i16 @func7(i16 %x, i16 %y) nounwind {
+; X64-LABEL: func7:
+; X64: # %bb.0:
+; X64-NEXT: movzwl %si, %ecx
+; X64-NEXT: movzwl %di, %eax
+; X64-NEXT: addl %eax, %eax
+; X64-NEXT: shlq $16, %rax
+; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: divq %rcx
+; X64-NEXT: cmpq $131071, %rax # imm = 0x1FFFF
+; X64-NEXT: movl $131071, %ecx # imm = 0x1FFFF
+; X64-NEXT: cmovaeq %rcx, %rax
+; X64-NEXT: shrl %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $rax
+; X64-NEXT: retq
+;
+; X86-LABEL: func7:
+; X86: # %bb.0:
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movzwl %cx, %ecx
+; X86-NEXT: addl %ecx, %ecx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: shrl $16, %edx
+; X86-NEXT: shll $16, %ecx
+; X86-NEXT: pushl $0
+; X86-NEXT: pushl %eax
+; X86-NEXT: pushl %edx
+; X86-NEXT: pushl %ecx
+; X86-NEXT: calll __udivdi3
+; X86-NEXT: addl $16, %esp
+; X86-NEXT: cmpl $131071, %eax # imm = 0x1FFFF
+; X86-NEXT: movl $131071, %ecx # imm = 0x1FFFF
+; X86-NEXT: cmovael %ecx, %eax
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: cmovnel %ecx, %eax
+; X86-NEXT: shrl %eax
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: retl
+ %tmp = call i16 @llvm.udiv.fix.sat.i16(i16 %x, i16 %y, i32 16)
+ ret i16 %tmp
+}
+
+define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
+; X64-LABEL: vec:
+; X64: # %bb.0:
+; X64-NEXT: pxor %xmm8, %xmm8
+; X64-NEXT: movdqa %xmm1, %xmm2
+; X64-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3]
+; X64-NEXT: movq %xmm2, %rcx
+; X64-NEXT: movdqa %xmm0, %xmm4
+; X64-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm8[2],xmm4[3],xmm8[3]
+; X64-NEXT: paddq %xmm4, %xmm4
+; X64-NEXT: psllq $31, %xmm4
+; X64-NEXT: movq %xmm4, %rax
+; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: divq %rcx
+; X64-NEXT: movq %rax, %xmm7
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; X64-NEXT: movq %xmm2, %rcx
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1]
+; X64-NEXT: movq %xmm2, %rax
+; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: divq %rcx
+; X64-NEXT: movq %rax, %xmm2
+; X64-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm2[0]
+; X64-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
+; X64-NEXT: movdqa %xmm7, %xmm2
+; X64-NEXT: pxor %xmm4, %xmm2
+; X64-NEXT: movdqa {{.*#+}} xmm9 = [9223372043297226751,9223372043297226751]
+; X64-NEXT: movdqa %xmm9, %xmm6
+; X64-NEXT: pcmpgtd %xmm2, %xmm6
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2]
+; X64-NEXT: pcmpeqd %xmm9, %xmm2
+; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; X64-NEXT: pand %xmm3, %xmm5
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
+; X64-NEXT: por %xmm5, %xmm2
+; X64-NEXT: movdqa {{.*#+}} xmm6 = [8589934591,8589934591]
+; X64-NEXT: pand %xmm2, %xmm7
+; X64-NEXT: pandn %xmm6, %xmm2
+; X64-NEXT: por %xmm7, %xmm2
+; X64-NEXT: psrlq $1, %xmm2
+; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1]
+; X64-NEXT: movq %xmm1, %rcx
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
+; X64-NEXT: paddq %xmm0, %xmm0
+; X64-NEXT: psllq $31, %xmm0
+; X64-NEXT: movq %xmm0, %rax
+; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: divq %rcx
+; X64-NEXT: movq %rax, %xmm3
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X64-NEXT: movq %xmm1, %rcx
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X64-NEXT: movq %xmm0, %rax
+; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: divq %rcx
+; X64-NEXT: movq %rax, %xmm0
+; X64-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; X64-NEXT: pxor %xmm3, %xmm4
+; X64-NEXT: movdqa %xmm9, %xmm0
+; X64-NEXT: pcmpgtd %xmm4, %xmm0
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; X64-NEXT: pcmpeqd %xmm9, %xmm4
+; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; X64-NEXT: pand %xmm1, %xmm4
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-NEXT: por %xmm4, %xmm0
+; X64-NEXT: pand %xmm0, %xmm3
+; X64-NEXT: pandn %xmm6, %xmm0
+; X64-NEXT: por %xmm3, %xmm0
+; X64-NEXT: psrlq $1, %xmm0
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; X64-NEXT: retq
+;
+; X86-LABEL: vec:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: addl %ecx, %ecx
+; X86-NEXT: setb %al
+; X86-NEXT: shldl $31, %ecx, %eax
+; X86-NEXT: shll $31, %ecx
+; X86-NEXT: pushl $0
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl %eax
+; X86-NEXT: pushl %ecx
+; X86-NEXT: calll __udivdi3
+; X86-NEXT: addl $16, %esp
+; X86-NEXT: cmpl $-1, %eax
+; X86-NEXT: movl $-1, %ecx
+; X86-NEXT: cmovbl %eax, %ecx
+; X86-NEXT: cmpl $1, %edx
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: sbbl %edi, %edi
+; X86-NEXT: notl %edi
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: addl %esi, %esi
+; X86-NEXT: setb %al
+; X86-NEXT: cmpl $1, %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: cmovel %ecx, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $1, %ecx
+; X86-NEXT: cmovael %ecx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $31, %esi, %eax
+; X86-NEXT: shll $31, %esi
+; X86-NEXT: pushl $0
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl %eax
+; X86-NEXT: pushl %esi
+; X86-NEXT: calll __udivdi3
+; X86-NEXT: addl $16, %esp
+; X86-NEXT: cmpl $-1, %eax
+; X86-NEXT: movl $-1, %ecx
+; X86-NEXT: cmovbl %eax, %ecx
+; X86-NEXT: cmpl $1, %edx
+; X86-NEXT: movl $1, %esi
+; X86-NEXT: cmovbl %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %esi
+; X86-NEXT: sbbl %esi, %esi
+; X86-NEXT: notl %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: addl %edi, %edi
+; X86-NEXT: setb %al
+; X86-NEXT: cmpl $1, %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: cmovel %ecx, %esi
+; X86-NEXT: shldl $31, %edi, %eax
+; X86-NEXT: shll $31, %edi
+; X86-NEXT: pushl $0
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl %eax
+; X86-NEXT: pushl %edi
+; X86-NEXT: calll __udivdi3
+; X86-NEXT: addl $16, %esp
+; X86-NEXT: cmpl $-1, %eax
+; X86-NEXT: movl $-1, %ebx
+; X86-NEXT: cmovbl %eax, %ebx
+; X86-NEXT: cmpl $1, %edx
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: sbbl %edi, %edi
+; X86-NEXT: notl %edi
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: addl %ebp, %ebp
+; X86-NEXT: setb %cl
+; X86-NEXT: cmpl $1, %edx
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: movl $1, %edx
+; X86-NEXT: cmovael %edx, %eax
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: cmovel %ebx, %edi
+; X86-NEXT: shldl $31, %ebp, %ecx
+; X86-NEXT: shll $31, %ebp
+; X86-NEXT: pushl $0
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl %ebp
+; X86-NEXT: calll __udivdi3
+; X86-NEXT: addl $16, %esp
+; X86-NEXT: cmpl $-1, %eax
+; X86-NEXT: movl $-1, %ecx
+; X86-NEXT: cmovbl %eax, %ecx
+; X86-NEXT: cmpl $1, %edx
+; X86-NEXT: movl $1, %ebx
+; X86-NEXT: cmovbl %edx, %ebx
+; X86-NEXT: movl $0, %ebp
+; X86-NEXT: sbbl %ebp, %ebp
+; X86-NEXT: notl %ebp
+; X86-NEXT: orl %eax, %ebp
+; X86-NEXT: cmpl $1, %edx
+; X86-NEXT: cmovel %ecx, %ebp
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shrdl $1, %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shrdl $1, %eax, %esi
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: shrdl $1, %eax, %edi
+; X86-NEXT: shrdl $1, %ebx, %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %ebp, 12(%eax)
+; X86-NEXT: movl %edi, 8(%eax)
+; X86-NEXT: movl %esi, 4(%eax)
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: addl $16, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+ %tmp = call <4 x i32> @llvm.udiv.fix.sat.v4i32(<4 x i32> %x, <4 x i32> %y, i32 31)
+ ret <4 x i32> %tmp
+}
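The unsigned variant follows the same pattern but clamps only at the top of the range, as the constants in the checks above (e.g. 0x1FFFF for i16 at scale 7 before the final right shift, and 0xFFFFFFFF in @vec at scale 31) reflect. A minimal sketch of the semantics, mirroring the scale-7 call in @func; the constant operands here are hypothetical and chosen only for illustration:

```llvm
; 3.0 / 2.0 at scale 7: (384 << 7) / 256 = 192, i.e. 1.5 at scale 7.
%q = call i16 @llvm.udiv.fix.sat.i16(i16 384, i16 256, i32 7)
; 4.0 / (1/128) at scale 7: (512 << 7) / 1 = 65536, which exceeds the
; unsigned i16 maximum, so the result clamps to 65535 (~511.99 at scale 7).
%s = call i16 @llvm.udiv.fix.sat.i16(i16 512, i16 1, i32 7)
```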