[llvm] r361289 - [Intrinsic] Signed Fixed Point Saturation Multiplication Intrinsic

Leonard Chan via llvm-commits llvm-commits at lists.llvm.org
Tue May 21 12:17:19 PDT 2019


Author: leonardchan
Date: Tue May 21 12:17:19 2019
New Revision: 361289

URL: http://llvm.org/viewvc/llvm-project?rev=361289&view=rev
Log:
[Intrinsic] Signed Fixed Point Saturation Multiplication Intrinsic

Add an intrinsic that takes 2 signed integers with the scale of them provided
as the third argument and performs fixed point multiplication on them. The
result is saturated and clamped between the largest and smallest representable
values of the first 2 operands.

This is a part of implementing fixed point arithmetic in clang where some of
the more complex operations will be implemented as intrinsics.

Differential Revision: https://reviews.llvm.org/D55720
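
As a quick, hedged illustration of the new intrinsic (the i4 width and the
constant operands are chosen for this example; they mirror the LangRef
examples and the tests added below):

    ; With scale 2, the i4 value 7 represents 1.75. The true product
    ; 1.75 x 1.75 = 3.0625 exceeds the largest representable value, so the
    ; result clamps to the i4 signed maximum.
    %res = call i4 @llvm.smul.fix.sat.i4(i4 7, i4 7, i32 2)  ; %res = 7 (1.75)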

Added:
    llvm/trunk/test/CodeGen/X86/smul_fix_sat.ll
    llvm/trunk/test/CodeGen/X86/smul_fix_sat_constants.ll
Modified:
    llvm/trunk/docs/LangRef.rst
    llvm/trunk/include/llvm/CodeGen/ISDOpcodes.h
    llvm/trunk/include/llvm/CodeGen/TargetLowering.h
    llvm/trunk/include/llvm/IR/Intrinsics.td
    llvm/trunk/include/llvm/Target/TargetSelectionDAG.td
    llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
    llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
    llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
    llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
    llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
    llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
    llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp
    llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp
    llvm/trunk/lib/IR/Verifier.cpp

Modified: llvm/trunk/docs/LangRef.rst
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/docs/LangRef.rst?rev=361289&r1=361288&r2=361289&view=diff
==============================================================================
--- llvm/trunk/docs/LangRef.rst (original)
+++ llvm/trunk/docs/LangRef.rst Tue May 21 12:17:19 2019
@@ -13278,6 +13278,31 @@ are useful for representing fractional v
 following intrinsics perform fixed point arithmetic operations on 2 operands
 of the same scale, specified as the third argument.
 
+The ``llvm.*mul.fix`` family of intrinsic functions represents a
+multiplication of fixed point numbers through scaled integers, so a fixed
+point multiplication can be represented as
+
+::
+
+        %result = call i4 @llvm.smul.fix.i4(i4 %a, i4 %b, i32 %scale)
+          =>
+        %a2 = sext i4 %a to i8
+        %b2 = sext i4 %b to i8
+        %mul = mul nsw i8 %a2, %b2
+        %scale2 = trunc i32 %scale to i8
+        %r = ashr i8 %mul, %scale2  ; this is for a target rounding down towards negative infinity
+        %result = trunc i8 %r to i4
+
+For each of these functions, if the result cannot be represented exactly with
+the provided scale, the result is rounded. The rounding direction is left
+unspecified because the preferred rounding may vary between targets; instead,
+a target may specify its rounding through a target hook. Pipelines should
+legalize or optimize these operations using the rounding specified by this
+hook, if it is provided. Operations like constant folding, instruction
+combining, KnownBits, and ValueTracking should also use this hook, if
+provided, and must not assume a direction of rounding. A rounded result must
+always be within one unit of precision of the true result; that is, the error
+between the returned result and the true result must be less than 1/2^(scale).
+
 
 '``llvm.smul.fix.*``' Intrinsics
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -13398,6 +13423,76 @@ Examples
       %res = call i4 @llvm.umul.fix.i4(i4 15, i4 1, i32 1)  ; %res = 7 (or 8) (7.5 x 0.5 = 3.75)
 
 
+'``llvm.smul.fix.sat.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.smul.fix.sat``
+on any integer bit width or vectors of integers.
+
+::
+
+      declare i16 @llvm.smul.fix.sat.i16(i16 %a, i16 %b, i32 %scale)
+      declare i32 @llvm.smul.fix.sat.i32(i32 %a, i32 %b, i32 %scale)
+      declare i64 @llvm.smul.fix.sat.i64(i64 %a, i64 %b, i32 %scale)
+      declare <4 x i32> @llvm.smul.fix.sat.v4i32(<4 x i32> %a, <4 x i32> %b, i32 %scale)
+
+Overview
+"""""""""
+
+The '``llvm.smul.fix.sat``' family of intrinsic functions perform signed,
+saturating fixed point multiplication on 2 arguments of the same scale.
+
+Arguments
+""""""""""
+
+The arguments (%a and %b) and the result may be of integer types of any bit
+width, but they must have the same bit width. ``%a`` and ``%b`` are the two
+values that will undergo signed fixed point multiplication. The argument
+``%scale`` represents the scale of both operands, and must be a constant
+integer.
+
+Semantics:
+""""""""""
+
+This operation performs fixed point multiplication on the 2 arguments of a
+specified scale. The result will also be returned in the same scale specified
+in the third argument.
+
+If the result value cannot be precisely represented in the given scale, the
+value is rounded up or down to the closest representable value. The rounding
+direction is unspecified.
+
+The maximum value this operation can clamp to is the largest signed value
+representable by the bit width of the first 2 arguments. The minimum value is the
+smallest signed value representable by this bit width.
+
+
+Examples
+"""""""""
+
+.. code-block:: llvm
+
+      %res = call i4 @llvm.smul.fix.sat.i4(i4 3, i4 2, i32 0)  ; %res = 6 (2 x 3 = 6)
+      %res = call i4 @llvm.smul.fix.sat.i4(i4 3, i4 2, i32 1)  ; %res = 3 (1.5 x 1 = 1.5)
+      %res = call i4 @llvm.smul.fix.sat.i4(i4 3, i4 -2, i32 1)  ; %res = -3 (1.5 x -1 = -1.5)
+
+      ; The result in the following could be rounded up to -2 or down to -2.5
+      %res = call i4 @llvm.smul.fix.sat.i4(i4 3, i4 -3, i32 1)  ; %res = -5 (or -4) (1.5 x -1.5 = -2.25)
+
+      ; Saturation
+      %res = call i4 @llvm.smul.fix.sat.i4(i4 7, i4 2, i32 0)  ; %res = 7 (7 x 2 = 14 -> clamped to 7)
+      %res = call i4 @llvm.smul.fix.sat.i4(i4 7, i4 7, i32 2)  ; %res = 7 (1.75 x 1.75 = 3.0625 -> clamped to 1.75)
+      %res = call i4 @llvm.smul.fix.sat.i4(i4 -8, i4 7, i32 2)  ; %res = -8 (-2 x 1.75 = -3.5 -> clamped to -2)
+      %res = call i4 @llvm.smul.fix.sat.i4(i4 -8, i4 -8, i32 2)  ; %res = 7 (-2 x -2 = 4 -> clamped to 1.75)
+
+      ; Scale can affect the saturation result
+      %res = call i4 @llvm.smul.fix.sat.i4(i4 2, i4 4, i32 0)  ; %res = 7 (2 x 4 -> clamped to 7)
+      %res = call i4 @llvm.smul.fix.sat.i4(i4 2, i4 4, i32 1)  ; %res = 4 (1 x 2 = 2)
+
+
 Specialised Arithmetic Intrinsics
 ---------------------------------
 

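As a companion to the LangRef examples above, here is a hedged IR-level
sketch of what the saturating form computes for an i4 multiply at scale 2; it
reuses the widened-multiply structure from the non-saturating expansion shown
earlier, and the value names (%t, %t2, ...) are illustrative rather than part
of the patch:

    %a2 = sext i4 %a to i8
    %b2 = sext i4 %b to i8
    %mul = mul nsw i8 %a2, %b2     ; the full i4 x i4 product fits in i8
    %t = ashr i8 %mul, 2           ; rescale; rounds down towards -infinity
    ; Clamp to the representable i4 signed range [-8, 7] before truncating.
    %hi = icmp sgt i8 %t, 7
    %t2 = select i1 %hi, i8 7, i8 %t
    %lo = icmp slt i8 %t2, -8
    %t3 = select i1 %lo, i8 -8, i8 %t2
    %result = trunc i8 %t3 to i4
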
Modified: llvm/trunk/include/llvm/CodeGen/ISDOpcodes.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/ISDOpcodes.h?rev=361289&r1=361288&r2=361289&view=diff
==============================================================================
--- llvm/trunk/include/llvm/CodeGen/ISDOpcodes.h (original)
+++ llvm/trunk/include/llvm/CodeGen/ISDOpcodes.h Tue May 21 12:17:19 2019
@@ -278,6 +278,11 @@ namespace ISD {
     /// multiplication on 2 integers.
     SMULFIX, UMULFIX,
 
+    /// Same as the corresponding unsaturated fixed point instructions, but the
+    /// result is clamped between the min and max values representable by the
+    /// bits of the first 2 operands.
+    SMULFIXSAT,
+
     /// Simple binary floating point operators.
     FADD, FSUB, FMUL, FDIV, FREM,
 

Modified: llvm/trunk/include/llvm/CodeGen/TargetLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/TargetLowering.h?rev=361289&r1=361288&r2=361289&view=diff
==============================================================================
--- llvm/trunk/include/llvm/CodeGen/TargetLowering.h (original)
+++ llvm/trunk/include/llvm/CodeGen/TargetLowering.h Tue May 21 12:17:19 2019
@@ -855,6 +855,7 @@ public:
     default:
       llvm_unreachable("Unexpected fixed point operation.");
     case ISD::SMULFIX:
+    case ISD::SMULFIXSAT:
     case ISD::UMULFIX:
       Supported = isSupportedFixedPointOperation(Op, VT, Scale);
       break;

Modified: llvm/trunk/include/llvm/IR/Intrinsics.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/IR/Intrinsics.td?rev=361289&r1=361288&r2=361289&view=diff
==============================================================================
--- llvm/trunk/include/llvm/IR/Intrinsics.td (original)
+++ llvm/trunk/include/llvm/IR/Intrinsics.td Tue May 21 12:17:19 2019
@@ -874,6 +874,12 @@ def int_umul_fix : Intrinsic<[llvm_anyin
                              [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty],
                              [IntrNoMem, IntrSpeculatable, Commutative, ImmArg<2>]>;
 
+//===------------------- Fixed Point Saturation Arithmetic Intrinsics ----------------===//
+//
+def int_smul_fix_sat : Intrinsic<[llvm_anyint_ty],
+                                 [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty],
+                                 [IntrNoMem, IntrSpeculatable, Commutative, ImmArg<2>]>;
+
 //===------------------------- Memory Use Markers -------------------------===//
 //
 def int_lifetime_start  : Intrinsic<[],

Modified: llvm/trunk/include/llvm/Target/TargetSelectionDAG.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Target/TargetSelectionDAG.td?rev=361289&r1=361288&r2=361289&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Target/TargetSelectionDAG.td (original)
+++ llvm/trunk/include/llvm/Target/TargetSelectionDAG.td Tue May 21 12:17:19 2019
@@ -391,6 +391,7 @@ def ssubsat    : SDNode<"ISD::SSUBSAT"
 def usubsat    : SDNode<"ISD::USUBSAT"   , SDTIntBinOp>;
 
 def smulfix    : SDNode<"ISD::SMULFIX"   , SDTIntScaledBinOp, [SDNPCommutative]>;
+def smulfixsat : SDNode<"ISD::SMULFIXSAT", SDTIntScaledBinOp, [SDNPCommutative]>;
 def umulfix    : SDNode<"ISD::UMULFIX"   , SDTIntScaledBinOp, [SDNPCommutative]>;
 
 def sext_inreg : SDNode<"ISD::SIGN_EXTEND_INREG", SDTExtInreg>;

Modified: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp?rev=361289&r1=361288&r2=361289&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp Tue May 21 12:17:19 2019
@@ -1140,6 +1140,7 @@ void SelectionDAGLegalize::LegalizeOp(SD
     break;
   }
   case ISD::SMULFIX:
+  case ISD::SMULFIXSAT:
   case ISD::UMULFIX: {
     unsigned Scale = Node->getConstantOperandVal(2);
     Action = TLI.getFixedPointOperationAction(Node->getOpcode(),
@@ -3334,6 +3335,7 @@ bool SelectionDAGLegalize::ExpandNode(SD
     Results.push_back(TLI.expandAddSubSat(Node, DAG));
     break;
   case ISD::SMULFIX:
+  case ISD::SMULFIXSAT:
   case ISD::UMULFIX:
     Results.push_back(TLI.expandFixedPointMul(Node, DAG));
     break;

Modified: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp?rev=361289&r1=361288&r2=361289&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp Tue May 21 12:17:19 2019
@@ -149,6 +149,7 @@ void DAGTypeLegalizer::PromoteIntegerRes
   case ISD::SSUBSAT:
   case ISD::USUBSAT:     Res = PromoteIntRes_ADDSUBSAT(N); break;
   case ISD::SMULFIX:
+  case ISD::SMULFIXSAT:
   case ISD::UMULFIX:     Res = PromoteIntRes_MULFIX(N); break;
   case ISD::ABS:         Res = PromoteIntRes_ABS(N); break;
 
@@ -670,14 +671,35 @@ SDValue DAGTypeLegalizer::PromoteIntRes_
   // Can just promote the operands then continue with operation.
   SDLoc dl(N);
   SDValue Op1Promoted, Op2Promoted;
-  if (N->getOpcode() == ISD::SMULFIX) {
+  bool Signed =
+      N->getOpcode() == ISD::SMULFIX || N->getOpcode() == ISD::SMULFIXSAT;
+  if (Signed) {
     Op1Promoted = SExtPromotedInteger(N->getOperand(0));
     Op2Promoted = SExtPromotedInteger(N->getOperand(1));
   } else {
     Op1Promoted = ZExtPromotedInteger(N->getOperand(0));
     Op2Promoted = ZExtPromotedInteger(N->getOperand(1));
   }
+  EVT OldType = N->getOperand(0).getValueType();
   EVT PromotedType = Op1Promoted.getValueType();
+  unsigned DiffSize =
+      PromotedType.getScalarSizeInBits() - OldType.getScalarSizeInBits();
+
+  bool Saturating = N->getOpcode() == ISD::SMULFIXSAT;
+  if (Saturating) {
+    // Promoting the operand and result values changes the saturation width,
+    // which extends the range of values that we clamp to on saturation. This
+    // can be resolved by shifting one of the operands left by the width
+    // difference, which also shifts the result we compare against, and then
+    // shifting the result back afterwards.
+    EVT ShiftTy = TLI.getShiftAmountTy(PromotedType, DAG.getDataLayout());
+    Op1Promoted = DAG.getNode(ISD::SHL, dl, PromotedType, Op1Promoted,
+                              DAG.getConstant(DiffSize, dl, ShiftTy));
+    SDValue Result = DAG.getNode(N->getOpcode(), dl, PromotedType, Op1Promoted,
+                                 Op2Promoted, N->getOperand(2));
+    unsigned ShiftOp = Signed ? ISD::SRA : ISD::SRL;
+    return DAG.getNode(ShiftOp, dl, PromotedType, Result,
+                       DAG.getConstant(DiffSize, dl, ShiftTy));
+  }
   return DAG.getNode(N->getOpcode(), dl, PromotedType, Op1Promoted, Op2Promoted,
                      N->getOperand(2));
 }
@@ -1125,6 +1147,7 @@ bool DAGTypeLegalizer::PromoteIntegerOpe
   case ISD::PREFETCH: Res = PromoteIntOp_PREFETCH(N, OpNo); break;
 
   case ISD::SMULFIX:
+  case ISD::SMULFIXSAT:
   case ISD::UMULFIX: Res = PromoteIntOp_MULFIX(N); break;
 
   case ISD::FPOWI: Res = PromoteIntOp_FPOWI(N); break;
@@ -1688,7 +1711,9 @@ void DAGTypeLegalizer::ExpandIntegerResu
   case ISD::UADDSAT:
   case ISD::SSUBSAT:
   case ISD::USUBSAT: ExpandIntRes_ADDSUBSAT(N, Lo, Hi); break;
+
   case ISD::SMULFIX:
+  case ISD::SMULFIXSAT:
   case ISD::UMULFIX: ExpandIntRes_MULFIX(N, Lo, Hi); break;
 
   case ISD::VECREDUCE_ADD:
@@ -2712,19 +2737,40 @@ void DAGTypeLegalizer::ExpandIntRes_ADDS
   SplitInteger(Result, Lo, Hi);
 }
 
+/// This performs an expansion of the integer result for a fixed point
+/// multiplication. The default expansion rounds down towards negative
+/// infinity. Targets that care about rounding should specify a target hook
+/// for rounding and provide their own expansion or lowering of fixed point
+/// multiplication that is consistent with it.
 void DAGTypeLegalizer::ExpandIntRes_MULFIX(SDNode *N, SDValue &Lo,
                                            SDValue &Hi) {
-  assert(
-      (N->getOpcode() == ISD::SMULFIX || N->getOpcode() == ISD::UMULFIX) &&
-      "Expected operand to be signed or unsigned fixed point multiplication");
-
   SDLoc dl(N);
   EVT VT = N->getValueType(0);
+  unsigned VTSize = VT.getScalarSizeInBits();
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
   uint64_t Scale = N->getConstantOperandVal(2);
+  bool Saturating = N->getOpcode() == ISD::SMULFIXSAT;
+  EVT BoolVT =
+      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+  SDValue Zero = DAG.getConstant(0, dl, VT);
   if (!Scale) {
-    SDValue Result = DAG.getNode(ISD::MUL, dl, VT, LHS, RHS);
+    SDValue Result;
+    if (!Saturating) {
+      Result = DAG.getNode(ISD::MUL, dl, VT, LHS, RHS);
+    } else {
+      Result = DAG.getNode(ISD::SMULO, dl, DAG.getVTList(VT, BoolVT), LHS, RHS);
+      SDValue Product = Result.getValue(0);
+      SDValue Overflow = Result.getValue(1);
+
+      APInt MinVal = APInt::getSignedMinValue(VTSize);
+      APInt MaxVal = APInt::getSignedMaxValue(VTSize);
+      SDValue SatMin = DAG.getConstant(MinVal, dl, VT);
+      SDValue SatMax = DAG.getConstant(MaxVal, dl, VT);
+      SDValue ProdNeg = DAG.getSetCC(dl, BoolVT, Product, Zero, ISD::SETLT);
+      Result = DAG.getSelect(dl, VT, ProdNeg, SatMax, SatMin);
+      Result = DAG.getSelect(dl, VT, Overflow, Result, Product);
+    }
     SplitInteger(Result, Lo, Hi);
     return;
   }
@@ -2735,7 +2781,8 @@ void DAGTypeLegalizer::ExpandIntRes_MULF
   GetExpandedInteger(RHS, RL, RH);
   SmallVector<SDValue, 4> Result;
 
-  bool Signed = N->getOpcode() == ISD::SMULFIX;
+  bool Signed = (N->getOpcode() == ISD::SMULFIX ||
+                 N->getOpcode() == ISD::SMULFIXSAT);
   unsigned LoHiOp = Signed ? ISD::SMUL_LOHI : ISD::UMUL_LOHI;
   if (!TLI.expandMUL_LOHI(LoHiOp, VT, dl, LHS, RHS, Result, NVT, DAG,
                           TargetLowering::MulExpansionKind::OnlyLegalOrCustom,
@@ -2744,8 +2791,9 @@ void DAGTypeLegalizer::ExpandIntRes_MULF
     return;
   }
 
-  unsigned VTSize = VT.getScalarSizeInBits();
   unsigned NVTSize = NVT.getScalarSizeInBits();
+  assert((VTSize == NVTSize * 2) && "Expected the new value type to be half "
+                                    "the size of the current value type");
   EVT ShiftTy = TLI.getShiftAmountTy(NVT, DAG.getDataLayout());
 
   // Shift whole amount by scale.
@@ -2754,6 +2802,12 @@ void DAGTypeLegalizer::ExpandIntRes_MULF
   SDValue ResultHL = Result[2];
   SDValue ResultHH = Result[3];
 
+  SDValue SatMax, SatMin;
+  SDValue NVTZero = DAG.getConstant(0, dl, NVT);
+  SDValue NVTNeg1 = DAG.getConstant(-1, dl, NVT);
+  EVT BoolNVT =
+      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), NVT);
+
+  // After getting the multiplication result in 4 parts, we need to perform a
   // shift right by the amount of the scale to get the result in that scale.
   // Let's say we multiply 2 64 bit numbers. The resulting value can be held in
@@ -2782,11 +2836,60 @@ void DAGTypeLegalizer::ExpandIntRes_MULF
     Hi = DAG.getNode(ISD::SRL, dl, NVT, ResultLH, SRLAmnt);
     Hi = DAG.getNode(ISD::OR, dl, NVT, Hi,
                      DAG.getNode(ISD::SHL, dl, NVT, ResultHL, SHLAmnt));
+
+    // We cannot overflow past HH when multiplying 2 ints of size VTSize, so
+    // the highest bit of HH determines the saturation direction in the event
+    // of overflow.
+    // The number of overflow bits we can check is VTSize - Scale + 1 (we
+    // include the sign bit). If these top bits are > 0, then we overflowed
+    // past the max value. If these top bits are < -1, then we overflowed past
+    // the min value. Otherwise, we did not overflow.
+    if (Saturating) {
+      unsigned OverflowBits = VTSize - Scale + 1;
+      assert(OverflowBits <= VTSize && OverflowBits > NVTSize &&
+             "Extent of overflow bits must start within HL");
+      SDValue HLHiMask = DAG.getConstant(
+          APInt::getHighBitsSet(NVTSize, OverflowBits - NVTSize), dl, NVT);
+      SDValue HLLoMask = DAG.getConstant(
+          APInt::getLowBitsSet(NVTSize, VTSize - OverflowBits), dl, NVT);
+
+      // HH > 0 or HH == 0 && HL > HLLoMask
+      SDValue HHPos = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTZero, ISD::SETGT);
+      SDValue HHZero = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTZero, ISD::SETEQ);
+      SDValue HLPos =
+          DAG.getSetCC(dl, BoolNVT, ResultHL, HLLoMask, ISD::SETUGT);
+      SatMax = DAG.getNode(ISD::OR, dl, BoolNVT, HHPos,
+                           DAG.getNode(ISD::AND, dl, BoolNVT, HHZero, HLPos));
+
+      // HH < -1 or HH == -1 && HL < HLHiMask
+      SDValue HHNeg = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTNeg1, ISD::SETLT);
+      SDValue HHNeg1 = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTNeg1, ISD::SETEQ);
+      SDValue HLNeg =
+          DAG.getSetCC(dl, BoolNVT, ResultHL, HLHiMask, ISD::SETULT);
+      SatMin = DAG.getNode(ISD::OR, dl, BoolNVT, HHNeg,
+                           DAG.getNode(ISD::AND, dl, BoolNVT, HHNeg1, HLNeg));
+    }
   } else if (Scale == NVTSize) {
     // If the scales are equal, Lo and Hi are ResultLH and ResultHL,
     // respectively. Avoid shifting to prevent undefined behavior.
     Lo = ResultLH;
     Hi = ResultHL;
+
+    // We overflow max if HH > 0 or HH == 0 && HL sign is negative.
+    // We overflow min if HH < -1 or HH == -1 && HL sign is 0.
+    if (Saturating) {
+      SDValue HHPos = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTZero, ISD::SETGT);
+      SDValue HHZero = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTZero, ISD::SETEQ);
+      SDValue HLNeg = DAG.getSetCC(dl, BoolNVT, ResultHL, NVTZero, ISD::SETLT);
+      SatMax = DAG.getNode(ISD::OR, dl, BoolNVT, HHPos,
+                           DAG.getNode(ISD::AND, dl, BoolNVT, HHZero, HLNeg));
+
+      SDValue HHNeg = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTNeg1, ISD::SETLT);
+      SDValue HHNeg1 = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTNeg1, ISD::SETEQ);
+      SDValue HLPos = DAG.getSetCC(dl, BoolNVT, ResultHL, NVTZero, ISD::SETGT);
+      SatMin = DAG.getNode(ISD::OR, dl, BoolNVT, HHNeg,
+                           DAG.getNode(ISD::AND, dl, BoolNVT, HHNeg1, HLPos));
+    }
   } else if (Scale < VTSize) {
     // If the scale is instead less than the old VT size, but greater than or
     // equal to the expanded VT size, the first part of the result (ResultLL) is
@@ -2801,6 +2904,19 @@ void DAGTypeLegalizer::ExpandIntRes_MULF
     Hi = DAG.getNode(ISD::SRL, dl, NVT, ResultHL, SRLAmnt);
     Hi = DAG.getNode(ISD::OR, dl, NVT, Hi,
                      DAG.getNode(ISD::SHL, dl, NVT, ResultHH, SHLAmnt));
+
+    // This is similar to the saturation case when Scale < NVTSize, but here
+    // we only need to check HH.
+    if (Saturating) {
+      unsigned OverflowBits = VTSize - Scale + 1;
+      SDValue HHHiMask = DAG.getConstant(
+          APInt::getHighBitsSet(NVTSize, OverflowBits), dl, NVT);
+      SDValue HHLoMask = DAG.getConstant(
+          APInt::getLowBitsSet(NVTSize, NVTSize - OverflowBits), dl, NVT);
+
+      SatMax = DAG.getSetCC(dl, BoolNVT, ResultHH, HHLoMask, ISD::SETGT);
+      SatMin = DAG.getSetCC(dl, BoolNVT, ResultHH, HHHiMask, ISD::SETLT);
+    }
   } else if (Scale == VTSize) {
     assert(
         !Signed &&
@@ -2812,6 +2928,16 @@ void DAGTypeLegalizer::ExpandIntRes_MULF
     llvm_unreachable("Expected the scale to be less than or equal to the width "
                      "of the operands");
   }
+
+  if (Saturating) {
+    APInt LHMax = APInt::getSignedMaxValue(NVTSize);
+    APInt LLMax = APInt::getAllOnesValue(NVTSize);
+    APInt LHMin = APInt::getSignedMinValue(NVTSize);
+    Hi = DAG.getSelect(dl, NVT, SatMax, DAG.getConstant(LHMax, dl, NVT), Hi);
+    Hi = DAG.getSelect(dl, NVT, SatMin, DAG.getConstant(LHMin, dl, NVT), Hi);
+    Lo = DAG.getSelect(dl, NVT, SatMax, DAG.getConstant(LLMax, dl, NVT), Lo);
+    Lo = DAG.getSelect(dl, NVT, SatMin, NVTZero, Lo);
+  }
 }
 
 void DAGTypeLegalizer::ExpandIntRes_SADDSUBO(SDNode *Node,

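For readers following PromoteIntRes_MULFIX above, a hedged IR-level sketch of
the promotion trick (the i4-to-i8 widths and the scale of 2 are illustrative):
pre-shifting one operand left by the width difference makes the wider type's
saturation bounds coincide with the original type's bounds, and an arithmetic
shift right afterwards undoes the extra scaling.

    ; DiffSize = 8 - 4 = 4. Shifting %a2 left by 4 multiplies the product
    ; by 2^4, so clamping at the i8 bounds corresponds exactly to clamping
    ; at the bounds of the original i4 operation.
    %a2 = sext i4 %a to i8
    %b2 = sext i4 %b to i8
    %a3 = shl i8 %a2, 4
    %wide = call i8 @llvm.smul.fix.sat.i8(i8 %a3, i8 %b2, i32 2)
    %r8 = ashr i8 %wide, 4         ; signed op, so shift back arithmetically
    %r = trunc i8 %r8 to i4
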
Modified: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp?rev=361289&r1=361288&r2=361289&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp Tue May 21 12:17:19 2019
@@ -438,6 +438,7 @@ SDValue VectorLegalizer::LegalizeOp(SDVa
     Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
     break;
   case ISD::SMULFIX:
+  case ISD::SMULFIXSAT:
   case ISD::UMULFIX: {
     unsigned Scale = Node->getConstantOperandVal(2);
     Action = TLI.getFixedPointOperationAction(Node->getOpcode(),

Modified: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp?rev=361289&r1=361288&r2=361289&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp Tue May 21 12:17:19 2019
@@ -183,6 +183,7 @@ void DAGTypeLegalizer::ScalarizeVectorRe
     R = ScalarizeVecRes_OverflowOp(N, ResNo);
     break;
   case ISD::SMULFIX:
+  case ISD::SMULFIXSAT:
   case ISD::UMULFIX:
     R = ScalarizeVecRes_MULFIX(N);
     break;
@@ -971,6 +972,7 @@ void DAGTypeLegalizer::SplitVectorResult
     SplitVecRes_OverflowOp(N, ResNo, Lo, Hi);
     break;
   case ISD::SMULFIX:
+  case ISD::SMULFIXSAT:
   case ISD::UMULFIX:
     SplitVecRes_MULFIX(N, Lo, Hi);
     break;

Modified: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp?rev=361289&r1=361288&r2=361289&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp Tue May 21 12:17:19 2019
@@ -6298,6 +6298,14 @@ void SelectionDAGBuilder::visitIntrinsic
                              Op1.getValueType(), Op1, Op2, Op3));
     return;
   }
+  case Intrinsic::smul_fix_sat: {
+    SDValue Op1 = getValue(I.getArgOperand(0));
+    SDValue Op2 = getValue(I.getArgOperand(1));
+    SDValue Op3 = getValue(I.getArgOperand(2));
+    setValue(&I, DAG.getNode(ISD::SMULFIXSAT, sdl, Op1.getValueType(), Op1, Op2,
+                             Op3));
+    return;
+  }
   case Intrinsic::stacksave: {
     SDValue Op = getRoot();
     Res = DAG.getNode(

Modified: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp?rev=361289&r1=361288&r2=361289&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp Tue May 21 12:17:19 2019
@@ -301,7 +301,9 @@ std::string SDNode::getOperationName(con
   case ISD::UADDSAT:                    return "uaddsat";
   case ISD::SSUBSAT:                    return "ssubsat";
   case ISD::USUBSAT:                    return "usubsat";
+
   case ISD::SMULFIX:                    return "smulfix";
+  case ISD::SMULFIXSAT:                 return "smulfixsat";
   case ISD::UMULFIX:                    return "umulfix";
 
   // Conversion operators.

Modified: llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp?rev=361289&r1=361288&r2=361289&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp Tue May 21 12:17:19 2019
@@ -5695,25 +5695,42 @@ SDValue TargetLowering::expandAddSubSat(
 SDValue
 TargetLowering::expandFixedPointMul(SDNode *Node, SelectionDAG &DAG) const {
   assert((Node->getOpcode() == ISD::SMULFIX ||
-          Node->getOpcode() == ISD::UMULFIX) &&
-         "Expected opcode to be SMULFIX or UMULFIX.");
+          Node->getOpcode() == ISD::UMULFIX ||
+          Node->getOpcode() == ISD::SMULFIXSAT) &&
+         "Expected a fixed point multiplication opcode");
 
   SDLoc dl(Node);
   SDValue LHS = Node->getOperand(0);
   SDValue RHS = Node->getOperand(1);
   EVT VT = LHS.getValueType();
   unsigned Scale = Node->getConstantOperandVal(2);
+  bool Saturating = Node->getOpcode() == ISD::SMULFIXSAT;
+  EVT BoolVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+  unsigned VTSize = VT.getScalarSizeInBits();
 
-  // [us]mul.fix(a, b, 0) -> mul(a, b)
   if (!Scale) {
-    if (VT.isVector() && !isOperationLegalOrCustom(ISD::MUL, VT))
-      return SDValue();
-    return DAG.getNode(ISD::MUL, dl, VT, LHS, RHS);
+    // [us]mul.fix(a, b, 0) -> mul(a, b)
+    if (!Saturating && isOperationLegalOrCustom(ISD::MUL, VT)) {
+      return DAG.getNode(ISD::MUL, dl, VT, LHS, RHS);
+    } else if (Saturating && isOperationLegalOrCustom(ISD::SMULO, VT)) {
+      SDValue Result =
+          DAG.getNode(ISD::SMULO, dl, DAG.getVTList(VT, BoolVT), LHS, RHS);
+      SDValue Product = Result.getValue(0);
+      SDValue Overflow = Result.getValue(1);
+      SDValue Zero = DAG.getConstant(0, dl, VT);
+
+      APInt MinVal = APInt::getSignedMinValue(VTSize);
+      APInt MaxVal = APInt::getSignedMaxValue(VTSize);
+      SDValue SatMin = DAG.getConstant(MinVal, dl, VT);
+      SDValue SatMax = DAG.getConstant(MaxVal, dl, VT);
+      SDValue ProdNeg = DAG.getSetCC(dl, BoolVT, Product, Zero, ISD::SETLT);
+      Result = DAG.getSelect(dl, VT, ProdNeg, SatMax, SatMin);
+      return DAG.getSelect(dl, VT, Overflow, Result, Product);
+    }
   }
 
-  unsigned VTSize = VT.getScalarSizeInBits();
-  bool Signed = Node->getOpcode() == ISD::SMULFIX;
-
+  bool Signed =
+      Node->getOpcode() == ISD::SMULFIX || Node->getOpcode() == ISD::SMULFIXSAT;
   assert(((Signed && Scale < VTSize) || (!Signed && Scale <= VTSize)) &&
          "Expected scale to be less than the number of bits if signed or at "
          "most the number of bits if unsigned.");
@@ -5746,8 +5763,25 @@ TargetLowering::expandFixedPointMul(SDNo
   // are scaled. The result is given to us in 2 halves, so we only want part of
   // both in the result.
   EVT ShiftTy = getShiftAmountTy(VT, DAG.getDataLayout());
-  return DAG.getNode(ISD::FSHR, dl, VT, Hi, Lo,
-                     DAG.getConstant(Scale, dl, ShiftTy));
+  SDValue Result = DAG.getNode(ISD::FSHR, dl, VT, Hi, Lo,
+                               DAG.getConstant(Scale, dl, ShiftTy));
+  if (!Saturating)
+    return Result;
+
+  unsigned OverflowBits = VTSize - Scale + 1; // +1 for the sign
+  SDValue HiMask =
+      DAG.getConstant(APInt::getHighBitsSet(VTSize, OverflowBits), dl, VT);
+  SDValue LoMask = DAG.getConstant(
+      APInt::getLowBitsSet(VTSize, VTSize - OverflowBits), dl, VT);
+  APInt MaxVal = APInt::getSignedMaxValue(VTSize);
+  APInt MinVal = APInt::getSignedMinValue(VTSize);
+
+  Result = DAG.getSelectCC(dl, Hi, LoMask,
+                           DAG.getConstant(MaxVal, dl, VT), Result,
+                           ISD::SETGT);
+  return DAG.getSelectCC(dl, Hi, HiMask,
+                         DAG.getConstant(MinVal, dl, VT), Result,
+                         ISD::SETLT);
 }
 
 void TargetLowering::expandUADDSUBO(

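A hedged IR-level sketch of the two expandFixedPointMul paths above, shown
for i32 with illustrative value names: the zero-scale saturating case goes
through smul.with.overflow, and the scaled case goes through a widened
multiply plus a funnel shift, with the clamp driven by the high half of the
product.

    ; Scale == 0: the overflow flag drives the clamp, and the sign of the
    ; truncated product picks the saturation direction.
    %m = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 %x, i32 %y)
    %prod = extractvalue { i32, i1 } %m, 0
    %ovf = extractvalue { i32, i1 } %m, 1
    %neg = icmp slt i32 %prod, 0
    %sat = select i1 %neg, i32 2147483647, i32 -2147483648
    %r0 = select i1 %ovf, i32 %sat, i32 %prod

    ; Scale == 2: fshr(Hi, Lo, 2) performs the rescale. OverflowBits =
    ; 32 - 2 + 1 = 31, so the result saturates when Hi > 1 or Hi < -2
    ; (compare the cmpl $1 / cmpl $-2 CHECK lines in the new X86 tests).
    %xw = sext i32 %x to i64
    %yw = sext i32 %y to i64
    %p = mul nsw i64 %xw, %yw
    %pw = lshr i64 %p, 32
    %hi = trunc i64 %pw to i32     ; the MULHS half
    %lo = trunc i64 %p to i32      ; the MUL half
    %shifted = call i32 @llvm.fshr.i32(i32 %hi, i32 %lo, i32 2)
    %gtmax = icmp sgt i32 %hi, 1
    %r1 = select i1 %gtmax, i32 2147483647, i32 %shifted
    %ltmin = icmp slt i32 %hi, -2
    %r2 = select i1 %ltmin, i32 -2147483648, i32 %r1
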
Modified: llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp?rev=361289&r1=361288&r2=361289&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp (original)
+++ llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp Tue May 21 12:17:19 2019
@@ -623,6 +623,7 @@ void TargetLoweringBase::initActions() {
     setOperationAction(ISD::SSUBSAT, VT, Expand);
     setOperationAction(ISD::USUBSAT, VT, Expand);
     setOperationAction(ISD::SMULFIX, VT, Expand);
+    setOperationAction(ISD::SMULFIXSAT, VT, Expand);
     setOperationAction(ISD::UMULFIX, VT, Expand);
 
     // Overflow operations default to expand

Modified: llvm/trunk/lib/IR/Verifier.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/IR/Verifier.cpp?rev=361289&r1=361288&r2=361289&view=diff
==============================================================================
--- llvm/trunk/lib/IR/Verifier.cpp (original)
+++ llvm/trunk/lib/IR/Verifier.cpp Tue May 21 12:17:19 2019
@@ -4595,27 +4595,28 @@ void Verifier::visitIntrinsicCall(Intrin
     break;
   }
   case Intrinsic::smul_fix:
+  case Intrinsic::smul_fix_sat:
   case Intrinsic::umul_fix: {
     Value *Op1 = Call.getArgOperand(0);
     Value *Op2 = Call.getArgOperand(1);
     Assert(Op1->getType()->isIntOrIntVectorTy(),
-           "first operand of [us]mul_fix must be an int type or vector "
+           "first operand of [us]mul_fix[_sat] must be an int type or vector "
            "of ints");
     Assert(Op2->getType()->isIntOrIntVectorTy(),
-           "second operand of [us]mul_fix must be an int type or vector "
+           "second operand of [us]mul_fix_[sat] must be an int type or vector "
            "of ints");
 
     auto *Op3 = cast<ConstantInt>(Call.getArgOperand(2));
     Assert(Op3->getType()->getBitWidth() <= 32,
-           "third argument of [us]mul_fix must fit within 32 bits");
+           "third argument of [us]mul_fix[_sat] must fit within 32 bits");
 
-    if (ID == Intrinsic::smul_fix) {
+    if (ID == Intrinsic::smul_fix || ID == Intrinsic::smul_fix_sat) {
       Assert(
           Op3->getZExtValue() < Op1->getType()->getScalarSizeInBits(),
-          "the scale of smul_fix must be less than the width of the operands");
+          "the scale of smul_fix[_sat] must be less than the width of the operands");
     } else {
       Assert(Op3->getZExtValue() <= Op1->getType()->getScalarSizeInBits(),
-             "the scale of umul_fix must be less than or equal to the width of "
+             "the scale of umul_fix[_sat] must be less than or equal to the width of "
              "the operands");
     }
     break;

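To make the new verifier rule concrete, a hedged illustration (the i32 width
and the scale value are chosen for the example): for the signed intrinsics
the scale must be strictly less than the operand width, so the verifier now
rejects a call such as

    ; Invalid: the scale (32) is not less than the 32-bit operand width;
    ; the verifier reports "the scale of smul_fix[_sat] must be less than
    ; the width of the operands".
    %bad = call i32 @llvm.smul.fix.sat.i32(i32 %x, i32 %y, i32 32)
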
Added: llvm/trunk/test/CodeGen/X86/smul_fix_sat.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/smul_fix_sat.ll?rev=361289&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/smul_fix_sat.ll (added)
+++ llvm/trunk/test/CodeGen/X86/smul_fix_sat.ll Tue May 21 12:17:19 2019
@@ -0,0 +1,739 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686 -mattr=cmov | FileCheck %s --check-prefix=X86
+
+declare  i4  @llvm.smul.fix.sat.i4   (i4,  i4, i32)
+declare  i32 @llvm.smul.fix.sat.i32  (i32, i32, i32)
+declare  i64 @llvm.smul.fix.sat.i64  (i64, i64, i32)
+declare  <4 x i32> @llvm.smul.fix.sat.v4i32(<4 x i32>, <4 x i32>, i32)
+
+define i32 @func(i32 %x, i32 %y) nounwind {
+; X64-LABEL: func:
+; X64:       # %bb.0:
+; X64-NEXT:    movslq %esi, %rax
+; X64-NEXT:    movslq %edi, %rcx
+; X64-NEXT:    imulq %rax, %rcx
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    shrq $32, %rax
+; X64-NEXT:    shrdl $2, %eax, %ecx
+; X64-NEXT:    cmpl $1, %eax
+; X64-NEXT:    movl $2147483647, %edx # imm = 0x7FFFFFFF
+; X64-NEXT:    cmovlel %ecx, %edx
+; X64-NEXT:    cmpl $-2, %eax
+; X64-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
+; X64-NEXT:    cmovgel %edx, %eax
+; X64-NEXT:    retq
+;
+; X86-LABEL: func:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull {{[0-9]+}}(%esp)
+; X86-NEXT:    shrdl $2, %edx, %eax
+; X86-NEXT:    cmpl $1, %edx
+; X86-NEXT:    movl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X86-NEXT:    cmovgl %ecx, %eax
+; X86-NEXT:    cmpl $-2, %edx
+; X86-NEXT:    movl $-2147483648, %ecx # imm = 0x80000000
+; X86-NEXT:    cmovll %ecx, %eax
+; X86-NEXT:    retl
+  %tmp = call i32 @llvm.smul.fix.sat.i32(i32 %x, i32 %y, i32 2);
+  ret i32 %tmp;
+}
+
+define i64 @func2(i64 %x, i64 %y) nounwind {
+; X64-LABEL: func2:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    imulq %rsi
+; X64-NEXT:    shrdq $2, %rdx, %rax
+; X64-NEXT:    cmpq $1, %rdx
+; X64-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
+; X64-NEXT:    cmovgq %rcx, %rax
+; X64-NEXT:    cmpq $-2, %rdx
+; X64-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; X64-NEXT:    cmovlq %rcx, %rax
+; X64-NEXT:    retq
+;
+; X86-LABEL: func2:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    imull %esi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    adcl %edi, %edx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    sbbl $0, %edi
+; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    cmovnsl %ecx, %edi
+; X86-NEXT:    cmovnsl %edx, %esi
+; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %edi, %ebp
+; X86-NEXT:    sbbl $0, %ebp
+; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    cmovnsl %edi, %ebp
+; X86-NEXT:    cmovnsl %esi, %ecx
+; X86-NEXT:    testl %ebp, %ebp
+; X86-NEXT:    setg %bh
+; X86-NEXT:    sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    cmpl $1, %ecx
+; X86-NEXT:    seta %bl
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    shldl $30, %eax, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $30, %esi, %eax
+; X86-NEXT:    andb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload
+; X86-NEXT:    orb %bh, %bl
+; X86-NEXT:    testb %bl, %bl
+; X86-NEXT:    movl $2147483647, %esi # imm = 0x7FFFFFFF
+; X86-NEXT:    cmovnel %esi, %edx
+; X86-NEXT:    movl $-1, %esi
+; X86-NEXT:    cmovnel %esi, %eax
+; X86-NEXT:    cmpl $-1, %ebp
+; X86-NEXT:    setl %bl
+; X86-NEXT:    sete %bh
+; X86-NEXT:    cmpl $-2, %ecx
+; X86-NEXT:    setb %cl
+; X86-NEXT:    andb %bh, %cl
+; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    orb %bl, %cl
+; X86-NEXT:    cmovnel %esi, %eax
+; X86-NEXT:    movl $-2147483648, %ecx # imm = 0x80000000
+; X86-NEXT:    cmovnel %ecx, %edx
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+  %tmp = call i64 @llvm.smul.fix.sat.i64(i64 %x, i64 %y, i32 2);
+  ret i64 %tmp;
+}
+
+define i4 @func3(i4 %x, i4 %y) nounwind {
+; X64-LABEL: func3:
+; X64:       # %bb.0:
+; X64-NEXT:    shlb $4, %sil
+; X64-NEXT:    sarb $4, %sil
+; X64-NEXT:    shlb $4, %dil
+; X64-NEXT:    movsbl %dil, %eax
+; X64-NEXT:    movsbl %sil, %ecx
+; X64-NEXT:    imull %eax, %ecx
+; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    shrb $2, %al
+; X64-NEXT:    shrl $8, %ecx
+; X64-NEXT:    movl %ecx, %edx
+; X64-NEXT:    shlb $6, %dl
+; X64-NEXT:    orb %al, %dl
+; X64-NEXT:    movzbl %dl, %eax
+; X64-NEXT:    cmpb $1, %cl
+; X64-NEXT:    movl $127, %edx
+; X64-NEXT:    cmovlel %eax, %edx
+; X64-NEXT:    cmpb $-2, %cl
+; X64-NEXT:    movl $128, %eax
+; X64-NEXT:    cmovgel %edx, %eax
+; X64-NEXT:    sarb $4, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+;
+; X86-LABEL: func3:
+; X86:       # %bb.0:
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    shlb $4, %al
+; X86-NEXT:    sarb $4, %al
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    shlb $4, %cl
+; X86-NEXT:    movsbl %cl, %ecx
+; X86-NEXT:    movsbl %al, %eax
+; X86-NEXT:    imull %ecx, %eax
+; X86-NEXT:    movb %ah, %cl
+; X86-NEXT:    shlb $6, %cl
+; X86-NEXT:    shrb $2, %al
+; X86-NEXT:    orb %cl, %al
+; X86-NEXT:    movzbl %al, %ecx
+; X86-NEXT:    cmpb $1, %ah
+; X86-NEXT:    movl $127, %edx
+; X86-NEXT:    cmovlel %ecx, %edx
+; X86-NEXT:    cmpb $-2, %ah
+; X86-NEXT:    movl $128, %eax
+; X86-NEXT:    cmovgel %edx, %eax
+; X86-NEXT:    sarb $4, %al
+; X86-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NEXT:    retl
+  %tmp = call i4 @llvm.smul.fix.sat.i4(i4 %x, i4 %y, i32 2);
+  ret i4 %tmp;
+}
+
+define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
+; X64-LABEL: vec:
+; X64:       # %bb.0:
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; X64-NEXT:    movd %xmm2, %eax
+; X64-NEXT:    cltq
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; X64-NEXT:    movd %xmm2, %ecx
+; X64-NEXT:    movslq %ecx, %rdx
+; X64-NEXT:    imulq %rax, %rdx
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    shrq $32, %rcx
+; X64-NEXT:    shrdl $2, %ecx, %edx
+; X64-NEXT:    cmpl $1, %ecx
+; X64-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    cmovgl %eax, %edx
+; X64-NEXT:    cmpl $-2, %ecx
+; X64-NEXT:    movl $-2147483648, %ecx # imm = 0x80000000
+; X64-NEXT:    cmovll %ecx, %edx
+; X64-NEXT:    movd %edx, %xmm2
+; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; X64-NEXT:    movd %xmm3, %edx
+; X64-NEXT:    movslq %edx, %rdx
+; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; X64-NEXT:    movd %xmm3, %esi
+; X64-NEXT:    movslq %esi, %rsi
+; X64-NEXT:    imulq %rdx, %rsi
+; X64-NEXT:    movq %rsi, %rdx
+; X64-NEXT:    shrq $32, %rdx
+; X64-NEXT:    shrdl $2, %edx, %esi
+; X64-NEXT:    cmpl $1, %edx
+; X64-NEXT:    cmovgl %eax, %esi
+; X64-NEXT:    cmpl $-2, %edx
+; X64-NEXT:    cmovll %ecx, %esi
+; X64-NEXT:    movd %esi, %xmm3
+; X64-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; X64-NEXT:    movd %xmm1, %edx
+; X64-NEXT:    movslq %edx, %rdx
+; X64-NEXT:    movd %xmm0, %esi
+; X64-NEXT:    movslq %esi, %rsi
+; X64-NEXT:    imulq %rdx, %rsi
+; X64-NEXT:    movq %rsi, %rdx
+; X64-NEXT:    shrq $32, %rdx
+; X64-NEXT:    shrdl $2, %edx, %esi
+; X64-NEXT:    cmpl $1, %edx
+; X64-NEXT:    cmovgl %eax, %esi
+; X64-NEXT:    cmpl $-2, %edx
+; X64-NEXT:    cmovll %ecx, %esi
+; X64-NEXT:    movd %esi, %xmm2
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; X64-NEXT:    movd %xmm1, %edx
+; X64-NEXT:    movslq %edx, %rdx
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X64-NEXT:    movd %xmm0, %esi
+; X64-NEXT:    movslq %esi, %rsi
+; X64-NEXT:    imulq %rdx, %rsi
+; X64-NEXT:    movq %rsi, %rdx
+; X64-NEXT:    shrq $32, %rdx
+; X64-NEXT:    shrdl $2, %edx, %esi
+; X64-NEXT:    cmpl $1, %edx
+; X64-NEXT:    cmovgl %eax, %esi
+; X64-NEXT:    cmpl $-2, %edx
+; X64-NEXT:    cmovll %ecx, %esi
+; X64-NEXT:    movd %esi, %xmm0
+; X64-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X64-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; X64-NEXT:    movdqa %xmm2, %xmm0
+; X64-NEXT:    retq
+;
+; X86-LABEL: vec:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    shrdl $2, %edx, %ecx
+; X86-NEXT:    cmpl $1, %edx
+; X86-NEXT:    movl $2147483647, %ebp # imm = 0x7FFFFFFF
+; X86-NEXT:    cmovgl %ebp, %ecx
+; X86-NEXT:    cmpl $-2, %edx
+; X86-NEXT:    movl $-2147483648, %esi # imm = 0x80000000
+; X86-NEXT:    cmovll %esi, %ecx
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    imull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    shrdl $2, %edx, %edi
+; X86-NEXT:    cmpl $1, %edx
+; X86-NEXT:    cmovgl %ebp, %edi
+; X86-NEXT:    cmpl $-2, %edx
+; X86-NEXT:    cmovll %esi, %edi
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    imull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    shrdl $2, %edx, %ebx
+; X86-NEXT:    cmpl $1, %edx
+; X86-NEXT:    cmovgl %ebp, %ebx
+; X86-NEXT:    cmpl $-2, %edx
+; X86-NEXT:    cmovll %esi, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull {{[0-9]+}}(%esp)
+; X86-NEXT:    shrdl $2, %edx, %eax
+; X86-NEXT:    cmpl $1, %edx
+; X86-NEXT:    cmovgl %ebp, %eax
+; X86-NEXT:    cmpl $-2, %edx
+; X86-NEXT:    cmovll %esi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %eax, 12(%edx)
+; X86-NEXT:    movl %ebx, 8(%edx)
+; X86-NEXT:    movl %edi, 4(%edx)
+; X86-NEXT:    movl %ecx, (%edx)
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl $4
+  %tmp = call <4 x i32> @llvm.smul.fix.sat.v4i32(<4 x i32> %x, <4 x i32> %y, i32 2);
+  ret <4 x i32> %tmp;
+}
+
+; These result in regular integer multiplication
+define i32 @func4(i32 %x, i32 %y) nounwind {
+; X64-LABEL: func4:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %ecx
+; X64-NEXT:    imull %esi, %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    testl %ecx, %ecx
+; X64-NEXT:    setns %al
+; X64-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    imull %esi, %edi
+; X64-NEXT:    cmovnol %edi, %eax
+; X64-NEXT:    retq
+;
+; X86-LABEL: func4:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    imull %edx, %esi
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    setns %cl
+; X86-NEXT:    addl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X86-NEXT:    imull %edx, %eax
+; X86-NEXT:    cmovol %ecx, %eax
+; X86-NEXT:    popl %esi
+; X86-NEXT:    retl
+  %tmp = call i32 @llvm.smul.fix.sat.i32(i32 %x, i32 %y, i32 0);
+  ret i32 %tmp;
+}
+
+define i64 @func5(i64 %x, i64 %y) {
+; X64-LABEL: func5:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    imulq %rsi, %rax
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    testq %rax, %rax
+; X64-NEXT:    setns %cl
+; X64-NEXT:    movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
+; X64-NEXT:    addq %rcx, %rax
+; X64-NEXT:    imulq %rsi, %rdi
+; X64-NEXT:    cmovnoq %rdi, %rax
+; X64-NEXT:    retq
+;
+; X86-LABEL: func5:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 12
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    .cfi_def_cfa_offset 16
+; X86-NEXT:    .cfi_offset %esi, -12
+; X86-NEXT:    .cfi_offset %edi, -8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl $0, (%esp)
+; X86-NEXT:    movl %esp, %edi
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    .cfi_adjust_cfa_offset 4
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    .cfi_adjust_cfa_offset 4
+; X86-NEXT:    pushl %edx
+; X86-NEXT:    .cfi_adjust_cfa_offset 4
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    .cfi_adjust_cfa_offset 4
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    .cfi_adjust_cfa_offset 4
+; X86-NEXT:    calll __mulodi4
+; X86-NEXT:    addl $20, %esp
+; X86-NEXT:    .cfi_adjust_cfa_offset -20
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    testl %edx, %edx
+; X86-NEXT:    setns %cl
+; X86-NEXT:    addl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    sarl $31, %esi
+; X86-NEXT:    cmpl $0, (%esp)
+; X86-NEXT:    cmovnel %esi, %eax
+; X86-NEXT:    cmovnel %ecx, %edx
+; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    .cfi_def_cfa_offset 12
+; X86-NEXT:    popl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    popl %edi
+; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    retl
+  %tmp = call i64 @llvm.smul.fix.sat.i64(i64 %x, i64 %y, i32 0);
+  ret i64 %tmp;
+}
+
+define i4 @func6(i4 %x, i4 %y) nounwind {
+; X64-LABEL: func6:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    shlb $4, %sil
+; X64-NEXT:    sarb $4, %sil
+; X64-NEXT:    shlb $4, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    imulb %sil
+; X64-NEXT:    seto %cl
+; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    testb %al, %al
+; X64-NEXT:    setns %dl
+; X64-NEXT:    addl $127, %edx
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    testb %cl, %cl
+; X64-NEXT:    cmovnel %edx, %eax
+; X64-NEXT:    sarb $4, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+;
+; X86-LABEL: func6:
+; X86:       # %bb.0:
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    shlb $4, %cl
+; X86-NEXT:    sarb $4, %cl
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    shlb $4, %al
+; X86-NEXT:    imulb %cl
+; X86-NEXT:    seto %dl
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    testb %al, %al
+; X86-NEXT:    setns %cl
+; X86-NEXT:    addl $127, %ecx
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    testb %dl, %dl
+; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    sarb $4, %al
+; X86-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NEXT:    retl
+  %tmp = call i4 @llvm.smul.fix.sat.i4(i4 %x, i4 %y, i32 0);
+  ret i4 %tmp;
+}
+
+define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) nounwind {
+; X64-LABEL: vec2:
+; X64:       # %bb.0:
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
+; X64-NEXT:    movd %xmm2, %ecx
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; X64-NEXT:    movd %xmm2, %r8d
+; X64-NEXT:    movl %r8d, %edx
+; X64-NEXT:    imull %ecx, %edx
+; X64-NEXT:    xorl %esi, %esi
+; X64-NEXT:    testl %edx, %edx
+; X64-NEXT:    setns %sil
+; X64-NEXT:    addl $2147483647, %esi # imm = 0x7FFFFFFF
+; X64-NEXT:    imull %ecx, %r8d
+; X64-NEXT:    cmovol %esi, %r8d
+; X64-NEXT:    movd %xmm1, %edx
+; X64-NEXT:    movd %xmm0, %ecx
+; X64-NEXT:    movl %ecx, %esi
+; X64-NEXT:    imull %edx, %esi
+; X64-NEXT:    xorl %edi, %edi
+; X64-NEXT:    testl %esi, %esi
+; X64-NEXT:    setns %dil
+; X64-NEXT:    addl $2147483647, %edi # imm = 0x7FFFFFFF
+; X64-NEXT:    imull %edx, %ecx
+; X64-NEXT:    cmovol %edi, %ecx
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-NEXT:    movd %xmm2, %edx
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X64-NEXT:    movd %xmm2, %esi
+; X64-NEXT:    movl %esi, %edi
+; X64-NEXT:    imull %edx, %edi
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    testl %edi, %edi
+; X64-NEXT:    setns %al
+; X64-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    imull %edx, %esi
+; X64-NEXT:    cmovol %eax, %esi
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; X64-NEXT:    movd %xmm1, %r9d
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; X64-NEXT:    movd %xmm0, %edx
+; X64-NEXT:    movl %edx, %edi
+; X64-NEXT:    imull %r9d, %edi
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    testl %edi, %edi
+; X64-NEXT:    setns %al
+; X64-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    imull %r9d, %edx
+; X64-NEXT:    cmovol %eax, %edx
+; X64-NEXT:    movd %edx, %xmm0
+; X64-NEXT:    movd %esi, %xmm1
+; X64-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X64-NEXT:    movd %ecx, %xmm0
+; X64-NEXT:    movd %r8d, %xmm2
+; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT:    retq
+;
+; X86-LABEL: vec2:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    imull %edx, %esi
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    setns %al
+; X86-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    imull %edx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmovol %eax, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    imull %esi, %edi
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    setns %al
+; X86-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    imull %esi, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    cmovol %eax, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    imull %edi, %ebx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    setns %al
+; X86-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    imull %edi, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    cmovol %eax, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %edi, %ebp
+; X86-NEXT:    imull %eax, %ebp
+; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    testl %ebp, %ebp
+; X86-NEXT:    setns %bl
+; X86-NEXT:    addl $2147483647, %ebx # imm = 0x7FFFFFFF
+; X86-NEXT:    imull %eax, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmovol %ebx, %edi
+; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    movl %edi, (%eax)
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl $4
+  %tmp = call <4 x i32> @llvm.smul.fix.sat.v4i32(<4 x i32> %x, <4 x i32> %y, i32 0);
+  ret <4 x i32> %tmp;
+}
+
+define i64 @func7(i64 %x, i64 %y) nounwind {
+; X64-LABEL: func7:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    imulq %rsi
+; X64-NEXT:    shrdq $32, %rdx, %rax
+; X64-NEXT:    cmpq $2147483647, %rdx # imm = 0x7FFFFFFF
+; X64-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
+; X64-NEXT:    cmovgq %rcx, %rax
+; X64-NEXT:    cmpq $-2147483648, %rdx # imm = 0x80000000
+; X64-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; X64-NEXT:    cmovlq %rcx, %rax
+; X64-NEXT:    retq
+;
+; X86-LABEL: func7:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    addl %edx, %ebx
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    imull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    adcl %edi, %edx
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ebp, %esi
+; X86-NEXT:    sbbl $0, %esi
+; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    cmovnsl %ebp, %esi
+; X86-NEXT:    cmovnsl %edx, %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    sbbl $0, %edi
+; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    cmovnsl %esi, %edi
+; X86-NEXT:    cmovnsl %ecx, %edx
+; X86-NEXT:    testl %edx, %edx
+; X86-NEXT:    setg %cl
+; X86-NEXT:    sets %ch
+; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    setg %bl
+; X86-NEXT:    sete %bh
+; X86-NEXT:    andb %ch, %bh
+; X86-NEXT:    orb %bl, %bh
+; X86-NEXT:    movl $2147483647, %esi # imm = 0x7FFFFFFF
+; X86-NEXT:    cmovnel %esi, %edx
+; X86-NEXT:    movl $-1, %esi
+; X86-NEXT:    cmovnel %esi, %eax
+; X86-NEXT:    cmpl $-1, %edi
+; X86-NEXT:    setl %ch
+; X86-NEXT:    sete %bl
+; X86-NEXT:    andb %cl, %bl
+; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    orb %ch, %bl
+; X86-NEXT:    cmovnel %esi, %eax
+; X86-NEXT:    movl $-2147483648, %ecx # imm = 0x80000000
+; X86-NEXT:    cmovnel %ecx, %edx
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+  %tmp = call i64 @llvm.smul.fix.sat.i64(i64 %x, i64 %y, i32 32);
+  ret i64 %tmp;
+}
+
+define i64 @func8(i64 %x, i64 %y) nounwind {
+; X64-LABEL: func8:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    imulq %rsi
+; X64-NEXT:    shrdq $63, %rdx, %rax
+; X64-NEXT:    movabsq $4611686018427387903, %rcx # imm = 0x3FFFFFFFFFFFFFFF
+; X64-NEXT:    cmpq %rcx, %rdx
+; X64-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
+; X64-NEXT:    cmovgq %rcx, %rax
+; X64-NEXT:    movabsq $-4611686018427387904, %rcx # imm = 0xC000000000000000
+; X64-NEXT:    cmpq %rcx, %rdx
+; X64-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; X64-NEXT:    cmovlq %rcx, %rax
+; X64-NEXT:    retq
+;
+; X86-LABEL: func8:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    imull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    adcl %edi, %edx
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    sbbl $0, %esi
+; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    cmovnsl %ebx, %esi
+; X86-NEXT:    cmovnsl %edx, %ecx
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    sbbl $0, %ebx
+; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    cmovnsl %esi, %ebx
+; X86-NEXT:    cmovnsl %ecx, %edi
+; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    shldl $1, %edi, %edx
+; X86-NEXT:    shrdl $31, %edi, %eax
+; X86-NEXT:    cmpl $1073741823, %ebx # imm = 0x3FFFFFFF
+; X86-NEXT:    movl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X86-NEXT:    cmovgl %ecx, %edx
+; X86-NEXT:    movl $-1, %ecx
+; X86-NEXT:    cmovgl %ecx, %eax
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    cmpl $-1073741824, %ebx # imm = 0xC0000000
+; X86-NEXT:    cmovll %ecx, %eax
+; X86-NEXT:    movl $-2147483648, %ecx # imm = 0x80000000
+; X86-NEXT:    cmovll %ecx, %edx
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+  %tmp = call i64 @llvm.smul.fix.sat.i64(i64 %x, i64 %y, i32 63);
+  ret i64 %tmp;
+}

Added: llvm/trunk/test/CodeGen/X86/smul_fix_sat_constants.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/smul_fix_sat_constants.ll?rev=361289&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/smul_fix_sat_constants.ll (added)
+++ llvm/trunk/test/CodeGen/X86/smul_fix_sat_constants.ll Tue May 21 12:17:19 2019
@@ -0,0 +1,101 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefix=X64
+
+; Verify expansion by using constant values. We just want to cover all the paths laid out by ExpandIntRes_MULFIX.
+
+declare  i4  @llvm.smul.fix.sat.i4   (i4,  i4, i32)
+declare  i32 @llvm.smul.fix.sat.i32  (i32, i32, i32)
+declare  i64 @llvm.smul.fix.sat.i64  (i64, i64, i32)
+declare  <4 x i32> @llvm.smul.fix.sat.v4i32(<4 x i32>, <4 x i32>, i32)
+declare { i64, i1 } @llvm.smul.with.overflow.i64(i64, i64)
+
+define i64 @func() nounwind {
+; X64-LABEL: func:
+; X64:       # %bb.0:
+; X64-NEXT:    movl $2, %ecx
+; X64-NEXT:    movl $3, %eax
+; X64-NEXT:    imulq %rcx
+; X64-NEXT:    shrdq $2, %rdx, %rax
+; X64-NEXT:    cmpq $1, %rdx
+; X64-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
+; X64-NEXT:    cmovgq %rcx, %rax
+; X64-NEXT:    cmpq $-2, %rdx
+; X64-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; X64-NEXT:    cmovlq %rcx, %rax
+; X64-NEXT:    retq
+  %tmp = call i64 @llvm.smul.fix.sat.i64(i64 3, i64 2, i32 2);
+  ret i64 %tmp;
+}
+
+define i64 @func2() nounwind {
+; X64-LABEL: func2:
+; X64:       # %bb.0:
+; X64-NEXT:    movl $3, %eax
+; X64-NEXT:    imulq $2, %rax, %rcx
+; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    testq %rcx, %rcx
+; X64-NEXT:    setns %dl
+; X64-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
+; X64-NEXT:    addq %rdx, %rcx
+; X64-NEXT:    imulq $2, %rax, %rax
+; X64-NEXT:    cmovoq %rcx, %rax
+; X64-NEXT:    retq
+  %tmp = call i64 @llvm.smul.fix.sat.i64(i64 3, i64 2, i32 0);
+  ret i64 %tmp;
+}
+
+define i64 @func3() nounwind {
+; X64-LABEL: func3:
+; X64:       # %bb.0:
+; X64-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
+; X64-NEXT:    movl $2, %edx
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    imulq %rdx
+; X64-NEXT:    shrdq $2, %rdx, %rax
+; X64-NEXT:    cmpq $1, %rdx
+; X64-NEXT:    cmovgq %rcx, %rax
+; X64-NEXT:    cmpq $-2, %rdx
+; X64-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; X64-NEXT:    cmovlq %rcx, %rax
+; X64-NEXT:    retq
+  %tmp = call i64 @llvm.smul.fix.sat.i64(i64 9223372036854775807, i64 2, i32 2);
+  ret i64 %tmp;
+}
+
+define i64 @func4() nounwind {
+; X64-LABEL: func4:
+; X64:       # %bb.0:
+; X64-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
+; X64-NEXT:    movl $2, %edx
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    imulq %rdx
+; X64-NEXT:    shrdq $32, %rdx, %rax
+; X64-NEXT:    cmpq $2147483647, %rdx # imm = 0x7FFFFFFF
+; X64-NEXT:    cmovgq %rcx, %rax
+; X64-NEXT:    cmpq $-2147483648, %rdx # imm = 0x80000000
+; X64-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; X64-NEXT:    cmovlq %rcx, %rax
+; X64-NEXT:    retq
+  %tmp = call i64 @llvm.smul.fix.sat.i64(i64 9223372036854775807, i64 2, i32 32);
+  ret i64 %tmp;
+}
+
+define i64 @func5() nounwind {
+; X64-LABEL: func5:
+; X64:       # %bb.0:
+; X64-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
+; X64-NEXT:    movl $2, %edx
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    imulq %rdx
+; X64-NEXT:    shrdq $63, %rdx, %rax
+; X64-NEXT:    movabsq $4611686018427387903, %rsi # imm = 0x3FFFFFFFFFFFFFFF
+; X64-NEXT:    cmpq %rsi, %rdx
+; X64-NEXT:    cmovgq %rcx, %rax
+; X64-NEXT:    movabsq $-4611686018427387904, %rcx # imm = 0xC000000000000000
+; X64-NEXT:    cmpq %rcx, %rdx
+; X64-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; X64-NEXT:    cmovlq %rcx, %rax
+; X64-NEXT:    retq
+  %tmp = call i64 @llvm.smul.fix.sat.i64(i64 9223372036854775807, i64 2, i32 63);
+  ret i64 %tmp;
+}
