[llvm] [SelectionDAG] Expand fixed point multiplication into libcall (PR #79352)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 24 11:57:47 PST 2024
https://github.com/PiJoules created https://github.com/llvm/llvm-project/pull/79352
32-bit ARMv6 with Thumb doesn't support MULHS/MUL_LOHI as legal/custom nodes during expansion, which causes fixed-point multiplication of _Accum types to fail. Prior to this, fixed-point multiplication only worked on platforms that happen to support MULHS/MUL_LOHI.
This patch checks whether the multiplication can instead be done via a libcall, which the ARM runtime provides.
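As a minimal illustration (reduced from the new tests; the function name is made up for the example), the IR below previously reached the fatal-error path that this patch removes, and now lowers through calls to __aeabi_lmul on Thumb1:

  ; RUN: llc < %s -mtriple=thumbv6m-none-unknown-eabi -mcpu=cortex-m0
  declare i32 @llvm.smul.fix.i32(i32, i32, i32)

  define i32 @mul_accum(i32 %x, i32 %y) {
    ; A nonzero scale needs the high half of the 64-bit product, which
    ; v6-M has no MULHS/SMUL_LOHI for, so a libcall is used instead.
    %r = call i32 @llvm.smul.fix.i32(i32 %x, i32 %y, i32 2)
    ret i32 %r
  }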
From 82807166d42ea722f5574015c7a8656b5a40883a Mon Sep 17 00:00:00 2001
From: Leonard Chan <leonardchan at google.com>
Date: Wed, 24 Jan 2024 11:54:14 -0800
Subject: [PATCH] [SelectionDAG] Expand fixed point multiplication into libcall
32-bit ARMv6 with Thumb doesn't support MULHS/MUL_LOHI as legal/custom
nodes during expansion, which causes fixed-point multiplication of
_Accum types to fail. Prior to this, fixed-point multiplication only
worked on platforms that happen to support MULHS/MUL_LOHI.
This patch checks whether the multiplication can instead be done via a
libcall, which the ARM runtime provides.
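For reference, when no suitably wide libcall is available, ForceExpandMUL
keeps the existing brute-force fallback (Knuth's Algorithm M, via Hacker's
Delight): with the operands split into N-bit halves as (LH:LL) and (RH:RL),
the low 2N bits of the product are

  LL*RL + (LH*RL + LL*RH) * 2^N   (mod 2^(2N)),

with the bottom N bits returned in Lo and the remaining bits in Hi.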
---
llvm/include/llvm/CodeGen/TargetLowering.h | 17 +
.../SelectionDAG/LegalizeIntegerTypes.cpp | 49 +-
.../CodeGen/SelectionDAG/TargetLowering.cpp | 181 +++--
llvm/test/CodeGen/Thumb/smul_fix.ll | 304 ++++++++
llvm/test/CodeGen/Thumb/smul_fix_sat.ll | 690 ++++++++++++++++++
llvm/test/CodeGen/Thumb/umul_fix.ll | 375 ++++++++++
llvm/test/CodeGen/Thumb/umul_fix_sat.ll | 519 +++++++++++++
7 files changed, 2031 insertions(+), 104 deletions(-)
create mode 100644 llvm/test/CodeGen/Thumb/smul_fix.ll
create mode 100644 llvm/test/CodeGen/Thumb/smul_fix_sat.ll
create mode 100644 llvm/test/CodeGen/Thumb/umul_fix.ll
create mode 100644 llvm/test/CodeGen/Thumb/umul_fix_sat.ll
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index c9492b4cf778b65..b29be069699f0ca 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5287,6 +5287,23 @@ class TargetLowering : public TargetLoweringBase {
bool expandMULO(SDNode *Node, SDValue &Result, SDValue &Overflow,
SelectionDAG &DAG) const;
+ /// ForceExpandMUL - Unconditionally expand a MUL into either a libcall or
+ /// a brute-force sequence of multiplications. The expansion works by
+ /// performing the multiplication on a type twice the width of the
+ /// original operands. LL and LH are the lower and upper halves of the
+ /// first operand; RL and RH are the lower and upper halves of the second
+ /// operand. The lower and upper halves of the result are returned in Lo
+ /// and Hi.
+ void ForceExpandMUL(SelectionDAG &DAG, SDLoc dl, bool Signed, EVT WideVT,
+ const SDValue LL, const SDValue LH, const SDValue RL,
+ const SDValue RH, SDValue &Lo, SDValue &Hi) const;
+
+ /// Same as above, but creates the upper halves of each operand by
+ /// sign/zero-extending the operands.
+ void ForceExpandMUL(SelectionDAG &DAG, SDLoc dl, bool Signed,
+ const SDValue LHS, const SDValue RHS, SDValue &Lo,
+ SDValue &Hi) const;
+
/// Expand a VECREDUCE_* into an explicit calculation. If Count is specified,
/// only the first Count elements of the vector are used.
SDValue expandVecReduce(SDNode *Node, SelectionDAG &DAG) const;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 814f746f5a4d9db..e9f7e30863733cf 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -4008,44 +4008,7 @@ void DAGTypeLegalizer::ExpandIntRes_MUL(SDNode *N,
LC = RTLIB::MUL_I128;
if (LC == RTLIB::UNKNOWN_LIBCALL || !TLI.getLibcallName(LC)) {
- // We'll expand the multiplication by brute force because we have no other
- // options. This is a trivially-generalized version of the code from
- // Hacker's Delight (itself derived from Knuth's Algorithm M from section
- // 4.3.1).
- unsigned Bits = NVT.getSizeInBits();
- unsigned HalfBits = Bits >> 1;
- SDValue Mask = DAG.getConstant(APInt::getLowBitsSet(Bits, HalfBits), dl,
- NVT);
- SDValue LLL = DAG.getNode(ISD::AND, dl, NVT, LL, Mask);
- SDValue RLL = DAG.getNode(ISD::AND, dl, NVT, RL, Mask);
-
- SDValue T = DAG.getNode(ISD::MUL, dl, NVT, LLL, RLL);
- SDValue TL = DAG.getNode(ISD::AND, dl, NVT, T, Mask);
-
- SDValue Shift = DAG.getShiftAmountConstant(HalfBits, NVT, dl);
- SDValue TH = DAG.getNode(ISD::SRL, dl, NVT, T, Shift);
- SDValue LLH = DAG.getNode(ISD::SRL, dl, NVT, LL, Shift);
- SDValue RLH = DAG.getNode(ISD::SRL, dl, NVT, RL, Shift);
-
- SDValue U = DAG.getNode(ISD::ADD, dl, NVT,
- DAG.getNode(ISD::MUL, dl, NVT, LLH, RLL), TH);
- SDValue UL = DAG.getNode(ISD::AND, dl, NVT, U, Mask);
- SDValue UH = DAG.getNode(ISD::SRL, dl, NVT, U, Shift);
-
- SDValue V = DAG.getNode(ISD::ADD, dl, NVT,
- DAG.getNode(ISD::MUL, dl, NVT, LLL, RLH), UL);
- SDValue VH = DAG.getNode(ISD::SRL, dl, NVT, V, Shift);
-
- SDValue W = DAG.getNode(ISD::ADD, dl, NVT,
- DAG.getNode(ISD::MUL, dl, NVT, LLH, RLH),
- DAG.getNode(ISD::ADD, dl, NVT, UH, VH));
- Lo = DAG.getNode(ISD::ADD, dl, NVT, TL,
- DAG.getNode(ISD::SHL, dl, NVT, V, Shift));
-
- Hi = DAG.getNode(ISD::ADD, dl, NVT, W,
- DAG.getNode(ISD::ADD, dl, NVT,
- DAG.getNode(ISD::MUL, dl, NVT, RH, LL),
- DAG.getNode(ISD::MUL, dl, NVT, RL, LH)));
+ TLI.ForceExpandMUL(DAG, dl, /*Signed=*/true, VT, LL, LH, RL, RH, Lo, Hi);
return;
}
@@ -4146,9 +4109,15 @@ void DAGTypeLegalizer::ExpandIntRes_MULFIX(SDNode *N, SDValue &Lo,
if (!TLI.expandMUL_LOHI(LoHiOp, VT, dl, LHS, RHS, Result, NVT, DAG,
TargetLowering::MulExpansionKind::OnlyLegalOrCustom,
LL, LH, RL, RH)) {
- report_fatal_error("Unable to expand MUL_FIX using MUL_LOHI.");
- return;
+ Result.clear();
+ Result.resize(4);
+
+ SDValue LoTmp, HiTmp;
+ TLI.ForceExpandMUL(DAG, dl, Signed, LHS, RHS, LoTmp, HiTmp);
+ SplitInteger(LoTmp, Result[0], Result[1]);
+ SplitInteger(HiTmp, Result[2], Result[3]);
}
+ assert(Result.size() == 4 && "Unexpected number of partlets in the result");
unsigned NVTSize = NVT.getScalarSizeInBits();
assert((VTSize == NVTSize * 2) && "Expected the new value type to be half "
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index b8ed02e268b1842..2fe1dd4ce15e167 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -10149,6 +10149,121 @@ SDValue TargetLowering::expandShlSat(SDNode *Node, SelectionDAG &DAG) const {
return DAG.getSelect(dl, VT, Cond, SatVal, Result);
}
+void TargetLowering::ForceExpandMUL(SelectionDAG &DAG, SDLoc dl, bool Signed,
+ EVT WideVT, const SDValue LL,
+ const SDValue LH, const SDValue RL,
+ const SDValue RH, SDValue &Lo,
+ SDValue &Hi) const {
+ // We can fall back to a libcall with an illegal type for the MUL if we
+ // have a libcall big enough.
+ // Also, we can fall back to a division in some cases, but that's a big
+ // performance hit in the general case.
+ RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+ if (WideVT == MVT::i16)
+ LC = RTLIB::MUL_I16;
+ else if (WideVT == MVT::i32)
+ LC = RTLIB::MUL_I32;
+ else if (WideVT == MVT::i64)
+ LC = RTLIB::MUL_I64;
+ else if (WideVT == MVT::i128)
+ LC = RTLIB::MUL_I128;
+
+ if (LC == RTLIB::UNKNOWN_LIBCALL || !getLibcallName(LC)) {
+ // We'll expand the multiplication by brute force because we have no other
+ // options. This is a trivially-generalized version of the code from
+ // Hacker's Delight (itself derived from Knuth's Algorithm M from section
+ // 4.3.1).
+ EVT VT = LL.getValueType();
+ unsigned Bits = VT.getSizeInBits();
+ unsigned HalfBits = Bits >> 1;
+ SDValue Mask =
+ DAG.getConstant(APInt::getLowBitsSet(Bits, HalfBits), dl, VT);
+ SDValue LLL = DAG.getNode(ISD::AND, dl, VT, LL, Mask);
+ SDValue RLL = DAG.getNode(ISD::AND, dl, VT, RL, Mask);
+
+ SDValue T = DAG.getNode(ISD::MUL, dl, VT, LLL, RLL);
+ SDValue TL = DAG.getNode(ISD::AND, dl, VT, T, Mask);
+
+ SDValue Shift = DAG.getShiftAmountConstant(HalfBits, VT, dl);
+ SDValue TH = DAG.getNode(ISD::SRL, dl, VT, T, Shift);
+ SDValue LLH = DAG.getNode(ISD::SRL, dl, VT, LL, Shift);
+ SDValue RLH = DAG.getNode(ISD::SRL, dl, VT, RL, Shift);
+
+ SDValue U = DAG.getNode(ISD::ADD, dl, VT,
+ DAG.getNode(ISD::MUL, dl, VT, LLH, RLL), TH);
+ SDValue UL = DAG.getNode(ISD::AND, dl, VT, U, Mask);
+ SDValue UH = DAG.getNode(ISD::SRL, dl, VT, U, Shift);
+
+ SDValue V = DAG.getNode(ISD::ADD, dl, VT,
+ DAG.getNode(ISD::MUL, dl, VT, LLL, RLH), UL);
+ SDValue VH = DAG.getNode(ISD::SRL, dl, VT, V, Shift);
+
+ SDValue W =
+ DAG.getNode(ISD::ADD, dl, VT, DAG.getNode(ISD::MUL, dl, VT, LLH, RLH),
+ DAG.getNode(ISD::ADD, dl, VT, UH, VH));
+ Lo = DAG.getNode(ISD::ADD, dl, VT, TL,
+ DAG.getNode(ISD::SHL, dl, VT, V, Shift));
+
+ Hi = DAG.getNode(ISD::ADD, dl, VT, W,
+ DAG.getNode(ISD::ADD, dl, VT,
+ DAG.getNode(ISD::MUL, dl, VT, RH, LL),
+ DAG.getNode(ISD::MUL, dl, VT, RL, LH)));
+ } else {
+ // Attempt a libcall.
+ SDValue Ret;
+ TargetLowering::MakeLibCallOptions CallOptions;
+ CallOptions.setSExt(Signed);
+ CallOptions.setIsPostTypeLegalization(true);
+ if (shouldSplitFunctionArgumentsAsLittleEndian(DAG.getDataLayout())) {
+ // Halves of WideVT are packed into registers in different order
+ // depending on platform endianness. This is usually handled by
+ // the C calling convention, but we can't defer to it in
+ // the legalizer.
+ SDValue Args[] = {LL, LH, RL, RH};
+ Ret = makeLibCall(DAG, LC, WideVT, Args, CallOptions, dl).first;
+ } else {
+ SDValue Args[] = {LH, LL, RH, RL};
+ Ret = makeLibCall(DAG, LC, WideVT, Args, CallOptions, dl).first;
+ }
+ assert(Ret.getOpcode() == ISD::MERGE_VALUES &&
+ "Ret value is a collection of constituent nodes holding result.");
+ if (DAG.getDataLayout().isLittleEndian()) {
+ // Same as above.
+ Lo = Ret.getOperand(0);
+ Hi = Ret.getOperand(1);
+ } else {
+ Lo = Ret.getOperand(1);
+ Hi = Ret.getOperand(0);
+ }
+ }
+}
+
+void TargetLowering::ForceExpandMUL(SelectionDAG &DAG, SDLoc dl, bool Signed,
+ const SDValue LHS, const SDValue RHS,
+ SDValue &Lo, SDValue &Hi) const {
+ EVT VT = LHS.getValueType();
+ assert(RHS.getValueType() == VT && "Mismatching operand types");
+
+ SDValue HiLHS;
+ SDValue HiRHS;
+ if (Signed) {
+ // The high part is obtained by SRA'ing all but one of the bits of low
+ // part.
+ unsigned LoSize = VT.getFixedSizeInBits();
+ HiLHS = DAG.getNode(
+ ISD::SRA, dl, VT, LHS,
+ DAG.getConstant(LoSize - 1, dl, getPointerTy(DAG.getDataLayout())));
+ HiRHS = DAG.getNode(
+ ISD::SRA, dl, VT, RHS,
+ DAG.getConstant(LoSize - 1, dl, getPointerTy(DAG.getDataLayout())));
+ } else {
+ HiLHS = DAG.getConstant(0, dl, VT);
+ HiRHS = DAG.getConstant(0, dl, VT);
+ }
+ EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits() * 2);
+ ForceExpandMUL(DAG, dl, Signed, WideVT, LHS, HiLHS, RHS, HiRHS, Lo, Hi);
+}
+
SDValue
TargetLowering::expandFixedPointMul(SDNode *Node, SelectionDAG &DAG) const {
assert((Node->getOpcode() == ISD::SMULFIX ||
@@ -10223,7 +10338,7 @@ TargetLowering::expandFixedPointMul(SDNode *Node, SelectionDAG &DAG) const {
} else if (VT.isVector()) {
return SDValue();
} else {
- report_fatal_error("Unable to expand fixed point multiplication.");
+ ForceExpandMUL(DAG, dl, Signed, LHS, RHS, Lo, Hi);
}
if (Scale == VTSize)
@@ -10522,69 +10637,7 @@ bool TargetLowering::expandMULO(SDNode *Node, SDValue &Result,
if (VT.isVector())
return false;
- // We can fall back to a libcall with an illegal type for the MUL if we
- // have a libcall big enough.
- // Also, we can fall back to a division in some cases, but that's a big
- // performance hit in the general case.
- RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
- if (WideVT == MVT::i16)
- LC = RTLIB::MUL_I16;
- else if (WideVT == MVT::i32)
- LC = RTLIB::MUL_I32;
- else if (WideVT == MVT::i64)
- LC = RTLIB::MUL_I64;
- else if (WideVT == MVT::i128)
- LC = RTLIB::MUL_I128;
- assert(LC != RTLIB::UNKNOWN_LIBCALL && "Cannot expand this operation!");
-
- SDValue HiLHS;
- SDValue HiRHS;
- if (isSigned) {
- // The high part is obtained by SRA'ing all but one of the bits of low
- // part.
- unsigned LoSize = VT.getFixedSizeInBits();
- HiLHS =
- DAG.getNode(ISD::SRA, dl, VT, LHS,
- DAG.getConstant(LoSize - 1, dl,
- getPointerTy(DAG.getDataLayout())));
- HiRHS =
- DAG.getNode(ISD::SRA, dl, VT, RHS,
- DAG.getConstant(LoSize - 1, dl,
- getPointerTy(DAG.getDataLayout())));
- } else {
- HiLHS = DAG.getConstant(0, dl, VT);
- HiRHS = DAG.getConstant(0, dl, VT);
- }
-
- // Here we're passing the 2 arguments explicitly as 4 arguments that are
- // pre-lowered to the correct types. This all depends upon WideVT not
- // being a legal type for the architecture and thus has to be split to
- // two arguments.
- SDValue Ret;
- TargetLowering::MakeLibCallOptions CallOptions;
- CallOptions.setSExt(isSigned);
- CallOptions.setIsPostTypeLegalization(true);
- if (shouldSplitFunctionArgumentsAsLittleEndian(DAG.getDataLayout())) {
- // Halves of WideVT are packed into registers in different order
- // depending on platform endianness. This is usually handled by
- // the C calling convention, but we can't defer to it in
- // the legalizer.
- SDValue Args[] = { LHS, HiLHS, RHS, HiRHS };
- Ret = makeLibCall(DAG, LC, WideVT, Args, CallOptions, dl).first;
- } else {
- SDValue Args[] = { HiLHS, LHS, HiRHS, RHS };
- Ret = makeLibCall(DAG, LC, WideVT, Args, CallOptions, dl).first;
- }
- assert(Ret.getOpcode() == ISD::MERGE_VALUES &&
- "Ret value is a collection of constituent nodes holding result.");
- if (DAG.getDataLayout().isLittleEndian()) {
- // Same as above.
- BottomHalf = Ret.getOperand(0);
- TopHalf = Ret.getOperand(1);
- } else {
- BottomHalf = Ret.getOperand(1);
- TopHalf = Ret.getOperand(0);
- }
+ ForceExpandMUL(DAG, dl, isSigned, LHS, RHS, BottomHalf, TopHalf);
}
Result = BottomHalf;
diff --git a/llvm/test/CodeGen/Thumb/smul_fix.ll b/llvm/test/CodeGen/Thumb/smul_fix.ll
new file mode 100644
index 000000000000000..52f241802b87e3e
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb/smul_fix.ll
@@ -0,0 +1,304 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=thumbv6m-none-unknown-eabi -mcpu=cortex-m0 | FileCheck %s --check-prefix=ARM
+
+declare i4 @llvm.smul.fix.i4 (i4, i4, i32)
+declare i32 @llvm.smul.fix.i32 (i32, i32, i32)
+declare i64 @llvm.smul.fix.i64 (i64, i64, i32)
+
+define i32 @func(i32 %x, i32 %y) nounwind {
+; ARM-LABEL: func:
+; ARM: @ %bb.0:
+; ARM-NEXT: .save {r7, lr}
+; ARM-NEXT: push {r7, lr}
+; ARM-NEXT: mov r2, r1
+; ARM-NEXT: asrs r1, r0, #31
+; ARM-NEXT: asrs r3, r2, #31
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: lsrs r0, r0, #2
+; ARM-NEXT: lsls r1, r1, #30
+; ARM-NEXT: adds r0, r1, r0
+; ARM-NEXT: pop {r7, pc}
+ %tmp = call i32 @llvm.smul.fix.i32(i32 %x, i32 %y, i32 2)
+ ret i32 %tmp
+}
+
+define i64 @func2(i64 %x, i64 %y) {
+; ARM-LABEL: func2:
+; ARM: @ %bb.0:
+; ARM-NEXT: .save {r4, r5, r6, r7, lr}
+; ARM-NEXT: push {r4, r5, r6, r7, lr}
+; ARM-NEXT: .pad #28
+; ARM-NEXT: sub sp, #28
+; ARM-NEXT: str r3, [sp, #8] @ 4-byte Spill
+; ARM-NEXT: mov r5, r2
+; ARM-NEXT: str r2, [sp, #12] @ 4-byte Spill
+; ARM-NEXT: mov r7, r1
+; ARM-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; ARM-NEXT: movs r6, #0
+; ARM-NEXT: mov r4, r0
+; ARM-NEXT: str r0, [sp, #20] @ 4-byte Spill
+; ARM-NEXT: mov r1, r6
+; ARM-NEXT: mov r3, r6
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: str r0, [sp, #24] @ 4-byte Spill
+; ARM-NEXT: str r1, [sp, #16] @ 4-byte Spill
+; ARM-NEXT: mov r0, r7
+; ARM-NEXT: mov r1, r6
+; ARM-NEXT: mov r2, r5
+; ARM-NEXT: mov r3, r6
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r5, r1
+; ARM-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
+; ARM-NEXT: adds r0, r0, r1
+; ARM-NEXT: str r0, [sp, #16] @ 4-byte Spill
+; ARM-NEXT: adcs r5, r6
+; ARM-NEXT: mov r0, r4
+; ARM-NEXT: mov r1, r6
+; ARM-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
+; ARM-NEXT: mov r2, r7
+; ARM-NEXT: mov r3, r6
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r4, r1
+; ARM-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
+; ARM-NEXT: adds r0, r0, r1
+; ARM-NEXT: str r0, [sp, #16] @ 4-byte Spill
+; ARM-NEXT: adcs r4, r5
+; ARM-NEXT: ldr r5, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: mov r0, r5
+; ARM-NEXT: mov r1, r6
+; ARM-NEXT: mov r2, r7
+; ARM-NEXT: mov r3, r6
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: adds r0, r0, r4
+; ARM-NEXT: str r0, [sp] @ 4-byte Spill
+; ARM-NEXT: asrs r2, r5, #31
+; ARM-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
+; ARM-NEXT: mov r1, r7
+; ARM-NEXT: mov r3, r2
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r4, r0
+; ARM-NEXT: asrs r0, r7, #31
+; ARM-NEXT: mov r1, r0
+; ARM-NEXT: ldr r2, [sp, #20] @ 4-byte Reload
+; ARM-NEXT: mov r3, r5
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: adds r0, r0, r4
+; ARM-NEXT: ldr r1, [sp] @ 4-byte Reload
+; ARM-NEXT: adds r0, r1, r0
+; ARM-NEXT: lsls r0, r0, #30
+; ARM-NEXT: ldr r2, [sp, #16] @ 4-byte Reload
+; ARM-NEXT: lsrs r1, r2, #2
+; ARM-NEXT: adds r1, r0, r1
+; ARM-NEXT: lsls r0, r2, #30
+; ARM-NEXT: ldr r2, [sp, #24] @ 4-byte Reload
+; ARM-NEXT: lsrs r2, r2, #2
+; ARM-NEXT: adds r0, r0, r2
+; ARM-NEXT: add sp, #28
+; ARM-NEXT: pop {r4, r5, r6, r7, pc}
+ %tmp = call i64 @llvm.smul.fix.i64(i64 %x, i64 %y, i32 2)
+ ret i64 %tmp
+}
+
+define i4 @func3(i4 %x, i4 %y) nounwind {
+; ARM-LABEL: func3:
+; ARM: @ %bb.0:
+; ARM-NEXT: .save {r4, lr}
+; ARM-NEXT: push {r4, lr}
+; ARM-NEXT: lsls r2, r0, #28
+; ARM-NEXT: asrs r0, r2, #28
+; ARM-NEXT: asrs r4, r2, #31
+; ARM-NEXT: lsls r1, r1, #28
+; ARM-NEXT: asrs r2, r1, #28
+; ARM-NEXT: asrs r3, r1, #31
+; ARM-NEXT: mov r1, r4
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: lsrs r0, r0, #2
+; ARM-NEXT: lsls r1, r1, #30
+; ARM-NEXT: adds r0, r1, r0
+; ARM-NEXT: pop {r4, pc}
+ %tmp = call i4 @llvm.smul.fix.i4(i4 %x, i4 %y, i32 2)
+ ret i4 %tmp
+}
+
+;; These result in regular integer multiplication
+define i32 @func4(i32 %x, i32 %y) nounwind {
+; ARM-LABEL: func4:
+; ARM: @ %bb.0:
+; ARM-NEXT: muls r0, r1, r0
+; ARM-NEXT: bx lr
+ %tmp = call i32 @llvm.smul.fix.i32(i32 %x, i32 %y, i32 0)
+ ret i32 %tmp
+}
+
+define i64 @func5(i64 %x, i64 %y) {
+; ARM-LABEL: func5:
+; ARM: @ %bb.0:
+; ARM-NEXT: .save {r7, lr}
+; ARM-NEXT: push {r7, lr}
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: pop {r7, pc}
+ %tmp = call i64 @llvm.smul.fix.i64(i64 %x, i64 %y, i32 0)
+ ret i64 %tmp
+}
+
+define i4 @func6(i4 %x, i4 %y) nounwind {
+; ARM-LABEL: func6:
+; ARM: @ %bb.0:
+; ARM-NEXT: lsls r1, r1, #28
+; ARM-NEXT: asrs r1, r1, #28
+; ARM-NEXT: lsls r0, r0, #28
+; ARM-NEXT: asrs r0, r0, #28
+; ARM-NEXT: muls r0, r1, r0
+; ARM-NEXT: bx lr
+ %tmp = call i4 @llvm.smul.fix.i4(i4 %x, i4 %y, i32 0)
+ ret i4 %tmp
+}
+
+define i64 @func7(i64 %x, i64 %y) nounwind {
+; ARM-LABEL: func7:
+; ARM: @ %bb.0:
+; ARM-NEXT: .save {r4, r5, r6, r7, lr}
+; ARM-NEXT: push {r4, r5, r6, r7, lr}
+; ARM-NEXT: .pad #20
+; ARM-NEXT: sub sp, #20
+; ARM-NEXT: str r3, [sp, #4] @ 4-byte Spill
+; ARM-NEXT: mov r7, r2
+; ARM-NEXT: str r2, [sp, #8] @ 4-byte Spill
+; ARM-NEXT: mov r6, r1
+; ARM-NEXT: str r1, [sp] @ 4-byte Spill
+; ARM-NEXT: movs r5, #0
+; ARM-NEXT: mov r4, r0
+; ARM-NEXT: str r0, [sp, #16] @ 4-byte Spill
+; ARM-NEXT: mov r1, r5
+; ARM-NEXT: mov r3, r5
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: str r1, [sp, #12] @ 4-byte Spill
+; ARM-NEXT: mov r0, r6
+; ARM-NEXT: mov r1, r5
+; ARM-NEXT: mov r2, r7
+; ARM-NEXT: mov r3, r5
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r7, r1
+; ARM-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; ARM-NEXT: adds r0, r0, r1
+; ARM-NEXT: str r0, [sp, #12] @ 4-byte Spill
+; ARM-NEXT: adcs r7, r5
+; ARM-NEXT: mov r0, r4
+; ARM-NEXT: mov r1, r5
+; ARM-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: mov r2, r6
+; ARM-NEXT: mov r3, r5
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r4, r1
+; ARM-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; ARM-NEXT: adds r0, r0, r1
+; ARM-NEXT: str r0, [sp, #12] @ 4-byte Spill
+; ARM-NEXT: adcs r4, r7
+; ARM-NEXT: ldr r7, [sp] @ 4-byte Reload
+; ARM-NEXT: mov r0, r7
+; ARM-NEXT: mov r1, r5
+; ARM-NEXT: mov r2, r6
+; ARM-NEXT: mov r3, r5
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: adds r5, r0, r4
+; ARM-NEXT: asrs r2, r7, #31
+; ARM-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
+; ARM-NEXT: mov r1, r6
+; ARM-NEXT: mov r3, r2
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r4, r0
+; ARM-NEXT: asrs r0, r6, #31
+; ARM-NEXT: mov r1, r0
+; ARM-NEXT: ldr r2, [sp, #16] @ 4-byte Reload
+; ARM-NEXT: mov r3, r7
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: adds r0, r0, r4
+; ARM-NEXT: adds r1, r5, r0
+; ARM-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
+; ARM-NEXT: add sp, #20
+; ARM-NEXT: pop {r4, r5, r6, r7, pc}
+ %tmp = call i64 @llvm.smul.fix.i64(i64 %x, i64 %y, i32 32)
+ ret i64 %tmp
+}
+
+define i64 @func8(i64 %x, i64 %y) nounwind {
+; ARM-LABEL: func8:
+; ARM: @ %bb.0:
+; ARM-NEXT: .save {r4, r5, r6, r7, lr}
+; ARM-NEXT: push {r4, r5, r6, r7, lr}
+; ARM-NEXT: .pad #28
+; ARM-NEXT: sub sp, #28
+; ARM-NEXT: str r3, [sp, #8] @ 4-byte Spill
+; ARM-NEXT: mov r4, r1
+; ARM-NEXT: str r1, [sp, #24] @ 4-byte Spill
+; ARM-NEXT: movs r5, #0
+; ARM-NEXT: mov r6, r0
+; ARM-NEXT: str r0, [sp, #16] @ 4-byte Spill
+; ARM-NEXT: mov r1, r5
+; ARM-NEXT: mov r7, r2
+; ARM-NEXT: str r2, [sp, #12] @ 4-byte Spill
+; ARM-NEXT: mov r3, r5
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: str r1, [sp, #20] @ 4-byte Spill
+; ARM-NEXT: mov r0, r4
+; ARM-NEXT: mov r1, r5
+; ARM-NEXT: mov r2, r7
+; ARM-NEXT: mov r3, r5
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r4, r1
+; ARM-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
+; ARM-NEXT: adds r7, r0, r1
+; ARM-NEXT: adcs r4, r5
+; ARM-NEXT: mov r0, r6
+; ARM-NEXT: mov r1, r5
+; ARM-NEXT: ldr r6, [sp, #8] @ 4-byte Reload
+; ARM-NEXT: mov r2, r6
+; ARM-NEXT: mov r3, r5
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: adds r0, r0, r7
+; ARM-NEXT: str r0, [sp, #20] @ 4-byte Spill
+; ARM-NEXT: adcs r1, r5
+; ARM-NEXT: adds r0, r4, r1
+; ARM-NEXT: str r0, [sp, #4] @ 4-byte Spill
+; ARM-NEXT: mov r7, r5
+; ARM-NEXT: adcs r7, r5
+; ARM-NEXT: ldr r4, [sp, #24] @ 4-byte Reload
+; ARM-NEXT: mov r0, r4
+; ARM-NEXT: mov r1, r5
+; ARM-NEXT: mov r2, r6
+; ARM-NEXT: mov r3, r5
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r5, r1
+; ARM-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: adds r0, r0, r1
+; ARM-NEXT: str r0, [sp, #4] @ 4-byte Spill
+; ARM-NEXT: adcs r5, r7
+; ARM-NEXT: asrs r2, r4, #31
+; ARM-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
+; ARM-NEXT: mov r1, r6
+; ARM-NEXT: mov r3, r2
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r4, r0
+; ARM-NEXT: mov r7, r1
+; ARM-NEXT: asrs r0, r6, #31
+; ARM-NEXT: mov r1, r0
+; ARM-NEXT: ldr r2, [sp, #16] @ 4-byte Reload
+; ARM-NEXT: ldr r3, [sp, #24] @ 4-byte Reload
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: adds r0, r0, r4
+; ARM-NEXT: adcs r1, r7
+; ARM-NEXT: ldr r2, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: adds r0, r2, r0
+; ARM-NEXT: adcs r1, r5
+; ARM-NEXT: lsls r1, r1, #1
+; ARM-NEXT: lsrs r2, r0, #31
+; ARM-NEXT: adds r1, r1, r2
+; ARM-NEXT: lsls r0, r0, #1
+; ARM-NEXT: ldr r2, [sp, #20] @ 4-byte Reload
+; ARM-NEXT: lsrs r2, r2, #31
+; ARM-NEXT: adds r0, r0, r2
+; ARM-NEXT: add sp, #28
+; ARM-NEXT: pop {r4, r5, r6, r7, pc}
+ %tmp = call i64 @llvm.smul.fix.i64(i64 %x, i64 %y, i32 63)
+ ret i64 %tmp
+}
diff --git a/llvm/test/CodeGen/Thumb/smul_fix_sat.ll b/llvm/test/CodeGen/Thumb/smul_fix_sat.ll
new file mode 100644
index 000000000000000..f8557419c419904
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb/smul_fix_sat.ll
@@ -0,0 +1,690 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=thumbv6m-none-unknown-eabi -mcpu=cortex-m0 | FileCheck %s --check-prefix=ARM
+
+declare i4 @llvm.smul.fix.sat.i4 (i4, i4, i32)
+declare i32 @llvm.smul.fix.sat.i32 (i32, i32, i32)
+declare i64 @llvm.smul.fix.sat.i64 (i64, i64, i32)
+
+define i32 @func(i32 %x, i32 %y) nounwind {
+; ARM-LABEL: func:
+; ARM: @ %bb.0:
+; ARM-NEXT: .save {r7, lr}
+; ARM-NEXT: push {r7, lr}
+; ARM-NEXT: mov r2, r1
+; ARM-NEXT: asrs r1, r0, #31
+; ARM-NEXT: asrs r3, r2, #31
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: cmp r1, #1
+; ARM-NEXT: bgt .LBB0_2
+; ARM-NEXT: @ %bb.1:
+; ARM-NEXT: lsrs r0, r0, #2
+; ARM-NEXT: lsls r2, r1, #30
+; ARM-NEXT: adds r0, r2, r0
+; ARM-NEXT: b .LBB0_3
+; ARM-NEXT: .LBB0_2:
+; ARM-NEXT: ldr r0, .LCPI0_0
+; ARM-NEXT: .LBB0_3:
+; ARM-NEXT: movs r2, #1
+; ARM-NEXT: mvns r3, r2
+; ARM-NEXT: cmp r1, r3
+; ARM-NEXT: bge .LBB0_5
+; ARM-NEXT: @ %bb.4:
+; ARM-NEXT: lsls r0, r2, #31
+; ARM-NEXT: .LBB0_5:
+; ARM-NEXT: pop {r7, pc}
+; ARM-NEXT: .p2align 2
+; ARM-NEXT: @ %bb.6:
+; ARM-NEXT: .LCPI0_0:
+; ARM-NEXT: .long 2147483647 @ 0x7fffffff
+ %tmp = call i32 @llvm.smul.fix.sat.i32(i32 %x, i32 %y, i32 2)
+ ret i32 %tmp
+}
+
+define i64 @func2(i64 %x, i64 %y) nounwind {
+; ARM-LABEL: func2:
+; ARM: @ %bb.0:
+; ARM-NEXT: .save {r4, r5, r6, r7, lr}
+; ARM-NEXT: push {r4, r5, r6, r7, lr}
+; ARM-NEXT: .pad #36
+; ARM-NEXT: sub sp, #36
+; ARM-NEXT: str r3, [sp, #28] @ 4-byte Spill
+; ARM-NEXT: mov r6, r1
+; ARM-NEXT: str r1, [sp, #8] @ 4-byte Spill
+; ARM-NEXT: movs r4, #0
+; ARM-NEXT: str r4, [sp, #32] @ 4-byte Spill
+; ARM-NEXT: mov r5, r0
+; ARM-NEXT: str r0, [sp, #20] @ 4-byte Spill
+; ARM-NEXT: mov r1, r4
+; ARM-NEXT: mov r7, r2
+; ARM-NEXT: str r2, [sp, #16] @ 4-byte Spill
+; ARM-NEXT: mov r3, r4
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: str r0, [sp, #4] @ 4-byte Spill
+; ARM-NEXT: str r1, [sp, #24] @ 4-byte Spill
+; ARM-NEXT: mov r0, r6
+; ARM-NEXT: mov r1, r4
+; ARM-NEXT: mov r2, r7
+; ARM-NEXT: mov r3, r4
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r6, r1
+; ARM-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
+; ARM-NEXT: adds r7, r0, r1
+; ARM-NEXT: adcs r6, r4
+; ARM-NEXT: mov r0, r5
+; ARM-NEXT: mov r1, r4
+; ARM-NEXT: ldr r5, [sp, #28] @ 4-byte Reload
+; ARM-NEXT: mov r2, r5
+; ARM-NEXT: mov r3, r4
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: adds r0, r0, r7
+; ARM-NEXT: str r0, [sp, #24] @ 4-byte Spill
+; ARM-NEXT: adcs r1, r4
+; ARM-NEXT: adds r0, r6, r1
+; ARM-NEXT: str r0, [sp, #12] @ 4-byte Spill
+; ARM-NEXT: mov r6, r4
+; ARM-NEXT: adcs r6, r4
+; ARM-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
+; ARM-NEXT: mov r0, r7
+; ARM-NEXT: mov r1, r4
+; ARM-NEXT: mov r2, r5
+; ARM-NEXT: mov r3, r4
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r5, r1
+; ARM-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; ARM-NEXT: adds r0, r0, r1
+; ARM-NEXT: str r0, [sp, #12] @ 4-byte Spill
+; ARM-NEXT: adcs r5, r6
+; ARM-NEXT: asrs r2, r7, #31
+; ARM-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
+; ARM-NEXT: ldr r4, [sp, #28] @ 4-byte Reload
+; ARM-NEXT: mov r1, r4
+; ARM-NEXT: mov r3, r2
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r6, r0
+; ARM-NEXT: str r1, [sp, #16] @ 4-byte Spill
+; ARM-NEXT: asrs r0, r4, #31
+; ARM-NEXT: mov r1, r0
+; ARM-NEXT: ldr r2, [sp, #20] @ 4-byte Reload
+; ARM-NEXT: mov r3, r7
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: adds r0, r0, r6
+; ARM-NEXT: ldr r2, [sp, #16] @ 4-byte Reload
+; ARM-NEXT: adcs r1, r2
+; ARM-NEXT: ldr r2, [sp, #12] @ 4-byte Reload
+; ARM-NEXT: adds r0, r2, r0
+; ARM-NEXT: adcs r1, r5
+; ARM-NEXT: rsbs r5, r1, #0
+; ARM-NEXT: adcs r5, r1
+; ARM-NEXT: movs r2, #1
+; ARM-NEXT: str r0, [sp, #28] @ 4-byte Spill
+; ARM-NEXT: cmp r0, #1
+; ARM-NEXT: mov r3, r2
+; ARM-NEXT: bhi .LBB1_2
+; ARM-NEXT: @ %bb.1:
+; ARM-NEXT: ldr r3, [sp, #32] @ 4-byte Reload
+; ARM-NEXT: .LBB1_2:
+; ARM-NEXT: ands r5, r3
+; ARM-NEXT: cmp r1, #0
+; ARM-NEXT: mov r3, r2
+; ARM-NEXT: bgt .LBB1_4
+; ARM-NEXT: @ %bb.3:
+; ARM-NEXT: ldr r3, [sp, #32] @ 4-byte Reload
+; ARM-NEXT: .LBB1_4:
+; ARM-NEXT: orrs r3, r5
+; ARM-NEXT: ldr r0, [sp, #32] @ 4-byte Reload
+; ARM-NEXT: mvns r6, r0
+; ARM-NEXT: cmp r3, #0
+; ARM-NEXT: str r6, [sp, #20] @ 4-byte Spill
+; ARM-NEXT: bne .LBB1_6
+; ARM-NEXT: @ %bb.5:
+; ARM-NEXT: ldr r0, [sp, #24] @ 4-byte Reload
+; ARM-NEXT: lsls r0, r0, #30
+; ARM-NEXT: ldr r4, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: lsrs r4, r4, #2
+; ARM-NEXT: adds r0, r0, r4
+; ARM-NEXT: str r0, [sp, #20] @ 4-byte Spill
+; ARM-NEXT: .LBB1_6:
+; ARM-NEXT: adds r0, r1, #1
+; ARM-NEXT: rsbs r7, r0, #0
+; ARM-NEXT: adcs r7, r0
+; ARM-NEXT: mvns r0, r2
+; ARM-NEXT: ldr r5, [sp, #28] @ 4-byte Reload
+; ARM-NEXT: cmp r5, r0
+; ARM-NEXT: mov r0, r2
+; ARM-NEXT: blo .LBB1_8
+; ARM-NEXT: @ %bb.7:
+; ARM-NEXT: ldr r0, [sp, #32] @ 4-byte Reload
+; ARM-NEXT: .LBB1_8:
+; ARM-NEXT: ands r7, r0
+; ARM-NEXT: cmp r1, r6
+; ARM-NEXT: mov r6, r2
+; ARM-NEXT: bge .LBB1_12
+; ARM-NEXT: @ %bb.9:
+; ARM-NEXT: orrs r6, r7
+; ARM-NEXT: beq .LBB1_13
+; ARM-NEXT: .LBB1_10:
+; ARM-NEXT: cmp r3, #0
+; ARM-NEXT: bne .LBB1_14
+; ARM-NEXT: .LBB1_11:
+; ARM-NEXT: ldr r0, [sp, #28] @ 4-byte Reload
+; ARM-NEXT: lsls r0, r0, #30
+; ARM-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
+; ARM-NEXT: lsrs r1, r1, #2
+; ARM-NEXT: adds r1, r0, r1
+; ARM-NEXT: cmp r6, #0
+; ARM-NEXT: bne .LBB1_15
+; ARM-NEXT: b .LBB1_16
+; ARM-NEXT: .LBB1_12:
+; ARM-NEXT: ldr r6, [sp, #32] @ 4-byte Reload
+; ARM-NEXT: orrs r6, r7
+; ARM-NEXT: bne .LBB1_10
+; ARM-NEXT: .LBB1_13:
+; ARM-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
+; ARM-NEXT: str r0, [sp, #32] @ 4-byte Spill
+; ARM-NEXT: cmp r3, #0
+; ARM-NEXT: beq .LBB1_11
+; ARM-NEXT: .LBB1_14:
+; ARM-NEXT: ldr r1, .LCPI1_0
+; ARM-NEXT: cmp r6, #0
+; ARM-NEXT: beq .LBB1_16
+; ARM-NEXT: .LBB1_15:
+; ARM-NEXT: lsls r1, r2, #31
+; ARM-NEXT: .LBB1_16:
+; ARM-NEXT: ldr r0, [sp, #32] @ 4-byte Reload
+; ARM-NEXT: add sp, #36
+; ARM-NEXT: pop {r4, r5, r6, r7, pc}
+; ARM-NEXT: .p2align 2
+; ARM-NEXT: @ %bb.17:
+; ARM-NEXT: .LCPI1_0:
+; ARM-NEXT: .long 2147483647 @ 0x7fffffff
+ %tmp = call i64 @llvm.smul.fix.sat.i64(i64 %x, i64 %y, i32 2)
+ ret i64 %tmp
+}
+
+define i4 @func3(i4 %x, i4 %y) nounwind {
+; ARM-LABEL: func3:
+; ARM: @ %bb.0:
+; ARM-NEXT: .save {r4, lr}
+; ARM-NEXT: push {r4, lr}
+; ARM-NEXT: lsls r0, r0, #28
+; ARM-NEXT: asrs r4, r0, #31
+; ARM-NEXT: lsls r1, r1, #28
+; ARM-NEXT: asrs r2, r1, #28
+; ARM-NEXT: asrs r3, r1, #31
+; ARM-NEXT: mov r1, r4
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: cmp r1, #1
+; ARM-NEXT: bgt .LBB2_2
+; ARM-NEXT: @ %bb.1:
+; ARM-NEXT: lsrs r0, r0, #2
+; ARM-NEXT: lsls r2, r1, #30
+; ARM-NEXT: adds r0, r2, r0
+; ARM-NEXT: b .LBB2_3
+; ARM-NEXT: .LBB2_2:
+; ARM-NEXT: ldr r0, .LCPI2_0
+; ARM-NEXT: .LBB2_3:
+; ARM-NEXT: movs r2, #1
+; ARM-NEXT: mvns r3, r2
+; ARM-NEXT: cmp r1, r3
+; ARM-NEXT: bge .LBB2_5
+; ARM-NEXT: @ %bb.4:
+; ARM-NEXT: lsls r0, r2, #31
+; ARM-NEXT: .LBB2_5:
+; ARM-NEXT: asrs r0, r0, #28
+; ARM-NEXT: pop {r4, pc}
+; ARM-NEXT: .p2align 2
+; ARM-NEXT: @ %bb.6:
+; ARM-NEXT: .LCPI2_0:
+; ARM-NEXT: .long 2147483647 @ 0x7fffffff
+ %tmp = call i4 @llvm.smul.fix.sat.i4(i4 %x, i4 %y, i32 2)
+ ret i4 %tmp
+}
+
+;; These result in regular integer multiplication with a saturation check.
+define i32 @func4(i32 %x, i32 %y) nounwind {
+; ARM-LABEL: func4:
+; ARM: @ %bb.0:
+; ARM-NEXT: .save {r7, lr}
+; ARM-NEXT: push {r7, lr}
+; ARM-NEXT: mov r2, r1
+; ARM-NEXT: asrs r1, r0, #31
+; ARM-NEXT: asrs r3, r2, #31
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: cmp r1, #0
+; ARM-NEXT: bmi .LBB3_2
+; ARM-NEXT: @ %bb.1:
+; ARM-NEXT: ldr r2, .LCPI3_0
+; ARM-NEXT: b .LBB3_3
+; ARM-NEXT: .LBB3_2:
+; ARM-NEXT: movs r2, #1
+; ARM-NEXT: lsls r2, r2, #31
+; ARM-NEXT: .LBB3_3:
+; ARM-NEXT: asrs r3, r0, #31
+; ARM-NEXT: cmp r1, r3
+; ARM-NEXT: bne .LBB3_5
+; ARM-NEXT: @ %bb.4:
+; ARM-NEXT: mov r2, r0
+; ARM-NEXT: .LBB3_5:
+; ARM-NEXT: mov r0, r2
+; ARM-NEXT: pop {r7, pc}
+; ARM-NEXT: .p2align 2
+; ARM-NEXT: @ %bb.6:
+; ARM-NEXT: .LCPI3_0:
+; ARM-NEXT: .long 2147483647 @ 0x7fffffff
+ %tmp = call i32 @llvm.smul.fix.sat.i32(i32 %x, i32 %y, i32 0)
+ ret i32 %tmp
+}
+
+define i64 @func5(i64 %x, i64 %y) {
+; ARM-LABEL: func5:
+; ARM: @ %bb.0:
+; ARM-NEXT: .save {r4, r5, r6, r7, lr}
+; ARM-NEXT: push {r4, r5, r6, r7, lr}
+; ARM-NEXT: .pad #28
+; ARM-NEXT: sub sp, #28
+; ARM-NEXT: str r3, [sp, #12] @ 4-byte Spill
+; ARM-NEXT: mov r5, r2
+; ARM-NEXT: str r2, [sp, #16] @ 4-byte Spill
+; ARM-NEXT: mov r6, r1
+; ARM-NEXT: movs r7, #0
+; ARM-NEXT: mov r4, r0
+; ARM-NEXT: str r0, [sp, #20] @ 4-byte Spill
+; ARM-NEXT: mov r1, r7
+; ARM-NEXT: mov r3, r7
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: str r0, [sp, #4] @ 4-byte Spill
+; ARM-NEXT: str r1, [sp, #24] @ 4-byte Spill
+; ARM-NEXT: mov r0, r6
+; ARM-NEXT: mov r1, r7
+; ARM-NEXT: mov r2, r5
+; ARM-NEXT: mov r3, r7
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r5, r1
+; ARM-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
+; ARM-NEXT: adds r0, r0, r1
+; ARM-NEXT: str r0, [sp, #24] @ 4-byte Spill
+; ARM-NEXT: adcs r5, r7
+; ARM-NEXT: mov r0, r4
+; ARM-NEXT: mov r1, r7
+; ARM-NEXT: ldr r4, [sp, #12] @ 4-byte Reload
+; ARM-NEXT: mov r2, r4
+; ARM-NEXT: mov r3, r7
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: ldr r2, [sp, #24] @ 4-byte Reload
+; ARM-NEXT: adds r0, r0, r2
+; ARM-NEXT: str r0, [sp, #24] @ 4-byte Spill
+; ARM-NEXT: adcs r1, r7
+; ARM-NEXT: adds r0, r5, r1
+; ARM-NEXT: str r0, [sp, #8] @ 4-byte Spill
+; ARM-NEXT: mov r5, r7
+; ARM-NEXT: adcs r5, r7
+; ARM-NEXT: mov r0, r6
+; ARM-NEXT: mov r1, r7
+; ARM-NEXT: mov r2, r4
+; ARM-NEXT: mov r3, r7
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r7, r1
+; ARM-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
+; ARM-NEXT: adds r0, r0, r1
+; ARM-NEXT: str r0, [sp, #8] @ 4-byte Spill
+; ARM-NEXT: adcs r7, r5
+; ARM-NEXT: asrs r2, r6, #31
+; ARM-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
+; ARM-NEXT: mov r5, r4
+; ARM-NEXT: mov r1, r4
+; ARM-NEXT: mov r3, r2
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r4, r0
+; ARM-NEXT: str r1, [sp, #16] @ 4-byte Spill
+; ARM-NEXT: asrs r0, r5, #31
+; ARM-NEXT: mov r1, r0
+; ARM-NEXT: ldr r2, [sp, #20] @ 4-byte Reload
+; ARM-NEXT: mov r3, r6
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: adds r0, r0, r4
+; ARM-NEXT: ldr r4, [sp, #24] @ 4-byte Reload
+; ARM-NEXT: ldr r2, [sp, #16] @ 4-byte Reload
+; ARM-NEXT: adcs r1, r2
+; ARM-NEXT: ldr r2, [sp, #8] @ 4-byte Reload
+; ARM-NEXT: adds r3, r2, r0
+; ARM-NEXT: adcs r1, r7
+; ARM-NEXT: asrs r2, r4, #31
+; ARM-NEXT: eors r1, r2
+; ARM-NEXT: eors r3, r2
+; ARM-NEXT: orrs r3, r1
+; ARM-NEXT: eors r6, r5
+; ARM-NEXT: asrs r1, r6, #31
+; ARM-NEXT: cmp r3, #0
+; ARM-NEXT: bne .LBB4_3
+; ARM-NEXT: @ %bb.1:
+; ARM-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: cmp r3, #0
+; ARM-NEXT: beq .LBB4_4
+; ARM-NEXT: .LBB4_2:
+; ARM-NEXT: ldr r2, .LCPI4_0
+; ARM-NEXT: eors r1, r2
+; ARM-NEXT: add sp, #28
+; ARM-NEXT: pop {r4, r5, r6, r7, pc}
+; ARM-NEXT: .LBB4_3:
+; ARM-NEXT: mvns r0, r1
+; ARM-NEXT: cmp r3, #0
+; ARM-NEXT: bne .LBB4_2
+; ARM-NEXT: .LBB4_4:
+; ARM-NEXT: mov r1, r4
+; ARM-NEXT: add sp, #28
+; ARM-NEXT: pop {r4, r5, r6, r7, pc}
+; ARM-NEXT: .p2align 2
+; ARM-NEXT: @ %bb.5:
+; ARM-NEXT: .LCPI4_0:
+; ARM-NEXT: .long 2147483647 @ 0x7fffffff
+ %tmp = call i64 @llvm.smul.fix.sat.i64(i64 %x, i64 %y, i32 0)
+ ret i64 %tmp
+}
+
+define i4 @func6(i4 %x, i4 %y) nounwind {
+; ARM-LABEL: func6:
+; ARM: @ %bb.0:
+; ARM-NEXT: .save {r4, lr}
+; ARM-NEXT: push {r4, lr}
+; ARM-NEXT: lsls r0, r0, #28
+; ARM-NEXT: asrs r4, r0, #31
+; ARM-NEXT: lsls r1, r1, #28
+; ARM-NEXT: asrs r2, r1, #28
+; ARM-NEXT: asrs r3, r1, #31
+; ARM-NEXT: mov r1, r4
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: cmp r1, #0
+; ARM-NEXT: bmi .LBB5_2
+; ARM-NEXT: @ %bb.1:
+; ARM-NEXT: ldr r2, .LCPI5_0
+; ARM-NEXT: b .LBB5_3
+; ARM-NEXT: .LBB5_2:
+; ARM-NEXT: movs r2, #1
+; ARM-NEXT: lsls r2, r2, #31
+; ARM-NEXT: .LBB5_3:
+; ARM-NEXT: asrs r3, r0, #31
+; ARM-NEXT: cmp r1, r3
+; ARM-NEXT: bne .LBB5_5
+; ARM-NEXT: @ %bb.4:
+; ARM-NEXT: mov r2, r0
+; ARM-NEXT: .LBB5_5:
+; ARM-NEXT: asrs r0, r2, #28
+; ARM-NEXT: pop {r4, pc}
+; ARM-NEXT: .p2align 2
+; ARM-NEXT: @ %bb.6:
+; ARM-NEXT: .LCPI5_0:
+; ARM-NEXT: .long 2147483647 @ 0x7fffffff
+ %tmp = call i4 @llvm.smul.fix.sat.i4(i4 %x, i4 %y, i32 0)
+ ret i4 %tmp
+}
+
+define i64 @func7(i64 %x, i64 %y) nounwind {
+; ARM-LABEL: func7:
+; ARM: @ %bb.0:
+; ARM-NEXT: .save {r4, r5, r6, r7, lr}
+; ARM-NEXT: push {r4, r5, r6, r7, lr}
+; ARM-NEXT: .pad #28
+; ARM-NEXT: sub sp, #28
+; ARM-NEXT: str r3, [sp, #24] @ 4-byte Spill
+; ARM-NEXT: mov r5, r1
+; ARM-NEXT: str r1, [sp, #20] @ 4-byte Spill
+; ARM-NEXT: movs r6, #0
+; ARM-NEXT: mov r7, r0
+; ARM-NEXT: str r0, [sp, #16] @ 4-byte Spill
+; ARM-NEXT: mov r1, r6
+; ARM-NEXT: mov r4, r2
+; ARM-NEXT: str r2, [sp, #12] @ 4-byte Spill
+; ARM-NEXT: mov r3, r6
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: str r1, [sp, #8] @ 4-byte Spill
+; ARM-NEXT: mov r0, r5
+; ARM-NEXT: mov r1, r6
+; ARM-NEXT: mov r2, r4
+; ARM-NEXT: mov r3, r6
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r5, r1
+; ARM-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
+; ARM-NEXT: adds r4, r0, r1
+; ARM-NEXT: adcs r5, r6
+; ARM-NEXT: mov r0, r7
+; ARM-NEXT: mov r1, r6
+; ARM-NEXT: ldr r7, [sp, #24] @ 4-byte Reload
+; ARM-NEXT: mov r2, r7
+; ARM-NEXT: mov r3, r6
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: adds r0, r0, r4
+; ARM-NEXT: str r0, [sp, #4] @ 4-byte Spill
+; ARM-NEXT: adcs r1, r6
+; ARM-NEXT: adds r0, r5, r1
+; ARM-NEXT: str r0, [sp, #8] @ 4-byte Spill
+; ARM-NEXT: mov r4, r6
+; ARM-NEXT: adcs r4, r6
+; ARM-NEXT: ldr r5, [sp, #20] @ 4-byte Reload
+; ARM-NEXT: mov r0, r5
+; ARM-NEXT: mov r1, r6
+; ARM-NEXT: mov r2, r7
+; ARM-NEXT: mov r3, r6
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r7, r1
+; ARM-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
+; ARM-NEXT: adds r0, r0, r1
+; ARM-NEXT: str r0, [sp, #8] @ 4-byte Spill
+; ARM-NEXT: adcs r7, r4
+; ARM-NEXT: asrs r2, r5, #31
+; ARM-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
+; ARM-NEXT: ldr r5, [sp, #24] @ 4-byte Reload
+; ARM-NEXT: mov r1, r5
+; ARM-NEXT: mov r3, r2
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r4, r0
+; ARM-NEXT: str r1, [sp, #12] @ 4-byte Spill
+; ARM-NEXT: asrs r0, r5, #31
+; ARM-NEXT: mov r1, r0
+; ARM-NEXT: ldr r2, [sp, #16] @ 4-byte Reload
+; ARM-NEXT: ldr r3, [sp, #20] @ 4-byte Reload
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r2, r1
+; ARM-NEXT: adds r0, r0, r4
+; ARM-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; ARM-NEXT: adcs r2, r1
+; ARM-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
+; ARM-NEXT: adds r0, r1, r0
+; ARM-NEXT: adcs r2, r7
+; ARM-NEXT: rsbs r5, r2, #0
+; ARM-NEXT: adcs r5, r2
+; ARM-NEXT: movs r4, #1
+; ARM-NEXT: str r0, [sp, #24] @ 4-byte Spill
+; ARM-NEXT: cmp r0, #0
+; ARM-NEXT: mov r3, r4
+; ARM-NEXT: bmi .LBB6_2
+; ARM-NEXT: @ %bb.1:
+; ARM-NEXT: mov r3, r6
+; ARM-NEXT: .LBB6_2:
+; ARM-NEXT: ands r5, r3
+; ARM-NEXT: cmp r2, #0
+; ARM-NEXT: mov r1, r4
+; ARM-NEXT: mov r3, r4
+; ARM-NEXT: bgt .LBB6_4
+; ARM-NEXT: @ %bb.3:
+; ARM-NEXT: mov r3, r6
+; ARM-NEXT: .LBB6_4:
+; ARM-NEXT: orrs r3, r5
+; ARM-NEXT: mvns r4, r6
+; ARM-NEXT: cmp r3, #0
+; ARM-NEXT: mov r5, r4
+; ARM-NEXT: bne .LBB6_6
+; ARM-NEXT: @ %bb.5:
+; ARM-NEXT: ldr r5, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: .LBB6_6:
+; ARM-NEXT: adds r0, r2, #1
+; ARM-NEXT: rsbs r7, r0, #0
+; ARM-NEXT: adcs r7, r0
+; ARM-NEXT: ldr r0, [sp, #24] @ 4-byte Reload
+; ARM-NEXT: cmp r0, #0
+; ARM-NEXT: mov r0, r1
+; ARM-NEXT: bge .LBB6_8
+; ARM-NEXT: @ %bb.7:
+; ARM-NEXT: mov r0, r6
+; ARM-NEXT: .LBB6_8:
+; ARM-NEXT: ands r7, r0
+; ARM-NEXT: cmp r2, r4
+; ARM-NEXT: mov r0, r1
+; ARM-NEXT: mov r2, r1
+; ARM-NEXT: bge .LBB6_12
+; ARM-NEXT: @ %bb.9:
+; ARM-NEXT: orrs r2, r7
+; ARM-NEXT: beq .LBB6_13
+; ARM-NEXT: .LBB6_10:
+; ARM-NEXT: cmp r3, #0
+; ARM-NEXT: bne .LBB6_14
+; ARM-NEXT: .LBB6_11:
+; ARM-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
+; ARM-NEXT: cmp r2, #0
+; ARM-NEXT: bne .LBB6_15
+; ARM-NEXT: b .LBB6_16
+; ARM-NEXT: .LBB6_12:
+; ARM-NEXT: mov r2, r6
+; ARM-NEXT: orrs r2, r7
+; ARM-NEXT: bne .LBB6_10
+; ARM-NEXT: .LBB6_13:
+; ARM-NEXT: mov r6, r5
+; ARM-NEXT: cmp r3, #0
+; ARM-NEXT: beq .LBB6_11
+; ARM-NEXT: .LBB6_14:
+; ARM-NEXT: ldr r1, .LCPI6_0
+; ARM-NEXT: cmp r2, #0
+; ARM-NEXT: beq .LBB6_16
+; ARM-NEXT: .LBB6_15:
+; ARM-NEXT: lsls r1, r0, #31
+; ARM-NEXT: .LBB6_16:
+; ARM-NEXT: mov r0, r6
+; ARM-NEXT: add sp, #28
+; ARM-NEXT: pop {r4, r5, r6, r7, pc}
+; ARM-NEXT: .p2align 2
+; ARM-NEXT: @ %bb.17:
+; ARM-NEXT: .LCPI6_0:
+; ARM-NEXT: .long 2147483647 @ 0x7fffffff
+ %tmp = call i64 @llvm.smul.fix.sat.i64(i64 %x, i64 %y, i32 32)
+ ret i64 %tmp
+}
+
+define i64 @func8(i64 %x, i64 %y) nounwind {
+; ARM-LABEL: func8:
+; ARM: @ %bb.0:
+; ARM-NEXT: .save {r4, r5, r6, r7, lr}
+; ARM-NEXT: push {r4, r5, r6, r7, lr}
+; ARM-NEXT: .pad #28
+; ARM-NEXT: sub sp, #28
+; ARM-NEXT: str r3, [sp, #24] @ 4-byte Spill
+; ARM-NEXT: mov r5, r2
+; ARM-NEXT: str r2, [sp, #12] @ 4-byte Spill
+; ARM-NEXT: mov r4, r1
+; ARM-NEXT: str r1, [sp, #20] @ 4-byte Spill
+; ARM-NEXT: movs r7, #0
+; ARM-NEXT: mov r6, r0
+; ARM-NEXT: str r0, [sp, #16] @ 4-byte Spill
+; ARM-NEXT: mov r1, r7
+; ARM-NEXT: mov r3, r7
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: str r1, [sp, #8] @ 4-byte Spill
+; ARM-NEXT: mov r0, r4
+; ARM-NEXT: mov r1, r7
+; ARM-NEXT: mov r2, r5
+; ARM-NEXT: mov r3, r7
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r5, r1
+; ARM-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
+; ARM-NEXT: adds r4, r0, r1
+; ARM-NEXT: adcs r5, r7
+; ARM-NEXT: mov r0, r6
+; ARM-NEXT: mov r1, r7
+; ARM-NEXT: ldr r6, [sp, #24] @ 4-byte Reload
+; ARM-NEXT: mov r2, r6
+; ARM-NEXT: mov r3, r7
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: adds r0, r0, r4
+; ARM-NEXT: str r0, [sp, #4] @ 4-byte Spill
+; ARM-NEXT: adcs r1, r7
+; ARM-NEXT: adds r0, r5, r1
+; ARM-NEXT: str r0, [sp, #8] @ 4-byte Spill
+; ARM-NEXT: mov r4, r7
+; ARM-NEXT: adcs r4, r7
+; ARM-NEXT: ldr r5, [sp, #20] @ 4-byte Reload
+; ARM-NEXT: mov r0, r5
+; ARM-NEXT: mov r1, r7
+; ARM-NEXT: mov r2, r6
+; ARM-NEXT: mov r3, r7
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r6, r1
+; ARM-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
+; ARM-NEXT: adds r0, r0, r1
+; ARM-NEXT: str r0, [sp, #8] @ 4-byte Spill
+; ARM-NEXT: adcs r6, r4
+; ARM-NEXT: asrs r2, r5, #31
+; ARM-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
+; ARM-NEXT: ldr r4, [sp, #24] @ 4-byte Reload
+; ARM-NEXT: mov r1, r4
+; ARM-NEXT: mov r3, r2
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r5, r0
+; ARM-NEXT: str r1, [sp, #12] @ 4-byte Spill
+; ARM-NEXT: asrs r0, r4, #31
+; ARM-NEXT: mov r1, r0
+; ARM-NEXT: ldr r2, [sp, #16] @ 4-byte Reload
+; ARM-NEXT: ldr r3, [sp, #20] @ 4-byte Reload
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: adds r0, r0, r5
+; ARM-NEXT: ldr r2, [sp, #12] @ 4-byte Reload
+; ARM-NEXT: adcs r1, r2
+; ARM-NEXT: ldr r2, [sp, #8] @ 4-byte Reload
+; ARM-NEXT: adds r0, r2, r0
+; ARM-NEXT: adcs r1, r6
+; ARM-NEXT: ldr r2, .LCPI7_0
+; ARM-NEXT: cmp r1, r2
+; ARM-NEXT: bgt .LBB7_2
+; ARM-NEXT: @ %bb.1:
+; ARM-NEXT: lsls r3, r0, #1
+; ARM-NEXT: ldr r4, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: lsrs r4, r4, #31
+; ARM-NEXT: adds r5, r3, r4
+; ARM-NEXT: b .LBB7_3
+; ARM-NEXT: .LBB7_2:
+; ARM-NEXT: mvns r5, r7
+; ARM-NEXT: .LBB7_3:
+; ARM-NEXT: movs r3, #3
+; ARM-NEXT: lsls r3, r3, #30
+; ARM-NEXT: cmp r1, r3
+; ARM-NEXT: blt .LBB7_5
+; ARM-NEXT: @ %bb.4:
+; ARM-NEXT: mov r7, r5
+; ARM-NEXT: .LBB7_5:
+; ARM-NEXT: cmp r1, r2
+; ARM-NEXT: bgt .LBB7_7
+; ARM-NEXT: @ %bb.6:
+; ARM-NEXT: lsls r2, r1, #1
+; ARM-NEXT: lsrs r0, r0, #31
+; ARM-NEXT: adds r2, r2, r0
+; ARM-NEXT: cmp r1, r3
+; ARM-NEXT: blt .LBB7_8
+; ARM-NEXT: b .LBB7_9
+; ARM-NEXT: .LBB7_7:
+; ARM-NEXT: ldr r2, .LCPI7_1
+; ARM-NEXT: cmp r1, r3
+; ARM-NEXT: bge .LBB7_9
+; ARM-NEXT: .LBB7_8:
+; ARM-NEXT: movs r0, #1
+; ARM-NEXT: lsls r2, r0, #31
+; ARM-NEXT: .LBB7_9:
+; ARM-NEXT: mov r0, r7
+; ARM-NEXT: mov r1, r2
+; ARM-NEXT: add sp, #28
+; ARM-NEXT: pop {r4, r5, r6, r7, pc}
+; ARM-NEXT: .p2align 2
+; ARM-NEXT: @ %bb.10:
+; ARM-NEXT: .LCPI7_0:
+; ARM-NEXT: .long 1073741823 @ 0x3fffffff
+; ARM-NEXT: .LCPI7_1:
+; ARM-NEXT: .long 2147483647 @ 0x7fffffff
+ %tmp = call i64 @llvm.smul.fix.sat.i64(i64 %x, i64 %y, i32 63)
+ ret i64 %tmp
+}
diff --git a/llvm/test/CodeGen/Thumb/umul_fix.ll b/llvm/test/CodeGen/Thumb/umul_fix.ll
new file mode 100644
index 000000000000000..7af5775c61d7bf4
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb/umul_fix.ll
@@ -0,0 +1,375 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=thumbv6m-none-unknown-eabi -mcpu=cortex-m0 | FileCheck %s --check-prefix=ARM
+
+declare i4 @llvm.umul.fix.i4 (i4, i4, i32)
+declare i32 @llvm.umul.fix.i32 (i32, i32, i32)
+declare i64 @llvm.umul.fix.i64 (i64, i64, i32)
+
+define i32 @func(i32 %x, i32 %y) nounwind {
+; ARM-LABEL: func:
+; ARM: @ %bb.0:
+; ARM-NEXT: .save {r7, lr}
+; ARM-NEXT: push {r7, lr}
+; ARM-NEXT: mov r2, r1
+; ARM-NEXT: movs r1, #0
+; ARM-NEXT: mov r3, r1
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: lsrs r0, r0, #2
+; ARM-NEXT: lsls r1, r1, #30
+; ARM-NEXT: adds r0, r1, r0
+; ARM-NEXT: pop {r7, pc}
+ %tmp = call i32 @llvm.umul.fix.i32(i32 %x, i32 %y, i32 2)
+ ret i32 %tmp
+}
+
+define i64 @func2(i64 %x, i64 %y) nounwind {
+; ARM-LABEL: func2:
+; ARM: @ %bb.0:
+; ARM-NEXT: .save {r4, r5, r6, r7, lr}
+; ARM-NEXT: push {r4, r5, r6, r7, lr}
+; ARM-NEXT: .pad #28
+; ARM-NEXT: sub sp, #28
+; ARM-NEXT: str r3, [sp, #8] @ 4-byte Spill
+; ARM-NEXT: mov r4, r1
+; ARM-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; ARM-NEXT: movs r6, #0
+; ARM-NEXT: mov r5, r0
+; ARM-NEXT: str r0, [sp, #12] @ 4-byte Spill
+; ARM-NEXT: mov r1, r6
+; ARM-NEXT: mov r7, r2
+; ARM-NEXT: str r2, [sp, #20] @ 4-byte Spill
+; ARM-NEXT: mov r3, r6
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: str r0, [sp, #24] @ 4-byte Spill
+; ARM-NEXT: str r1, [sp, #16] @ 4-byte Spill
+; ARM-NEXT: mov r0, r4
+; ARM-NEXT: mov r1, r6
+; ARM-NEXT: mov r2, r7
+; ARM-NEXT: mov r3, r6
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r4, r1
+; ARM-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
+; ARM-NEXT: adds r0, r0, r1
+; ARM-NEXT: str r0, [sp, #16] @ 4-byte Spill
+; ARM-NEXT: adcs r4, r6
+; ARM-NEXT: mov r0, r5
+; ARM-NEXT: mov r1, r6
+; ARM-NEXT: ldr r5, [sp, #8] @ 4-byte Reload
+; ARM-NEXT: mov r2, r5
+; ARM-NEXT: mov r3, r6
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r7, r1
+; ARM-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
+; ARM-NEXT: adds r0, r0, r1
+; ARM-NEXT: str r0, [sp, #16] @ 4-byte Spill
+; ARM-NEXT: adcs r7, r4
+; ARM-NEXT: ldr r4, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: mov r0, r4
+; ARM-NEXT: mov r1, r6
+; ARM-NEXT: mov r2, r5
+; ARM-NEXT: mov r3, r6
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: adds r7, r0, r7
+; ARM-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
+; ARM-NEXT: mov r1, r5
+; ARM-NEXT: mov r2, r6
+; ARM-NEXT: mov r3, r6
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r5, r0
+; ARM-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
+; ARM-NEXT: mov r1, r4
+; ARM-NEXT: mov r2, r6
+; ARM-NEXT: mov r3, r6
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: adds r0, r0, r5
+; ARM-NEXT: adds r0, r7, r0
+; ARM-NEXT: lsls r0, r0, #30
+; ARM-NEXT: ldr r2, [sp, #16] @ 4-byte Reload
+; ARM-NEXT: lsrs r1, r2, #2
+; ARM-NEXT: adds r1, r0, r1
+; ARM-NEXT: lsls r0, r2, #30
+; ARM-NEXT: ldr r2, [sp, #24] @ 4-byte Reload
+; ARM-NEXT: lsrs r2, r2, #2
+; ARM-NEXT: adds r0, r0, r2
+; ARM-NEXT: add sp, #28
+; ARM-NEXT: pop {r4, r5, r6, r7, pc}
+ %tmp = call i64 @llvm.umul.fix.i64(i64 %x, i64 %y, i32 2)
+ ret i64 %tmp
+}
+
+define i4 @func3(i4 %x, i4 %y) nounwind {
+; ARM-LABEL: func3:
+; ARM: @ %bb.0:
+; ARM-NEXT: .save {r7, lr}
+; ARM-NEXT: push {r7, lr}
+; ARM-NEXT: movs r2, #15
+; ARM-NEXT: ands r0, r2
+; ARM-NEXT: ands r2, r1
+; ARM-NEXT: movs r1, #0
+; ARM-NEXT: mov r3, r1
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: lsrs r0, r0, #2
+; ARM-NEXT: lsls r1, r1, #30
+; ARM-NEXT: adds r0, r1, r0
+; ARM-NEXT: pop {r7, pc}
+ %tmp = call i4 @llvm.umul.fix.i4(i4 %x, i4 %y, i32 2)
+ ret i4 %tmp
+}
+
+;; These result in regular integer multiplication
+define i32 @func4(i32 %x, i32 %y) nounwind {
+; ARM-LABEL: func4:
+; ARM: @ %bb.0:
+; ARM-NEXT: muls r0, r1, r0
+; ARM-NEXT: bx lr
+ %tmp = call i32 @llvm.umul.fix.i32(i32 %x, i32 %y, i32 0)
+ ret i32 %tmp
+}
+
+define i64 @func5(i64 %x, i64 %y) nounwind {
+; ARM-LABEL: func5:
+; ARM: @ %bb.0:
+; ARM-NEXT: .save {r7, lr}
+; ARM-NEXT: push {r7, lr}
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: pop {r7, pc}
+ %tmp = call i64 @llvm.umul.fix.i64(i64 %x, i64 %y, i32 0)
+ ret i64 %tmp
+}
+
+define i4 @func6(i4 %x, i4 %y) nounwind {
+; ARM-LABEL: func6:
+; ARM: @ %bb.0:
+; ARM-NEXT: movs r2, #15
+; ARM-NEXT: ands r1, r2
+; ARM-NEXT: ands r0, r2
+; ARM-NEXT: muls r0, r1, r0
+; ARM-NEXT: bx lr
+ %tmp = call i4 @llvm.umul.fix.i4(i4 %x, i4 %y, i32 0)
+ ret i4 %tmp
+}
+
+define i64 @func7(i64 %x, i64 %y) nounwind {
+; ARM-LABEL: func7:
+; ARM: @ %bb.0:
+; ARM-NEXT: .save {r4, r5, r6, r7, lr}
+; ARM-NEXT: push {r4, r5, r6, r7, lr}
+; ARM-NEXT: .pad #20
+; ARM-NEXT: sub sp, #20
+; ARM-NEXT: str r3, [sp, #4] @ 4-byte Spill
+; ARM-NEXT: mov r7, r1
+; ARM-NEXT: str r1, [sp] @ 4-byte Spill
+; ARM-NEXT: movs r5, #0
+; ARM-NEXT: mov r4, r0
+; ARM-NEXT: str r0, [sp, #8] @ 4-byte Spill
+; ARM-NEXT: mov r1, r5
+; ARM-NEXT: mov r6, r2
+; ARM-NEXT: str r2, [sp, #16] @ 4-byte Spill
+; ARM-NEXT: mov r3, r5
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: str r1, [sp, #12] @ 4-byte Spill
+; ARM-NEXT: mov r0, r7
+; ARM-NEXT: mov r1, r5
+; ARM-NEXT: mov r2, r6
+; ARM-NEXT: mov r3, r5
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r7, r1
+; ARM-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; ARM-NEXT: adds r0, r0, r1
+; ARM-NEXT: str r0, [sp, #12] @ 4-byte Spill
+; ARM-NEXT: adcs r7, r5
+; ARM-NEXT: mov r0, r4
+; ARM-NEXT: mov r1, r5
+; ARM-NEXT: ldr r4, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: mov r2, r4
+; ARM-NEXT: mov r3, r5
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r6, r1
+; ARM-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; ARM-NEXT: adds r0, r0, r1
+; ARM-NEXT: str r0, [sp, #12] @ 4-byte Spill
+; ARM-NEXT: adcs r6, r7
+; ARM-NEXT: ldr r7, [sp] @ 4-byte Reload
+; ARM-NEXT: mov r0, r7
+; ARM-NEXT: mov r1, r5
+; ARM-NEXT: mov r2, r4
+; ARM-NEXT: mov r3, r5
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: adds r6, r0, r6
+; ARM-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
+; ARM-NEXT: mov r1, r4
+; ARM-NEXT: mov r2, r5
+; ARM-NEXT: mov r3, r5
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r4, r0
+; ARM-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
+; ARM-NEXT: mov r1, r7
+; ARM-NEXT: mov r2, r5
+; ARM-NEXT: mov r3, r5
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: adds r0, r0, r4
+; ARM-NEXT: adds r1, r6, r0
+; ARM-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
+; ARM-NEXT: add sp, #20
+; ARM-NEXT: pop {r4, r5, r6, r7, pc}
+ %tmp = call i64 @llvm.umul.fix.i64(i64 %x, i64 %y, i32 32)
+ ret i64 %tmp
+}
+
+define i64 @func8(i64 %x, i64 %y) nounwind {
+; ARM-LABEL: func8:
+; ARM: @ %bb.0:
+; ARM-NEXT: .save {r4, r5, r6, r7, lr}
+; ARM-NEXT: push {r4, r5, r6, r7, lr}
+; ARM-NEXT: .pad #28
+; ARM-NEXT: sub sp, #28
+; ARM-NEXT: str r3, [sp, #24] @ 4-byte Spill
+; ARM-NEXT: mov r7, r2
+; ARM-NEXT: str r2, [sp, #16] @ 4-byte Spill
+; ARM-NEXT: mov r4, r1
+; ARM-NEXT: str r1, [sp, #8] @ 4-byte Spill
+; ARM-NEXT: movs r5, #0
+; ARM-NEXT: mov r6, r0
+; ARM-NEXT: str r0, [sp, #12] @ 4-byte Spill
+; ARM-NEXT: mov r1, r5
+; ARM-NEXT: mov r3, r5
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: str r1, [sp, #20] @ 4-byte Spill
+; ARM-NEXT: mov r0, r4
+; ARM-NEXT: mov r1, r5
+; ARM-NEXT: mov r2, r7
+; ARM-NEXT: mov r3, r5
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r7, r1
+; ARM-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
+; ARM-NEXT: adds r4, r0, r1
+; ARM-NEXT: adcs r7, r5
+; ARM-NEXT: mov r0, r6
+; ARM-NEXT: mov r1, r5
+; ARM-NEXT: ldr r6, [sp, #24] @ 4-byte Reload
+; ARM-NEXT: mov r2, r6
+; ARM-NEXT: mov r3, r5
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: adds r0, r0, r4
+; ARM-NEXT: str r0, [sp, #20] @ 4-byte Spill
+; ARM-NEXT: adcs r1, r5
+; ARM-NEXT: adds r0, r7, r1
+; ARM-NEXT: str r0, [sp, #4] @ 4-byte Spill
+; ARM-NEXT: mov r4, r5
+; ARM-NEXT: adcs r4, r5
+; ARM-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
+; ARM-NEXT: mov r0, r7
+; ARM-NEXT: mov r1, r5
+; ARM-NEXT: mov r2, r6
+; ARM-NEXT: mov r3, r5
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r6, r1
+; ARM-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: adds r0, r0, r1
+; ARM-NEXT: str r0, [sp, #4] @ 4-byte Spill
+; ARM-NEXT: adcs r6, r4
+; ARM-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
+; ARM-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
+; ARM-NEXT: mov r2, r5
+; ARM-NEXT: mov r3, r5
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r4, r0
+; ARM-NEXT: str r1, [sp, #24] @ 4-byte Spill
+; ARM-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
+; ARM-NEXT: mov r1, r7
+; ARM-NEXT: mov r2, r5
+; ARM-NEXT: mov r3, r5
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: adds r0, r0, r4
+; ARM-NEXT: ldr r2, [sp, #24] @ 4-byte Reload
+; ARM-NEXT: adcs r1, r2
+; ARM-NEXT: ldr r2, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: adds r0, r2, r0
+; ARM-NEXT: adcs r1, r6
+; ARM-NEXT: lsls r1, r1, #1
+; ARM-NEXT: lsrs r2, r0, #31
+; ARM-NEXT: adds r1, r1, r2
+; ARM-NEXT: lsls r0, r0, #1
+; ARM-NEXT: ldr r2, [sp, #20] @ 4-byte Reload
+; ARM-NEXT: lsrs r2, r2, #31
+; ARM-NEXT: adds r0, r0, r2
+; ARM-NEXT: add sp, #28
+; ARM-NEXT: pop {r4, r5, r6, r7, pc}
+ %tmp = call i64 @llvm.umul.fix.i64(i64 %x, i64 %y, i32 63)
+ ret i64 %tmp
+}
+
+define i64 @func9(i64 %x, i64 %y) nounwind {
+; ARM-LABEL: func9:
+; ARM: @ %bb.0:
+; ARM-NEXT: .save {r4, r5, r6, r7, lr}
+; ARM-NEXT: push {r4, r5, r6, r7, lr}
+; ARM-NEXT: .pad #20
+; ARM-NEXT: sub sp, #20
+; ARM-NEXT: str r3, [sp, #16] @ 4-byte Spill
+; ARM-NEXT: mov r7, r2
+; ARM-NEXT: str r2, [sp, #12] @ 4-byte Spill
+; ARM-NEXT: mov r4, r1
+; ARM-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; ARM-NEXT: movs r5, #0
+; ARM-NEXT: mov r6, r0
+; ARM-NEXT: str r0, [sp, #8] @ 4-byte Spill
+; ARM-NEXT: mov r1, r5
+; ARM-NEXT: mov r3, r5
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: str r1, [sp] @ 4-byte Spill
+; ARM-NEXT: mov r0, r4
+; ARM-NEXT: mov r1, r5
+; ARM-NEXT: mov r2, r7
+; ARM-NEXT: mov r3, r5
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r7, r1
+; ARM-NEXT: ldr r1, [sp] @ 4-byte Reload
+; ARM-NEXT: adds r4, r0, r1
+; ARM-NEXT: adcs r7, r5
+; ARM-NEXT: mov r0, r6
+; ARM-NEXT: mov r1, r5
+; ARM-NEXT: ldr r6, [sp, #16] @ 4-byte Reload
+; ARM-NEXT: mov r2, r6
+; ARM-NEXT: mov r3, r5
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: adds r0, r0, r4
+; ARM-NEXT: adcs r1, r5
+; ARM-NEXT: adds r0, r7, r1
+; ARM-NEXT: str r0, [sp] @ 4-byte Spill
+; ARM-NEXT: mov r4, r5
+; ARM-NEXT: adcs r4, r5
+; ARM-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: mov r0, r7
+; ARM-NEXT: mov r1, r5
+; ARM-NEXT: mov r2, r6
+; ARM-NEXT: mov r3, r5
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r6, r1
+; ARM-NEXT: ldr r1, [sp] @ 4-byte Reload
+; ARM-NEXT: adds r0, r0, r1
+; ARM-NEXT: str r0, [sp] @ 4-byte Spill
+; ARM-NEXT: adcs r6, r4
+; ARM-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
+; ARM-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
+; ARM-NEXT: mov r2, r5
+; ARM-NEXT: mov r3, r5
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r4, r0
+; ARM-NEXT: str r1, [sp, #16] @ 4-byte Spill
+; ARM-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
+; ARM-NEXT: mov r1, r7
+; ARM-NEXT: mov r2, r5
+; ARM-NEXT: mov r3, r5
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: adds r0, r0, r4
+; ARM-NEXT: ldr r2, [sp, #16] @ 4-byte Reload
+; ARM-NEXT: adcs r1, r2
+; ARM-NEXT: ldr r2, [sp] @ 4-byte Reload
+; ARM-NEXT: adds r0, r2, r0
+; ARM-NEXT: adcs r1, r6
+; ARM-NEXT: add sp, #20
+; ARM-NEXT: pop {r4, r5, r6, r7, pc}
+ %tmp = call i64 @llvm.umul.fix.i64(i64 %x, i64 %y, i32 64)
+ ret i64 %tmp
+}
diff --git a/llvm/test/CodeGen/Thumb/umul_fix_sat.ll b/llvm/test/CodeGen/Thumb/umul_fix_sat.ll
new file mode 100644
index 000000000000000..fa88024315211b8
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb/umul_fix_sat.ll
@@ -0,0 +1,519 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=thumbv6m-none-unknown-eabi -mcpu=cortex-m0 | FileCheck %s --check-prefix=ARM
+
+declare i4 @llvm.umul.fix.sat.i4 (i4, i4, i32)
+declare i32 @llvm.umul.fix.sat.i32 (i32, i32, i32)
+declare i64 @llvm.umul.fix.sat.i64 (i64, i64, i32)
+
+define i32 @func(i32 %x, i32 %y) nounwind {
+; ARM-LABEL: func:
+; ARM: @ %bb.0:
+; ARM-NEXT: .save {r4, lr}
+; ARM-NEXT: push {r4, lr}
+; ARM-NEXT: mov r2, r1
+; ARM-NEXT: movs r4, #0
+; ARM-NEXT: mov r1, r4
+; ARM-NEXT: mov r3, r4
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: cmp r1, #3
+; ARM-NEXT: bhi .LBB0_2
+; ARM-NEXT: @ %bb.1:
+; ARM-NEXT: lsrs r0, r0, #2
+; ARM-NEXT: lsls r1, r1, #30
+; ARM-NEXT: adds r0, r1, r0
+; ARM-NEXT: pop {r4, pc}
+; ARM-NEXT: .LBB0_2:
+; ARM-NEXT: mvns r0, r4
+; ARM-NEXT: pop {r4, pc}
+ %tmp = call i32 @llvm.umul.fix.sat.i32(i32 %x, i32 %y, i32 2)
+ ret i32 %tmp
+}
+
+define i64 @func2(i64 %x, i64 %y) nounwind {
+; ARM-LABEL: func2:
+; ARM: @ %bb.0:
+; ARM-NEXT: .save {r4, r5, r6, r7, lr}
+; ARM-NEXT: push {r4, r5, r6, r7, lr}
+; ARM-NEXT: .pad #28
+; ARM-NEXT: sub sp, #28
+; ARM-NEXT: str r3, [sp, #24] @ 4-byte Spill
+; ARM-NEXT: mov r5, r1
+; ARM-NEXT: str r1, [sp, #8] @ 4-byte Spill
+; ARM-NEXT: movs r4, #0
+; ARM-NEXT: mov r6, r0
+; ARM-NEXT: str r0, [sp, #12] @ 4-byte Spill
+; ARM-NEXT: mov r1, r4
+; ARM-NEXT: mov r7, r2
+; ARM-NEXT: str r2, [sp, #16] @ 4-byte Spill
+; ARM-NEXT: mov r3, r4
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: str r0, [sp] @ 4-byte Spill
+; ARM-NEXT: str r1, [sp, #20] @ 4-byte Spill
+; ARM-NEXT: mov r0, r5
+; ARM-NEXT: mov r1, r4
+; ARM-NEXT: mov r2, r7
+; ARM-NEXT: mov r3, r4
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r5, r1
+; ARM-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
+; ARM-NEXT: adds r0, r0, r1
+; ARM-NEXT: str r0, [sp, #20] @ 4-byte Spill
+; ARM-NEXT: adcs r5, r4
+; ARM-NEXT: mov r0, r6
+; ARM-NEXT: mov r1, r4
+; ARM-NEXT: ldr r7, [sp, #24] @ 4-byte Reload
+; ARM-NEXT: mov r2, r7
+; ARM-NEXT: mov r3, r4
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: ldr r2, [sp, #20] @ 4-byte Reload
+; ARM-NEXT: adds r0, r0, r2
+; ARM-NEXT: str r0, [sp, #20] @ 4-byte Spill
+; ARM-NEXT: adcs r1, r4
+; ARM-NEXT: adds r0, r5, r1
+; ARM-NEXT: str r0, [sp, #4] @ 4-byte Spill
+; ARM-NEXT: mov r6, r4
+; ARM-NEXT: adcs r6, r4
+; ARM-NEXT: ldr r5, [sp, #8] @ 4-byte Reload
+; ARM-NEXT: mov r0, r5
+; ARM-NEXT: mov r1, r4
+; ARM-NEXT: mov r2, r7
+; ARM-NEXT: mov r3, r4
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r7, r1
+; ARM-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: adds r0, r0, r1
+; ARM-NEXT: str r0, [sp, #4] @ 4-byte Spill
+; ARM-NEXT: adcs r7, r6
+; ARM-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
+; ARM-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
+; ARM-NEXT: mov r2, r4
+; ARM-NEXT: mov r3, r4
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r6, r0
+; ARM-NEXT: str r1, [sp, #24] @ 4-byte Spill
+; ARM-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
+; ARM-NEXT: mov r1, r5
+; ARM-NEXT: mov r2, r4
+; ARM-NEXT: mov r3, r4
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: adds r0, r0, r6
+; ARM-NEXT: ldr r2, [sp, #24] @ 4-byte Reload
+; ARM-NEXT: adcs r1, r2
+; ARM-NEXT: ldr r2, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: adds r2, r2, r0
+; ARM-NEXT: adcs r1, r7
+; ARM-NEXT: lsrs r3, r2, #2
+; ARM-NEXT: orrs r3, r1
+; ARM-NEXT: mvns r1, r4
+; ARM-NEXT: cmp r3, #0
+; ARM-NEXT: mov r0, r1
+; ARM-NEXT: beq .LBB1_3
+; ARM-NEXT: @ %bb.1:
+; ARM-NEXT: cmp r3, #0
+; ARM-NEXT: beq .LBB1_4
+; ARM-NEXT: .LBB1_2:
+; ARM-NEXT: add sp, #28
+; ARM-NEXT: pop {r4, r5, r6, r7, pc}
+; ARM-NEXT: .LBB1_3:
+; ARM-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
+; ARM-NEXT: lsls r0, r0, #30
+; ARM-NEXT: ldr r4, [sp] @ 4-byte Reload
+; ARM-NEXT: lsrs r4, r4, #2
+; ARM-NEXT: adds r0, r0, r4
+; ARM-NEXT: cmp r3, #0
+; ARM-NEXT: bne .LBB1_2
+; ARM-NEXT: .LBB1_4:
+; ARM-NEXT: lsls r1, r2, #30
+; ARM-NEXT: ldr r2, [sp, #20] @ 4-byte Reload
+; ARM-NEXT: lsrs r2, r2, #2
+; ARM-NEXT: adds r1, r1, r2
+; ARM-NEXT: add sp, #28
+; ARM-NEXT: pop {r4, r5, r6, r7, pc}
+ %tmp = call i64 @llvm.umul.fix.sat.i64(i64 %x, i64 %y, i32 2)
+ ret i64 %tmp
+}
+
+define i4 @func3(i4 %x, i4 %y) nounwind {
+; ARM-LABEL: func3:
+; ARM: @ %bb.0:
+; ARM-NEXT: .save {r4, lr}
+; ARM-NEXT: push {r4, lr}
+; ARM-NEXT: movs r2, #15
+; ARM-NEXT: ands r2, r1
+; ARM-NEXT: lsls r0, r0, #28
+; ARM-NEXT: movs r4, #0
+; ARM-NEXT: mov r1, r4
+; ARM-NEXT: mov r3, r4
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: cmp r1, #3
+; ARM-NEXT: bhi .LBB2_2
+; ARM-NEXT: @ %bb.1:
+; ARM-NEXT: lsrs r0, r0, #2
+; ARM-NEXT: lsls r1, r1, #30
+; ARM-NEXT: adds r0, r1, r0
+; ARM-NEXT: lsrs r0, r0, #28
+; ARM-NEXT: pop {r4, pc}
+; ARM-NEXT: .LBB2_2:
+; ARM-NEXT: mvns r0, r4
+; ARM-NEXT: lsrs r0, r0, #28
+; ARM-NEXT: pop {r4, pc}
+ %tmp = call i4 @llvm.umul.fix.sat.i4(i4 %x, i4 %y, i32 2)
+ ret i4 %tmp
+}
+
+;; These result in regular integer multiplication with a saturation check.
+define i32 @func4(i32 %x, i32 %y) nounwind {
+; ARM-LABEL: func4:
+; ARM: @ %bb.0:
+; ARM-NEXT: .save {r4, lr}
+; ARM-NEXT: push {r4, lr}
+; ARM-NEXT: mov r2, r1
+; ARM-NEXT: movs r4, #0
+; ARM-NEXT: mov r1, r4
+; ARM-NEXT: mov r3, r4
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: cmp r1, #0
+; ARM-NEXT: bls .LBB3_2
+; ARM-NEXT: @ %bb.1:
+; ARM-NEXT: mvns r0, r4
+; ARM-NEXT: .LBB3_2:
+; ARM-NEXT: pop {r4, pc}
+ %tmp = call i32 @llvm.umul.fix.sat.i32(i32 %x, i32 %y, i32 0)
+ ret i32 %tmp
+}
+
+define i64 @func5(i64 %x, i64 %y) {
+; ARM-LABEL: func5:
+; ARM: @ %bb.0:
+; ARM-NEXT: .save {r4, r5, r6, r7, lr}
+; ARM-NEXT: push {r4, r5, r6, r7, lr}
+; ARM-NEXT: .pad #12
+; ARM-NEXT: sub sp, #12
+; ARM-NEXT: mov r6, r3
+; ARM-NEXT: str r2, [sp, #8] @ 4-byte Spill
+; ARM-NEXT: mov r4, r1
+; ARM-NEXT: mov r2, r0
+; ARM-NEXT: str r0, [sp, #4] @ 4-byte Spill
+; ARM-NEXT: movs r5, #0
+; ARM-NEXT: mov r0, r3
+; ARM-NEXT: mov r1, r5
+; ARM-NEXT: mov r3, r5
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: str r0, [sp] @ 4-byte Spill
+; ARM-NEXT: mov r7, r1
+; ARM-NEXT: subs r0, r1, #1
+; ARM-NEXT: sbcs r7, r0
+; ARM-NEXT: mov r0, r4
+; ARM-NEXT: mov r1, r5
+; ARM-NEXT: ldr r2, [sp, #8] @ 4-byte Reload
+; ARM-NEXT: mov r3, r5
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: subs r2, r1, #1
+; ARM-NEXT: sbcs r1, r2
+; ARM-NEXT: subs r2, r6, #1
+; ARM-NEXT: sbcs r6, r2
+; ARM-NEXT: subs r2, r4, #1
+; ARM-NEXT: sbcs r4, r2
+; ARM-NEXT: ands r4, r6
+; ARM-NEXT: orrs r4, r1
+; ARM-NEXT: orrs r4, r7
+; ARM-NEXT: ldr r1, [sp] @ 4-byte Reload
+; ARM-NEXT: adds r6, r0, r1
+; ARM-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: mov r1, r5
+; ARM-NEXT: ldr r2, [sp, #8] @ 4-byte Reload
+; ARM-NEXT: mov r3, r5
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: adds r3, r1, r6
+; ARM-NEXT: mov r6, r5
+; ARM-NEXT: adcs r6, r5
+; ARM-NEXT: orrs r6, r4
+; ARM-NEXT: mvns r1, r5
+; ARM-NEXT: cmp r6, #0
+; ARM-NEXT: mov r2, r1
+; ARM-NEXT: bne .LBB4_2
+; ARM-NEXT: @ %bb.1:
+; ARM-NEXT: mov r2, r0
+; ARM-NEXT: .LBB4_2:
+; ARM-NEXT: cmp r6, #0
+; ARM-NEXT: bne .LBB4_4
+; ARM-NEXT: @ %bb.3:
+; ARM-NEXT: mov r1, r3
+; ARM-NEXT: .LBB4_4:
+; ARM-NEXT: mov r0, r2
+; ARM-NEXT: add sp, #12
+; ARM-NEXT: pop {r4, r5, r6, r7, pc}
+ %tmp = call i64 @llvm.umul.fix.sat.i64(i64 %x, i64 %y, i32 0)
+ ret i64 %tmp
+}
+
+define i4 @func6(i4 %x, i4 %y) nounwind {
+; ARM-LABEL: func6:
+; ARM: @ %bb.0:
+; ARM-NEXT: .save {r4, lr}
+; ARM-NEXT: push {r4, lr}
+; ARM-NEXT: movs r2, #15
+; ARM-NEXT: ands r2, r1
+; ARM-NEXT: lsls r0, r0, #28
+; ARM-NEXT: movs r4, #0
+; ARM-NEXT: mov r1, r4
+; ARM-NEXT: mov r3, r4
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: cmp r1, #0
+; ARM-NEXT: bls .LBB5_2
+; ARM-NEXT: @ %bb.1:
+; ARM-NEXT: mvns r0, r4
+; ARM-NEXT: .LBB5_2:
+; ARM-NEXT: lsrs r0, r0, #28
+; ARM-NEXT: pop {r4, pc}
+ %tmp = call i4 @llvm.umul.fix.sat.i4(i4 %x, i4 %y, i32 0)
+ ret i4 %tmp
+}
+
+define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) nounwind {
+; ARM-LABEL: vec2:
+; ARM: @ %bb.0:
+; ARM-NEXT: .save {r4, r5, r6, r7, lr}
+; ARM-NEXT: push {r4, r5, r6, r7, lr}
+; ARM-NEXT: .pad #12
+; ARM-NEXT: sub sp, #12
+; ARM-NEXT: str r3, [sp, #8] @ 4-byte Spill
+; ARM-NEXT: mov r7, r2
+; ARM-NEXT: mov r5, r1
+; ARM-NEXT: ldr r2, [sp, #32]
+; ARM-NEXT: movs r6, #0
+; ARM-NEXT: mov r1, r6
+; ARM-NEXT: mov r3, r6
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mvns r4, r6
+; ARM-NEXT: cmp r1, #0
+; ARM-NEXT: mov r1, r4
+; ARM-NEXT: bhi .LBB6_2
+; ARM-NEXT: @ %bb.1:
+; ARM-NEXT: mov r1, r0
+; ARM-NEXT: .LBB6_2:
+; ARM-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; ARM-NEXT: ldr r2, [sp, #36]
+; ARM-NEXT: mov r0, r5
+; ARM-NEXT: mov r1, r6
+; ARM-NEXT: mov r3, r6
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: cmp r1, #0
+; ARM-NEXT: mov r5, r4
+; ARM-NEXT: bhi .LBB6_4
+; ARM-NEXT: @ %bb.3:
+; ARM-NEXT: mov r5, r0
+; ARM-NEXT: .LBB6_4:
+; ARM-NEXT: ldr r2, [sp, #40]
+; ARM-NEXT: mov r0, r7
+; ARM-NEXT: mov r1, r6
+; ARM-NEXT: mov r3, r6
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: cmp r1, #0
+; ARM-NEXT: mov r7, r4
+; ARM-NEXT: bhi .LBB6_6
+; ARM-NEXT: @ %bb.5:
+; ARM-NEXT: mov r7, r0
+; ARM-NEXT: .LBB6_6:
+; ARM-NEXT: ldr r2, [sp, #44]
+; ARM-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
+; ARM-NEXT: mov r1, r6
+; ARM-NEXT: mov r3, r6
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: cmp r1, #0
+; ARM-NEXT: bhi .LBB6_8
+; ARM-NEXT: @ %bb.7:
+; ARM-NEXT: mov r4, r0
+; ARM-NEXT: .LBB6_8:
+; ARM-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: mov r1, r5
+; ARM-NEXT: mov r2, r7
+; ARM-NEXT: mov r3, r4
+; ARM-NEXT: add sp, #12
+; ARM-NEXT: pop {r4, r5, r6, r7, pc}
+ %tmp = call <4 x i32> @llvm.umul.fix.sat.v4i32(<4 x i32> %x, <4 x i32> %y, i32 0)
+ ret <4 x i32> %tmp
+}
+
+define i64 @func7(i64 %x, i64 %y) nounwind {
+; ARM-LABEL: func7:
+; ARM: @ %bb.0:
+; ARM-NEXT: .save {r4, r5, r6, r7, lr}
+; ARM-NEXT: push {r4, r5, r6, r7, lr}
+; ARM-NEXT: .pad #28
+; ARM-NEXT: sub sp, #28
+; ARM-NEXT: str r3, [sp, #24] @ 4-byte Spill
+; ARM-NEXT: mov r7, r2
+; ARM-NEXT: str r2, [sp, #20] @ 4-byte Spill
+; ARM-NEXT: mov r5, r1
+; ARM-NEXT: str r1, [sp, #12] @ 4-byte Spill
+; ARM-NEXT: movs r4, #0
+; ARM-NEXT: mov r6, r0
+; ARM-NEXT: str r0, [sp, #16] @ 4-byte Spill
+; ARM-NEXT: mov r1, r4
+; ARM-NEXT: mov r3, r4
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: str r1, [sp, #8] @ 4-byte Spill
+; ARM-NEXT: mov r0, r5
+; ARM-NEXT: mov r1, r4
+; ARM-NEXT: mov r2, r7
+; ARM-NEXT: mov r3, r4
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r7, r1
+; ARM-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
+; ARM-NEXT: adds r5, r0, r1
+; ARM-NEXT: adcs r7, r4
+; ARM-NEXT: mov r0, r6
+; ARM-NEXT: mov r1, r4
+; ARM-NEXT: ldr r6, [sp, #24] @ 4-byte Reload
+; ARM-NEXT: mov r2, r6
+; ARM-NEXT: mov r3, r4
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: adds r0, r0, r5
+; ARM-NEXT: str r0, [sp, #4] @ 4-byte Spill
+; ARM-NEXT: adcs r1, r4
+; ARM-NEXT: adds r0, r7, r1
+; ARM-NEXT: str r0, [sp, #8] @ 4-byte Spill
+; ARM-NEXT: mov r5, r4
+; ARM-NEXT: adcs r5, r4
+; ARM-NEXT: ldr r7, [sp, #12] @ 4-byte Reload
+; ARM-NEXT: mov r0, r7
+; ARM-NEXT: mov r1, r4
+; ARM-NEXT: mov r2, r6
+; ARM-NEXT: mov r3, r4
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r6, r1
+; ARM-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
+; ARM-NEXT: adds r0, r0, r1
+; ARM-NEXT: str r0, [sp, #8] @ 4-byte Spill
+; ARM-NEXT: adcs r6, r5
+; ARM-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
+; ARM-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
+; ARM-NEXT: mov r2, r4
+; ARM-NEXT: mov r3, r4
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r5, r0
+; ARM-NEXT: str r1, [sp, #24] @ 4-byte Spill
+; ARM-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
+; ARM-NEXT: mov r1, r7
+; ARM-NEXT: mov r2, r4
+; ARM-NEXT: mov r3, r4
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: adds r0, r0, r5
+; ARM-NEXT: ldr r2, [sp, #24] @ 4-byte Reload
+; ARM-NEXT: adcs r1, r2
+; ARM-NEXT: ldr r2, [sp, #8] @ 4-byte Reload
+; ARM-NEXT: adds r3, r2, r0
+; ARM-NEXT: adcs r1, r6
+; ARM-NEXT: mvns r2, r4
+; ARM-NEXT: cmp r1, #0
+; ARM-NEXT: mov r0, r2
+; ARM-NEXT: bne .LBB7_2
+; ARM-NEXT: @ %bb.1:
+; ARM-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: .LBB7_2:
+; ARM-NEXT: cmp r1, #0
+; ARM-NEXT: bne .LBB7_4
+; ARM-NEXT: @ %bb.3:
+; ARM-NEXT: mov r2, r3
+; ARM-NEXT: .LBB7_4:
+; ARM-NEXT: mov r1, r2
+; ARM-NEXT: add sp, #28
+; ARM-NEXT: pop {r4, r5, r6, r7, pc}
+ %tmp = call i64 @llvm.umul.fix.sat.i64(i64 %x, i64 %y, i32 32)
+ ret i64 %tmp
+}
+
+define i64 @func8(i64 %x, i64 %y) nounwind {
+; ARM-LABEL: func8:
+; ARM: @ %bb.0:
+; ARM-NEXT: .save {r4, r5, r6, r7, lr}
+; ARM-NEXT: push {r4, r5, r6, r7, lr}
+; ARM-NEXT: .pad #28
+; ARM-NEXT: sub sp, #28
+; ARM-NEXT: str r3, [sp, #24] @ 4-byte Spill
+; ARM-NEXT: mov r7, r2
+; ARM-NEXT: str r2, [sp, #20] @ 4-byte Spill
+; ARM-NEXT: mov r5, r1
+; ARM-NEXT: str r1, [sp, #12] @ 4-byte Spill
+; ARM-NEXT: movs r4, #0
+; ARM-NEXT: mov r6, r0
+; ARM-NEXT: str r0, [sp, #16] @ 4-byte Spill
+; ARM-NEXT: mov r1, r4
+; ARM-NEXT: mov r3, r4
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: str r1, [sp, #8] @ 4-byte Spill
+; ARM-NEXT: mov r0, r5
+; ARM-NEXT: mov r1, r4
+; ARM-NEXT: mov r2, r7
+; ARM-NEXT: mov r3, r4
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r7, r1
+; ARM-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
+; ARM-NEXT: adds r5, r0, r1
+; ARM-NEXT: adcs r7, r4
+; ARM-NEXT: mov r0, r6
+; ARM-NEXT: mov r1, r4
+; ARM-NEXT: ldr r6, [sp, #24] @ 4-byte Reload
+; ARM-NEXT: mov r2, r6
+; ARM-NEXT: mov r3, r4
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: adds r0, r0, r5
+; ARM-NEXT: str r0, [sp, #4] @ 4-byte Spill
+; ARM-NEXT: adcs r1, r4
+; ARM-NEXT: adds r0, r7, r1
+; ARM-NEXT: str r0, [sp, #8] @ 4-byte Spill
+; ARM-NEXT: mov r5, r4
+; ARM-NEXT: adcs r5, r4
+; ARM-NEXT: ldr r7, [sp, #12] @ 4-byte Reload
+; ARM-NEXT: mov r0, r7
+; ARM-NEXT: mov r1, r4
+; ARM-NEXT: mov r2, r6
+; ARM-NEXT: mov r3, r4
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r6, r1
+; ARM-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
+; ARM-NEXT: adds r0, r0, r1
+; ARM-NEXT: str r0, [sp, #8] @ 4-byte Spill
+; ARM-NEXT: adcs r6, r5
+; ARM-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
+; ARM-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
+; ARM-NEXT: mov r2, r4
+; ARM-NEXT: mov r3, r4
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r5, r0
+; ARM-NEXT: str r1, [sp, #24] @ 4-byte Spill
+; ARM-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
+; ARM-NEXT: mov r1, r7
+; ARM-NEXT: mov r2, r4
+; ARM-NEXT: mov r3, r4
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: adds r0, r0, r5
+; ARM-NEXT: ldr r2, [sp, #24] @ 4-byte Reload
+; ARM-NEXT: adcs r1, r2
+; ARM-NEXT: ldr r2, [sp, #8] @ 4-byte Reload
+; ARM-NEXT: adds r5, r2, r0
+; ARM-NEXT: adcs r1, r6
+; ARM-NEXT: lsrs r3, r5, #31
+; ARM-NEXT: mvns r2, r4
+; ARM-NEXT: cmp r3, #0
+; ARM-NEXT: mov r0, r2
+; ARM-NEXT: bne .LBB8_2
+; ARM-NEXT: @ %bb.1:
+; ARM-NEXT: lsls r0, r5, #1
+; ARM-NEXT: ldr r4, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: lsrs r4, r4, #31
+; ARM-NEXT: adds r0, r0, r4
+; ARM-NEXT: .LBB8_2:
+; ARM-NEXT: cmp r3, #0
+; ARM-NEXT: bne .LBB8_4
+; ARM-NEXT: @ %bb.3:
+; ARM-NEXT: lsls r1, r1, #1
+; ARM-NEXT: adds r2, r1, r3
+; ARM-NEXT: .LBB8_4:
+; ARM-NEXT: mov r1, r2
+; ARM-NEXT: add sp, #28
+; ARM-NEXT: pop {r4, r5, r6, r7, pc}
+ %tmp = call i64 @llvm.umul.fix.sat.i64(i64 %x, i64 %y, i32 63)
+ ret i64 %tmp
+}
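
A note for readers skimming the generated checks above: each saturating test verifies that the fixed-point multiply is lowered to a double-width product assembled from __aeabi_lmul libcalls, followed by a shift by the scale and a clamp to the maximum value whenever any of the discarded high bits are set. The C sketch below shows the same computation for the unsigned i64 case. It is only an illustration of the math being checked, not code from the patch: the helper names are invented, and the compiler's partial-product schedule differs from this hand-written one.

#include <stdint.h>

/* Full 64x64 -> 128-bit unsigned product built from four 32x32 partial
   products (the generated code reaches the same result via __aeabi_lmul
   calls on zero-extended halves). */
static void umul64_wide(uint64_t a, uint64_t b, uint64_t *lo, uint64_t *hi) {
  uint64_t a_lo = (uint32_t)a, a_hi = a >> 32;
  uint64_t b_lo = (uint32_t)b, b_hi = b >> 32;
  uint64_t p0 = a_lo * b_lo;                              /* bits  0..63  */
  uint64_t p1 = a_lo * b_hi;                              /* bits 32..95  */
  uint64_t p2 = a_hi * b_lo;                              /* bits 32..95  */
  uint64_t p3 = a_hi * b_hi;                              /* bits 64..127 */
  uint64_t mid = (p0 >> 32) + (uint32_t)p1 + (uint32_t)p2;
  *lo = (mid << 32) | (uint32_t)p0;
  *hi = p3 + (p1 >> 32) + (p2 >> 32) + (mid >> 32);
}

/* Saturating unsigned fixed-point multiply with `scale` fractional bits,
   assuming 0 <= scale <= 63 (the range exercised by the tests above). */
static uint64_t umul_fix_sat_64(uint64_t x, uint64_t y, unsigned scale) {
  uint64_t lo, hi;
  umul64_wide(x, y, &lo, &hi);
  /* Overflow iff any product bit at or above position 64 + scale is set. */
  if (hi >> scale)
    return UINT64_MAX;
  return scale ? (hi << (64 - scale)) | (lo >> scale) : lo;
}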