[llvm] [SelectionDAG] Expand fixed point multiplication into libcall (PR #79352)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 24 11:58:18 PST 2024
llvmbot wrote:
@llvm/pr-subscribers-llvm-selectiondag
Author: None (PiJoules)
Changes:
32-bit ARMv6 with Thumb doesn't support MULHS/MUL_LOHI as legal/custom nodes during expansion, which causes fixed-point multiplication of _Accum types to fail. Prior to this patch, fixed-point multiplication happened to work only because the platforms it was used on support MULHS/MUL_LOHI.
This patch instead checks whether the multiplication can be done via libcalls, which the ARM runtime provides.
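To make the semantics concrete, here is a small C++ sketch of what llvm.smul.fix.i32 with a scale of 2 computes (as exercised by the new tests). The helper name is mine, not part of the patch; the point is that the product must be formed at double width before shifting right by the scale, which is why a 64-bit multiply (e.g. __aeabi_lmul on Thumb-1) is needed even for a 32-bit fixed-point multiplication:

```cpp
#include <cassert>
#include <cstdint>

// Illustrative only: models the semantics of llvm.smul.fix.i32(a, b, 2).
int32_t smul_fix_i32(int32_t A, int32_t B, unsigned Scale) {
  int64_t Wide = (int64_t)A * (int64_t)B; // full 64-bit product
  return (int32_t)(Wide >> Scale);        // drop Scale fractional bits
}

int main() {
  // With 2 fractional bits, 6 encodes 1.5 and 10 encodes 2.5;
  // their product 3.75 encodes as 15.
  assert(smul_fix_i32(6, 10, 2) == 15);
  return 0;
}
```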
---
Patch is 73.06 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/79352.diff
7 Files Affected:
- (modified) llvm/include/llvm/CodeGen/TargetLowering.h (+17)
- (modified) llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp (+9-40)
- (modified) llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp (+117-64)
- (added) llvm/test/CodeGen/Thumb/smul_fix.ll (+304)
- (added) llvm/test/CodeGen/Thumb/smul_fix_sat.ll (+690)
- (added) llvm/test/CodeGen/Thumb/umul_fix.ll (+375)
- (added) llvm/test/CodeGen/Thumb/umul_fix_sat.ll (+519)
``````````diff
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index c9492b4cf778b6..b29be069699f0c 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5287,6 +5287,23 @@ class TargetLowering : public TargetLoweringBase {
bool expandMULO(SDNode *Node, SDValue &Result, SDValue &Overflow,
SelectionDAG &DAG) const;
+ /// ForceExpandMUL - Unconditionally expand a MUL into either a libcall or
+ /// brute force involving many multiplications. The expansion works by
+ /// attempting to do a multiplication on a wider type twice the size of the
+ /// original operands. LL and LH represent the lower and upper halves of the
+ /// first operand. RL and RH represent the lower and upper halves of the
+ /// second operand. The upper and lower halves of the result are stored in Lo
+ /// and Hi.
+ void ForceExpandMUL(SelectionDAG &DAG, SDLoc dl, bool Signed, EVT WideVT,
+ const SDValue LL, const SDValue LH, const SDValue RL,
+ const SDValue RH, SDValue &Lo, SDValue &Hi) const;
+
+ /// Same as above, but creates the upper halves of each operand by
+ /// sign/zero-extending the operands.
+ void ForceExpandMUL(SelectionDAG &DAG, SDLoc dl, bool Signed,
+ const SDValue LHS, const SDValue RHS, SDValue &Lo,
+ SDValue &Hi) const;
+
/// Expand a VECREDUCE_* into an explicit calculation. If Count is specified,
/// only the first Count elements of the vector are used.
SDValue expandVecReduce(SDNode *Node, SelectionDAG &DAG) const;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 814f746f5a4d9d..e9f7e30863733c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -4008,44 +4008,7 @@ void DAGTypeLegalizer::ExpandIntRes_MUL(SDNode *N,
LC = RTLIB::MUL_I128;
if (LC == RTLIB::UNKNOWN_LIBCALL || !TLI.getLibcallName(LC)) {
- // We'll expand the multiplication by brute force because we have no other
- // options. This is a trivially-generalized version of the code from
- // Hacker's Delight (itself derived from Knuth's Algorithm M from section
- // 4.3.1).
- unsigned Bits = NVT.getSizeInBits();
- unsigned HalfBits = Bits >> 1;
- SDValue Mask = DAG.getConstant(APInt::getLowBitsSet(Bits, HalfBits), dl,
- NVT);
- SDValue LLL = DAG.getNode(ISD::AND, dl, NVT, LL, Mask);
- SDValue RLL = DAG.getNode(ISD::AND, dl, NVT, RL, Mask);
-
- SDValue T = DAG.getNode(ISD::MUL, dl, NVT, LLL, RLL);
- SDValue TL = DAG.getNode(ISD::AND, dl, NVT, T, Mask);
-
- SDValue Shift = DAG.getShiftAmountConstant(HalfBits, NVT, dl);
- SDValue TH = DAG.getNode(ISD::SRL, dl, NVT, T, Shift);
- SDValue LLH = DAG.getNode(ISD::SRL, dl, NVT, LL, Shift);
- SDValue RLH = DAG.getNode(ISD::SRL, dl, NVT, RL, Shift);
-
- SDValue U = DAG.getNode(ISD::ADD, dl, NVT,
- DAG.getNode(ISD::MUL, dl, NVT, LLH, RLL), TH);
- SDValue UL = DAG.getNode(ISD::AND, dl, NVT, U, Mask);
- SDValue UH = DAG.getNode(ISD::SRL, dl, NVT, U, Shift);
-
- SDValue V = DAG.getNode(ISD::ADD, dl, NVT,
- DAG.getNode(ISD::MUL, dl, NVT, LLL, RLH), UL);
- SDValue VH = DAG.getNode(ISD::SRL, dl, NVT, V, Shift);
-
- SDValue W = DAG.getNode(ISD::ADD, dl, NVT,
- DAG.getNode(ISD::MUL, dl, NVT, LLH, RLH),
- DAG.getNode(ISD::ADD, dl, NVT, UH, VH));
- Lo = DAG.getNode(ISD::ADD, dl, NVT, TL,
- DAG.getNode(ISD::SHL, dl, NVT, V, Shift));
-
- Hi = DAG.getNode(ISD::ADD, dl, NVT, W,
- DAG.getNode(ISD::ADD, dl, NVT,
- DAG.getNode(ISD::MUL, dl, NVT, RH, LL),
- DAG.getNode(ISD::MUL, dl, NVT, RL, LH)));
+ TLI.ForceExpandMUL(DAG, dl, /*Signed=*/true, VT, LL, LH, RL, RH, Lo, Hi);
return;
}
@@ -4146,9 +4109,15 @@ void DAGTypeLegalizer::ExpandIntRes_MULFIX(SDNode *N, SDValue &Lo,
if (!TLI.expandMUL_LOHI(LoHiOp, VT, dl, LHS, RHS, Result, NVT, DAG,
TargetLowering::MulExpansionKind::OnlyLegalOrCustom,
LL, LH, RL, RH)) {
- report_fatal_error("Unable to expand MUL_FIX using MUL_LOHI.");
- return;
+ Result.clear();
+ Result.resize(4);
+
+ SDValue LoTmp, HiTmp;
+ TLI.ForceExpandMUL(DAG, dl, Signed, LHS, RHS, LoTmp, HiTmp);
+ SplitInteger(LoTmp, Result[0], Result[1]);
+ SplitInteger(HiTmp, Result[2], Result[3]);
}
+ assert(Result.size() == 4 && "Unexpected number of partlets in the result");
unsigned NVTSize = NVT.getScalarSizeInBits();
assert((VTSize == NVTSize * 2) && "Expected the new value type to be half "
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index b8ed02e268b184..2fe1dd4ce15e16 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -10149,6 +10149,121 @@ SDValue TargetLowering::expandShlSat(SDNode *Node, SelectionDAG &DAG) const {
return DAG.getSelect(dl, VT, Cond, SatVal, Result);
}
+void TargetLowering::ForceExpandMUL(SelectionDAG &DAG, SDLoc dl, bool Signed,
+ EVT WideVT, const SDValue LL,
+ const SDValue LH, const SDValue RL,
+ const SDValue RH, SDValue &Lo,
+ SDValue &Hi) const {
+ // We can fall back to a libcall with an illegal type for the MUL if we
+ // have a libcall big enough.
+ // Also, we can fall back to a division in some cases, but that's a big
+ // performance hit in the general case.
+ RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+ if (WideVT == MVT::i16)
+ LC = RTLIB::MUL_I16;
+ else if (WideVT == MVT::i32)
+ LC = RTLIB::MUL_I32;
+ else if (WideVT == MVT::i64)
+ LC = RTLIB::MUL_I64;
+ else if (WideVT == MVT::i128)
+ LC = RTLIB::MUL_I128;
+
+ if (LC == RTLIB::UNKNOWN_LIBCALL || !getLibcallName(LC)) {
+ // We'll expand the multiplication by brute force because we have no other
+ // options. This is a trivially-generalized version of the code from
+ // Hacker's Delight (itself derived from Knuth's Algorithm M from section
+ // 4.3.1).
+ EVT VT = LL.getValueType();
+ unsigned Bits = VT.getSizeInBits();
+ unsigned HalfBits = Bits >> 1;
+ SDValue Mask =
+ DAG.getConstant(APInt::getLowBitsSet(Bits, HalfBits), dl, VT);
+ SDValue LLL = DAG.getNode(ISD::AND, dl, VT, LL, Mask);
+ SDValue RLL = DAG.getNode(ISD::AND, dl, VT, RL, Mask);
+
+ SDValue T = DAG.getNode(ISD::MUL, dl, VT, LLL, RLL);
+ SDValue TL = DAG.getNode(ISD::AND, dl, VT, T, Mask);
+
+ SDValue Shift = DAG.getShiftAmountConstant(HalfBits, VT, dl);
+ SDValue TH = DAG.getNode(ISD::SRL, dl, VT, T, Shift);
+ SDValue LLH = DAG.getNode(ISD::SRL, dl, VT, LL, Shift);
+ SDValue RLH = DAG.getNode(ISD::SRL, dl, VT, RL, Shift);
+
+ SDValue U = DAG.getNode(ISD::ADD, dl, VT,
+ DAG.getNode(ISD::MUL, dl, VT, LLH, RLL), TH);
+ SDValue UL = DAG.getNode(ISD::AND, dl, VT, U, Mask);
+ SDValue UH = DAG.getNode(ISD::SRL, dl, VT, U, Shift);
+
+ SDValue V = DAG.getNode(ISD::ADD, dl, VT,
+ DAG.getNode(ISD::MUL, dl, VT, LLL, RLH), UL);
+ SDValue VH = DAG.getNode(ISD::SRL, dl, VT, V, Shift);
+
+ SDValue W =
+ DAG.getNode(ISD::ADD, dl, VT, DAG.getNode(ISD::MUL, dl, VT, LLH, RLH),
+ DAG.getNode(ISD::ADD, dl, VT, UH, VH));
+ Lo = DAG.getNode(ISD::ADD, dl, VT, TL,
+ DAG.getNode(ISD::SHL, dl, VT, V, Shift));
+
+ Hi = DAG.getNode(ISD::ADD, dl, VT, W,
+ DAG.getNode(ISD::ADD, dl, VT,
+ DAG.getNode(ISD::MUL, dl, VT, RH, LL),
+ DAG.getNode(ISD::MUL, dl, VT, RL, LH)));
+ } else {
+ // Attempt a libcall.
+ SDValue Ret;
+ TargetLowering::MakeLibCallOptions CallOptions;
+ CallOptions.setSExt(Signed);
+ CallOptions.setIsPostTypeLegalization(true);
+ if (shouldSplitFunctionArgumentsAsLittleEndian(DAG.getDataLayout())) {
+ // Halves of WideVT are packed into registers in different order
+ // depending on platform endianness. This is usually handled by
+ // the C calling convention, but we can't defer to it in
+ // the legalizer.
+ SDValue Args[] = {LL, LH, RL, RH};
+ Ret = makeLibCall(DAG, LC, WideVT, Args, CallOptions, dl).first;
+ } else {
+ SDValue Args[] = {LH, LL, RH, RL};
+ Ret = makeLibCall(DAG, LC, WideVT, Args, CallOptions, dl).first;
+ }
+ assert(Ret.getOpcode() == ISD::MERGE_VALUES &&
+ "Ret value is a collection of constituent nodes holding result.");
+ if (DAG.getDataLayout().isLittleEndian()) {
+ // Same as above.
+ Lo = Ret.getOperand(0);
+ Hi = Ret.getOperand(1);
+ } else {
+ Lo = Ret.getOperand(1);
+ Hi = Ret.getOperand(0);
+ }
+ }
+}
+
+void TargetLowering::ForceExpandMUL(SelectionDAG &DAG, SDLoc dl, bool Signed,
+ const SDValue LHS, const SDValue RHS,
+ SDValue &Lo, SDValue &Hi) const {
+ EVT VT = LHS.getValueType();
+ assert(RHS.getValueType() == VT && "Mismatching operand types");
+
+ SDValue HiLHS;
+ SDValue HiRHS;
+ if (Signed) {
+ // The high part is obtained by SRA'ing all but one of the bits of low
+ // part.
+ unsigned LoSize = VT.getFixedSizeInBits();
+ HiLHS = DAG.getNode(
+ ISD::SRA, dl, VT, LHS,
+ DAG.getConstant(LoSize - 1, dl, getPointerTy(DAG.getDataLayout())));
+ HiRHS = DAG.getNode(
+ ISD::SRA, dl, VT, RHS,
+ DAG.getConstant(LoSize - 1, dl, getPointerTy(DAG.getDataLayout())));
+ } else {
+ HiLHS = DAG.getConstant(0, dl, VT);
+ HiRHS = DAG.getConstant(0, dl, VT);
+ }
+ EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits() * 2);
+ ForceExpandMUL(DAG, dl, Signed, WideVT, LHS, HiLHS, RHS, HiRHS, Lo, Hi);
+}
+
SDValue
TargetLowering::expandFixedPointMul(SDNode *Node, SelectionDAG &DAG) const {
assert((Node->getOpcode() == ISD::SMULFIX ||
@@ -10223,7 +10338,7 @@ TargetLowering::expandFixedPointMul(SDNode *Node, SelectionDAG &DAG) const {
} else if (VT.isVector()) {
return SDValue();
} else {
- report_fatal_error("Unable to expand fixed point multiplication.");
+ ForceExpandMUL(DAG, dl, Signed, LHS, RHS, Lo, Hi);
}
if (Scale == VTSize)
@@ -10522,69 +10637,7 @@ bool TargetLowering::expandMULO(SDNode *Node, SDValue &Result,
if (VT.isVector())
return false;
- // We can fall back to a libcall with an illegal type for the MUL if we
- // have a libcall big enough.
- // Also, we can fall back to a division in some cases, but that's a big
- // performance hit in the general case.
- RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
- if (WideVT == MVT::i16)
- LC = RTLIB::MUL_I16;
- else if (WideVT == MVT::i32)
- LC = RTLIB::MUL_I32;
- else if (WideVT == MVT::i64)
- LC = RTLIB::MUL_I64;
- else if (WideVT == MVT::i128)
- LC = RTLIB::MUL_I128;
- assert(LC != RTLIB::UNKNOWN_LIBCALL && "Cannot expand this operation!");
-
- SDValue HiLHS;
- SDValue HiRHS;
- if (isSigned) {
- // The high part is obtained by SRA'ing all but one of the bits of low
- // part.
- unsigned LoSize = VT.getFixedSizeInBits();
- HiLHS =
- DAG.getNode(ISD::SRA, dl, VT, LHS,
- DAG.getConstant(LoSize - 1, dl,
- getPointerTy(DAG.getDataLayout())));
- HiRHS =
- DAG.getNode(ISD::SRA, dl, VT, RHS,
- DAG.getConstant(LoSize - 1, dl,
- getPointerTy(DAG.getDataLayout())));
- } else {
- HiLHS = DAG.getConstant(0, dl, VT);
- HiRHS = DAG.getConstant(0, dl, VT);
- }
-
- // Here we're passing the 2 arguments explicitly as 4 arguments that are
- // pre-lowered to the correct types. This all depends upon WideVT not
- // being a legal type for the architecture and thus has to be split to
- // two arguments.
- SDValue Ret;
- TargetLowering::MakeLibCallOptions CallOptions;
- CallOptions.setSExt(isSigned);
- CallOptions.setIsPostTypeLegalization(true);
- if (shouldSplitFunctionArgumentsAsLittleEndian(DAG.getDataLayout())) {
- // Halves of WideVT are packed into registers in different order
- // depending on platform endianness. This is usually handled by
- // the C calling convention, but we can't defer to it in
- // the legalizer.
- SDValue Args[] = { LHS, HiLHS, RHS, HiRHS };
- Ret = makeLibCall(DAG, LC, WideVT, Args, CallOptions, dl).first;
- } else {
- SDValue Args[] = { HiLHS, LHS, HiRHS, RHS };
- Ret = makeLibCall(DAG, LC, WideVT, Args, CallOptions, dl).first;
- }
- assert(Ret.getOpcode() == ISD::MERGE_VALUES &&
- "Ret value is a collection of constituent nodes holding result.");
- if (DAG.getDataLayout().isLittleEndian()) {
- // Same as above.
- BottomHalf = Ret.getOperand(0);
- TopHalf = Ret.getOperand(1);
- } else {
- BottomHalf = Ret.getOperand(1);
- TopHalf = Ret.getOperand(0);
- }
+ ForceExpandMUL(DAG, dl, isSigned, LHS, RHS, BottomHalf, TopHalf);
}
Result = BottomHalf;
diff --git a/llvm/test/CodeGen/Thumb/smul_fix.ll b/llvm/test/CodeGen/Thumb/smul_fix.ll
new file mode 100644
index 00000000000000..52f241802b87e3
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb/smul_fix.ll
@@ -0,0 +1,304 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=thumbv6m-none-unknown-eabi -mcpu=cortex-m0 | FileCheck %s --check-prefix=ARM
+
+declare i4 @llvm.smul.fix.i4 (i4, i4, i32)
+declare i32 @llvm.smul.fix.i32 (i32, i32, i32)
+declare i64 @llvm.smul.fix.i64 (i64, i64, i32)
+
+define i32 @func(i32 %x, i32 %y) nounwind {
+; ARM-LABEL: func:
+; ARM: @ %bb.0:
+; ARM-NEXT: .save {r7, lr}
+; ARM-NEXT: push {r7, lr}
+; ARM-NEXT: mov r2, r1
+; ARM-NEXT: asrs r1, r0, #31
+; ARM-NEXT: asrs r3, r2, #31
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: lsrs r0, r0, #2
+; ARM-NEXT: lsls r1, r1, #30
+; ARM-NEXT: adds r0, r1, r0
+; ARM-NEXT: pop {r7, pc}
+ %tmp = call i32 @llvm.smul.fix.i32(i32 %x, i32 %y, i32 2)
+ ret i32 %tmp
+}
+
+define i64 @func2(i64 %x, i64 %y) {
+; ARM-LABEL: func2:
+; ARM: @ %bb.0:
+; ARM-NEXT: .save {r4, r5, r6, r7, lr}
+; ARM-NEXT: push {r4, r5, r6, r7, lr}
+; ARM-NEXT: .pad #28
+; ARM-NEXT: sub sp, #28
+; ARM-NEXT: str r3, [sp, #8] @ 4-byte Spill
+; ARM-NEXT: mov r5, r2
+; ARM-NEXT: str r2, [sp, #12] @ 4-byte Spill
+; ARM-NEXT: mov r7, r1
+; ARM-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; ARM-NEXT: movs r6, #0
+; ARM-NEXT: mov r4, r0
+; ARM-NEXT: str r0, [sp, #20] @ 4-byte Spill
+; ARM-NEXT: mov r1, r6
+; ARM-NEXT: mov r3, r6
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: str r0, [sp, #24] @ 4-byte Spill
+; ARM-NEXT: str r1, [sp, #16] @ 4-byte Spill
+; ARM-NEXT: mov r0, r7
+; ARM-NEXT: mov r1, r6
+; ARM-NEXT: mov r2, r5
+; ARM-NEXT: mov r3, r6
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r5, r1
+; ARM-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
+; ARM-NEXT: adds r0, r0, r1
+; ARM-NEXT: str r0, [sp, #16] @ 4-byte Spill
+; ARM-NEXT: adcs r5, r6
+; ARM-NEXT: mov r0, r4
+; ARM-NEXT: mov r1, r6
+; ARM-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
+; ARM-NEXT: mov r2, r7
+; ARM-NEXT: mov r3, r6
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r4, r1
+; ARM-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
+; ARM-NEXT: adds r0, r0, r1
+; ARM-NEXT: str r0, [sp, #16] @ 4-byte Spill
+; ARM-NEXT: adcs r4, r5
+; ARM-NEXT: ldr r5, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: mov r0, r5
+; ARM-NEXT: mov r1, r6
+; ARM-NEXT: mov r2, r7
+; ARM-NEXT: mov r3, r6
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: adds r0, r0, r4
+; ARM-NEXT: str r0, [sp] @ 4-byte Spill
+; ARM-NEXT: asrs r2, r5, #31
+; ARM-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
+; ARM-NEXT: mov r1, r7
+; ARM-NEXT: mov r3, r2
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r4, r0
+; ARM-NEXT: asrs r0, r7, #31
+; ARM-NEXT: mov r1, r0
+; ARM-NEXT: ldr r2, [sp, #20] @ 4-byte Reload
+; ARM-NEXT: mov r3, r5
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: adds r0, r0, r4
+; ARM-NEXT: ldr r1, [sp] @ 4-byte Reload
+; ARM-NEXT: adds r0, r1, r0
+; ARM-NEXT: lsls r0, r0, #30
+; ARM-NEXT: ldr r2, [sp, #16] @ 4-byte Reload
+; ARM-NEXT: lsrs r1, r2, #2
+; ARM-NEXT: adds r1, r0, r1
+; ARM-NEXT: lsls r0, r2, #30
+; ARM-NEXT: ldr r2, [sp, #24] @ 4-byte Reload
+; ARM-NEXT: lsrs r2, r2, #2
+; ARM-NEXT: adds r0, r0, r2
+; ARM-NEXT: add sp, #28
+; ARM-NEXT: pop {r4, r5, r6, r7, pc}
+ %tmp = call i64 @llvm.smul.fix.i64(i64 %x, i64 %y, i32 2)
+ ret i64 %tmp
+}
+
+define i4 @func3(i4 %x, i4 %y) nounwind {
+; ARM-LABEL: func3:
+; ARM: @ %bb.0:
+; ARM-NEXT: .save {r4, lr}
+; ARM-NEXT: push {r4, lr}
+; ARM-NEXT: lsls r2, r0, #28
+; ARM-NEXT: asrs r0, r2, #28
+; ARM-NEXT: asrs r4, r2, #31
+; ARM-NEXT: lsls r1, r1, #28
+; ARM-NEXT: asrs r2, r1, #28
+; ARM-NEXT: asrs r3, r1, #31
+; ARM-NEXT: mov r1, r4
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: lsrs r0, r0, #2
+; ARM-NEXT: lsls r1, r1, #30
+; ARM-NEXT: adds r0, r1, r0
+; ARM-NEXT: pop {r4, pc}
+ %tmp = call i4 @llvm.smul.fix.i4(i4 %x, i4 %y, i32 2)
+ ret i4 %tmp
+}
+
+;; These result in regular integer multiplication
+define i32 @func4(i32 %x, i32 %y) nounwind {
+; ARM-LABEL: func4:
+; ARM: @ %bb.0:
+; ARM-NEXT: muls r0, r1, r0
+; ARM-NEXT: bx lr
+ %tmp = call i32 @llvm.smul.fix.i32(i32 %x, i32 %y, i32 0)
+ ret i32 %tmp
+}
+
+define i64 @func5(i64 %x, i64 %y) {
+; ARM-LABEL: func5:
+; ARM: @ %bb.0:
+; ARM-NEXT: .save {r7, lr}
+; ARM-NEXT: push {r7, lr}
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: pop {r7, pc}
+ %tmp = call i64 @llvm.smul.fix.i64(i64 %x, i64 %y, i32 0)
+ ret i64 %tmp
+}
+
+define i4 @func6(i4 %x, i4 %y) nounwind {
+; ARM-LABEL: func6:
+; ARM: @ %bb.0:
+; ARM-NEXT: lsls r1, r1, #28
+; ARM-NEXT: asrs r1, r1, #28
+; ARM-NEXT: lsls r0, r0, #28
+; ARM-NEXT: asrs r0, r0, #28
+; ARM-NEXT: muls r0, r1, r0
+; ARM-NEXT: bx lr
+ %tmp = call i4 @llvm.smul.fix.i4(i4 %x, i4 %y, i32 0)
+ ret i4 %tmp
+}
+
+define i64 @func7(i64 %x, i64 %y) nounwind {
+; ARM-LABEL: func7:
+; ARM: @ %bb.0:
+; ARM-NEXT: .save {r4, r5, r6, r7, lr}
+; ARM-NEXT: push {r4, r5, r6, r7, lr}
+; ARM-NEXT: .pad #20
+; ARM-NEXT: sub sp, #20
+; ARM-NEXT: str r3, [sp, #4] @ 4-byte Spill
+; ARM-NEXT: mov r7, r2
+; ARM-NEXT: str r2, [sp, #8] @ 4-byte Spill
+; ARM-NEXT: mov r6, r1
+; ARM-NEXT: str r1, [sp] @ 4-byte Spill
+; ARM-NEXT: movs r5, #0
+; ARM-NEXT: mov r4, r0
+; ARM-NEXT: str r0, [sp, #16] @ 4-byte Spill
+; ARM-NEXT: mov r1, r5
+; ARM-NEXT: mov r3, r5
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: str r1, [sp, #12] @ 4-byte Spill
+; ARM-NEXT: mov r0, r6
+; ARM-NEXT: mov r1, r5
+; ARM-NEXT: mov r2, r7
+; ARM-NEXT: mov r3, r5
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r7, r1
+; ARM-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; ARM-NEXT: adds r0, r0, r1
+; ARM-NEXT: str r0, [sp, #12] @ 4-byte Spill
+; ARM-NEXT: adcs r7, r5
+; ARM-NEXT: mov r0, r4
+; ARM-NEXT: mov r1, r5
+; ARM-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: mov r2, r6
+; ARM-NEXT: mov r3, r5
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r4, r1
+; ARM-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; ARM-NEXT: adds r0, r0, r1
+; ARM-NEXT: str r0, [sp, #12] @ 4-byte Spill
+; ARM-NEXT: adcs r4, r7
+; ARM-NEXT: ldr r7, [sp] @ 4-byte Reload
+; ARM-NEXT: mov r0, r7
+; ARM-NEXT: mov r1, r5
+; ARM-NEXT: mov r2, r6
+; ARM-NEXT: mov r3, r5
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: adds r5, r0, r4
+; ARM-NEXT: asrs r2, r7, #31
+; ARM-NEXT: ldr r...
[truncated]
``````````
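For reference, the brute-force path that the diff relocates into ForceExpandMUL can be modeled compactly in C++. This is a sketch under the assumption of a 32-bit legal word: it forms the 64-bit product of two 32-bit values from 16-bit partial products, mirroring the DAG values T, U, V and W; the LH/RH cross terms from the real code are omitted because the upper operand halves are zero in this 32x32->64 illustration, and the function name is mine:

```cpp
#include <cassert>
#include <cstdint>

// Hacker's Delight / Knuth Algorithm M, specialized to 32x32 -> 64.
static void mul32x32(uint32_t LL, uint32_t RL, uint32_t &Lo, uint32_t &Hi) {
  const uint32_t Mask = 0xFFFFu;          // low 16 bits of a 32-bit word
  uint32_t LLL = LL & Mask, RLL = RL & Mask;
  uint32_t T = LLL * RLL;                 // low x low
  uint32_t TL = T & Mask, TH = T >> 16;
  uint32_t LLH = LL >> 16, RLH = RL >> 16;
  uint32_t U = LLH * RLL + TH;            // high x low, plus carry-in
  uint32_t UL = U & Mask, UH = U >> 16;
  uint32_t V = LLL * RLH + UL;            // low x high
  uint32_t VH = V >> 16;
  uint32_t W = LLH * RLH + UH + VH;       // high x high, plus carries
  Lo = TL + (V << 16);
  Hi = W;
}

int main() {
  uint32_t Lo, Hi;
  mul32x32(0xFFFFFFFFu, 0xFFFFFFFFu, Lo, Hi);
  assert(Lo == 0x00000001u && Hi == 0xFFFFFFFEu); // 0xFFFFFFFE00000001
  mul32x32(65536u, 65536u, Lo, Hi);
  assert(Lo == 0u && Hi == 1u);                   // 2^32
  return 0;
}
```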
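Similarly, the two-operand ForceExpandMUL overload builds the upper halves before calling the four-operand version. A minimal sketch of that step (function names here are illustrative, not LLVM APIs): for a signed multiply, an arithmetic shift right by (width - 1) replicates the sign bit, producing the sign-extended high word; for unsigned, the high word is simply zero.

```cpp
#include <cassert>
#include <cstdint>

int32_t signedHighHalf(int32_t Lo) { return Lo >> 31; } // ISD::SRA by LoSize - 1
uint32_t unsignedHighHalf(uint32_t) { return 0; }       // DAG.getConstant(0, ...)

int main() {
  assert(signedHighHalf(-7) == -1); // upper word of (int64_t)-7 is all ones
  assert(signedHighHalf(42) == 0);  // non-negative values extend with zeros
  assert(unsignedHighHalf(42u) == 0);
  return 0;
}
```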
https://github.com/llvm/llvm-project/pull/79352
More information about the llvm-commits mailing list