[llvm] r322361 - [ARM] Add codegen for SMMULR, SMMLAR and SMMLSR

Andre Vieira via llvm-commits llvm-commits at lists.llvm.org
Fri Jan 12 01:24:41 PST 2018


Author: avieira
Date: Fri Jan 12 01:24:41 2018
New Revision: 322361

URL: http://llvm.org/viewvc/llvm-project?rev=322361&view=rev
Log:
[ARM] Add codegen for SMMULR, SMMLAR and SMMLSR

This patch teaches the Arm back-end to generate the SMMULR, SMMLAR and SMMLSR
instructions from equivalent IR patterns.

Differential Revision: https://reviews.llvm.org/D41775

Added:
    llvm/trunk/test/CodeGen/ARM/dsp-mlal.ll
Modified:
    llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
    llvm/trunk/lib/Target/ARM/ARMISelLowering.h
    llvm/trunk/lib/Target/ARM/ARMInstrInfo.td
    llvm/trunk/lib/Target/ARM/ARMInstrThumb2.td

Modified: llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp?rev=322361&r1=322360&r2=322361&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp Fri Jan 12 01:24:41 2018
@@ -1337,6 +1337,8 @@ const char *ARMTargetLowering::getTarget
   case ARMISD::SMLALDX:       return "ARMISD::SMLALDX";
   case ARMISD::SMLSLD:        return "ARMISD::SMLSLD";
   case ARMISD::SMLSLDX:       return "ARMISD::SMLSLDX";
+  case ARMISD::SMMLAR:        return "ARMISD::SMMLAR";
+  case ARMISD::SMMLSR:        return "ARMISD::SMMLSR";
   case ARMISD::BUILD_VECTOR:  return "ARMISD::BUILD_VECTOR";
   case ARMISD::BFI:           return "ARMISD::BFI";
   case ARMISD::VORRIMM:       return "ARMISD::VORRIMM";
@@ -9860,7 +9862,7 @@ static SDValue AddCombineTo64BitSMLAL16(
   return resNode;
 }
 
-static SDValue AddCombineTo64bitMLAL(SDNode *AddeNode,
+static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
   // Look for multiply add opportunities.
@@ -9877,49 +9879,61 @@ static SDValue AddCombineTo64bitMLAL(SDN
   //                  V      V
   //                    ADDE   <- hiAdd
   //
-  assert(AddeNode->getOpcode() == ARMISD::ADDE && "Expect an ADDE");
+  // In the special case where only the higher part of a signed result is used
+  // and the add to the low part of the result of ISD::SMUL_LOHI adds or subtracts
+  // a constant with the exact value of 0x80000000, we recognize we are dealing
+  // with a "rounded multiply and add" (or subtract) and transform it into
+  // either an ARMISD::SMMLAR or ARMISD::SMMLSR respectively.
+
+  assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
+          AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
+         "Expect an ADDE or SUBE");
 
-  assert(AddeNode->getNumOperands() == 3 &&
-         AddeNode->getOperand(2).getValueType() == MVT::i32 &&
+  assert(AddeSubeNode->getNumOperands() == 3 &&
+         AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
          "ADDE node has the wrong inputs");
 
-  // Check that we are chained to the right ADDC node.
-  SDNode* AddcNode = AddeNode->getOperand(2).getNode();
-  if (AddcNode->getOpcode() != ARMISD::ADDC)
+  // Check that we are chained to the right ADDC or SUBC node.
+  SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode();
+  if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&
+       AddcSubcNode->getOpcode() != ARMISD::ADDC) ||
+      (AddeSubeNode->getOpcode() == ARMISD::SUBE &&
+       AddcSubcNode->getOpcode() != ARMISD::SUBC))
     return SDValue();
 
-  SDValue AddcOp0 = AddcNode->getOperand(0);
-  SDValue AddcOp1 = AddcNode->getOperand(1);
+  SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0);
+  SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1);
 
   // Check if the two operands are from the same mul_lohi node.
-  if (AddcOp0.getNode() == AddcOp1.getNode())
+  if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())
     return SDValue();
 
-  assert(AddcNode->getNumValues() == 2 &&
-         AddcNode->getValueType(0) == MVT::i32 &&
+  assert(AddcSubcNode->getNumValues() == 2 &&
+         AddcSubcNode->getValueType(0) == MVT::i32 &&
          "Expect ADDC with two result values. First: i32");
 
   // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
   // maybe a SMLAL which multiplies two 16-bit values.
-  if (AddcOp0->getOpcode() != ISD::UMUL_LOHI &&
-      AddcOp0->getOpcode() != ISD::SMUL_LOHI &&
-      AddcOp1->getOpcode() != ISD::UMUL_LOHI &&
-      AddcOp1->getOpcode() != ISD::SMUL_LOHI)
-    return AddCombineTo64BitSMLAL16(AddcNode, AddeNode, DCI, Subtarget);
+  if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&
+      AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&
+      AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&
+      AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&
+      AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)
+    return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget);
 
   // Check for the triangle shape.
-  SDValue AddeOp0 = AddeNode->getOperand(0);
-  SDValue AddeOp1 = AddeNode->getOperand(1);
+  SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0);
+  SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1);
 
-  // Make sure that the ADDE operands are not coming from the same node.
-  if (AddeOp0.getNode() == AddeOp1.getNode())
+  // Make sure that the ADDE/SUBE operands are not coming from the same node.
+  if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())
     return SDValue();
 
-  // Find the MUL_LOHI node walking up ADDE's operands.
+  // Find the MUL_LOHI node walking up ADDE/SUBE's operands.
   bool IsLeftOperandMUL = false;
-  SDValue MULOp = findMUL_LOHI(AddeOp0);
+  SDValue MULOp = findMUL_LOHI(AddeSubeOp0);
   if (MULOp == SDValue())
-   MULOp = findMUL_LOHI(AddeOp1);
+    MULOp = findMUL_LOHI(AddeSubeOp1);
   else
     IsLeftOperandMUL = true;
   if (MULOp == SDValue())
@@ -9930,63 +9944,88 @@ static SDValue AddCombineTo64bitMLAL(SDN
   unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
 
   // Figure out the high and low input values to the MLAL node.
-  SDValue* HiAdd = nullptr;
-  SDValue* LoMul = nullptr;
-  SDValue* LowAdd = nullptr;
+  SDValue *HiAddSub = nullptr;
+  SDValue *LoMul = nullptr;
+  SDValue *LowAddSub = nullptr;
 
-  // Ensure that ADDE is from high result of ISD::xMUL_LOHI.
-  if ((AddeOp0 != MULOp.getValue(1)) && (AddeOp1 != MULOp.getValue(1)))
+  // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.
+  if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
     return SDValue();
 
   if (IsLeftOperandMUL)
-    HiAdd = &AddeOp1;
+    HiAddSub = &AddeSubeOp1;
   else
-    HiAdd = &AddeOp0;
+    HiAddSub = &AddeSubeOp0;
 
+  // Ensure that LoMul and LowAddSub are taken from correct ISD::SMUL_LOHI node
+  // whose low result is fed to the ADDC/SUBC we are checking.
 
-  // Ensure that LoMul and LowAdd are taken from correct ISD::SMUL_LOHI node
-  // whose low result is fed to the ADDC we are checking.
-
-  if (AddcOp0 == MULOp.getValue(0)) {
-    LoMul = &AddcOp0;
-    LowAdd = &AddcOp1;
-  }
-  if (AddcOp1 == MULOp.getValue(0)) {
-    LoMul = &AddcOp1;
-    LowAdd = &AddcOp0;
+  if (AddcSubcOp0 == MULOp.getValue(0)) {
+    LoMul = &AddcSubcOp0;
+    LowAddSub = &AddcSubcOp1;
+  }
+  if (AddcSubcOp1 == MULOp.getValue(0)) {
+    LoMul = &AddcSubcOp1;
+    LowAddSub = &AddcSubcOp0;
   }
 
   if (!LoMul)
     return SDValue();
 
-  // If HiAdd is the same node as ADDC or is a predecessor of ADDC the
-  // replacement below will create a cycle.
-  if (AddcNode == HiAdd->getNode() ||
-      AddcNode->isPredecessorOf(HiAdd->getNode()))
+  // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC
+  // the replacement below will create a cycle.
+  if (AddcSubcNode == HiAddSub->getNode() ||
+      AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
     return SDValue();
 
   // Create the merged node.
   SelectionDAG &DAG = DCI.DAG;
 
-  // Build operand list.
+  // Start building operand list.
   SmallVector<SDValue, 8> Ops;
   Ops.push_back(LoMul->getOperand(0));
   Ops.push_back(LoMul->getOperand(1));
-  Ops.push_back(*LowAdd);
-  Ops.push_back(*HiAdd);
 
-  SDValue MLALNode =  DAG.getNode(FinalOpc, SDLoc(AddcNode),
+  // Check whether we can use SMMLAR, SMMLSR or SMMULR instead.  For this to be
+  // the case, we must be doing signed multiplication and only use the higher
+  // part of the result of the MLAL, furthermore the LowAddSub must be a constant
+  // addition or subtraction with the value of 0x80000000.
+  if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
+      FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
+      LowAddSub->getNode()->getOpcode() == ISD::Constant &&
+      static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
+          0x80000000) {
+    Ops.push_back(*HiAddSub);
+    if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
+      FinalOpc = ARMISD::SMMLSR;
+    } else {
+      FinalOpc = ARMISD::SMMLAR;
+    }
+    SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
+    DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);
+
+    return SDValue(AddeSubeNode, 0);
+  } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
+    // SMMLS is generated during instruction selection and the rest of this
+    // function can not handle the case where AddcSubcNode is a SUBC.
+    return SDValue();
+
+  // Finish building the operand list for {U/S}MLAL
+  Ops.push_back(*LowAddSub);
+  Ops.push_back(*HiAddSub);
+
+  SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
                                  DAG.getVTList(MVT::i32, MVT::i32), Ops);
 
   // Replace the ADDs' nodes uses by the MLA node's values.
   SDValue HiMLALResult(MLALNode.getNode(), 1);
-  DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
+  DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);
 
   SDValue LoMLALResult(MLALNode.getNode(), 0);
-  DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
+  DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);
 
   // Return original node to notify the driver to stop replacing.
-  return SDValue(AddeNode, 0);
+  return SDValue(AddeSubeNode, 0);
 }
 
 static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
@@ -10098,9 +10137,11 @@ static SDValue PerformAddcSubcCombine(SD
   return SDValue();
 }
 
-static SDValue PerformAddeSubeCombine(SDNode *N, SelectionDAG &DAG,
+static SDValue PerformAddeSubeCombine(SDNode *N,
+                                      TargetLowering::DAGCombinerInfo &DCI,
                                       const ARMSubtarget *Subtarget) {
   if (Subtarget->isThumb1Only()) {
+    SelectionDAG &DAG = DCI.DAG;
     SDValue RHS = N->getOperand(1);
     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
       int64_t imm = C->getSExtValue();
@@ -10118,6 +10159,8 @@ static SDValue PerformAddeSubeCombine(SD
                            N->getOperand(0), RHS, N->getOperand(2));
       }
     }
+  } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) {
+    return AddCombineTo64bitMLAL(N, DCI, Subtarget);
   }
   return SDValue();
 }
@@ -10130,7 +10173,7 @@ static SDValue PerformADDECombine(SDNode
                                   const ARMSubtarget *Subtarget) {
   // Only ARM and Thumb2 support UMLAL/SMLAL.
   if (Subtarget->isThumb1Only())
-    return PerformAddeSubeCombine(N, DCI.DAG, Subtarget);
+    return PerformAddeSubeCombine(N, DCI, Subtarget);
 
   // Only perform the checks after legalize when the pattern is available.
   if (DCI.isBeforeLegalize()) return SDValue();
@@ -12338,7 +12381,7 @@ SDValue ARMTargetLowering::PerformDAGCom
   case ISD::AND:        return PerformANDCombine(N, DCI, Subtarget);
   case ARMISD::ADDC:
   case ARMISD::SUBC:    return PerformAddcSubcCombine(N, DCI, Subtarget);
-  case ARMISD::SUBE:    return PerformAddeSubeCombine(N, DCI.DAG, Subtarget);
+  case ARMISD::SUBE:    return PerformAddeSubeCombine(N, DCI, Subtarget);
   case ARMISD::BFI:     return PerformBFICombine(N, DCI);
   case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
   case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);

Modified: llvm/trunk/lib/Target/ARM/ARMISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMISelLowering.h?rev=322361&r1=322360&r2=322361&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMISelLowering.h (original)
+++ llvm/trunk/lib/Target/ARM/ARMISelLowering.h Fri Jan 12 01:24:41 2018
@@ -203,6 +203,8 @@ class VectorType;
       SMLALDX,      // Signed multiply accumulate long dual exchange
       SMLSLD,       // Signed multiply subtract long dual
       SMLSLDX,      // Signed multiply subtract long dual exchange
+      SMMLAR,       // Signed multiply long, round and add
+      SMMLSR,       // Signed multiply long, subtract and round
 
       // Operands of the standard BUILD_VECTOR node are not legalized, which
       // is fine if BUILD_VECTORs are always lowered to shuffles or other

Modified: llvm/trunk/lib/Target/ARM/ARMInstrInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMInstrInfo.td?rev=322361&r1=322360&r2=322361&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMInstrInfo.td (original)
+++ llvm/trunk/lib/Target/ARM/ARMInstrInfo.td Fri Jan 12 01:24:41 2018
@@ -105,6 +105,14 @@ def ARMSmlaldx       : SDNode<"ARMISD::S
 def ARMSmlsld        : SDNode<"ARMISD::SMLSLD", SDT_LongMac>;
 def ARMSmlsldx       : SDNode<"ARMISD::SMLSLDX", SDT_LongMac>;
 
+def SDT_MulHSR       : SDTypeProfile<1, 3, [SDTCisVT<0,i32>,
+                                            SDTCisSameAs<0, 1>,
+                                            SDTCisSameAs<0, 2>,
+                                            SDTCisSameAs<0, 3>]>;
+
+def ARMsmmlar      : SDNode<"ARMISD::SMMLAR", SDT_MulHSR>;
+def ARMsmmlsr      : SDNode<"ARMISD::SMMLSR", SDT_MulHSR>;
+
 // Node definitions.
 def ARMWrapper       : SDNode<"ARMISD::Wrapper",     SDTIntUnaryOp>;
 def ARMWrapperPIC    : SDNode<"ARMISD::WrapperPIC",  SDTIntUnaryOp>;
@@ -4143,7 +4151,8 @@ def SMMUL : AMul2I <0b0111010, 0b0001, (
 }
 
 def SMMULR : AMul2I <0b0111010, 0b0011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
-               IIC_iMUL32, "smmulr", "\t$Rd, $Rn, $Rm", []>,
+               IIC_iMUL32, "smmulr", "\t$Rd, $Rn, $Rm",
+               [(set GPR:$Rd, (ARMsmmlar GPR:$Rn, GPR:$Rm, (i32 0)))]>,
             Requires<[IsARM, HasV6]>,
              Sched<[WriteMUL32, ReadMUL, ReadMUL]>  {
   let Inst{15-12} = 0b1111;
@@ -4158,7 +4167,8 @@ def SMMLA : AMul2Ia <0b0111010, 0b0001,
 
 def SMMLAR : AMul2Ia <0b0111010, 0b0011, (outs GPR:$Rd),
                (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
-               IIC_iMAC32, "smmlar", "\t$Rd, $Rn, $Rm, $Ra", []>,
+               IIC_iMAC32, "smmlar", "\t$Rd, $Rn, $Rm, $Ra",
+               [(set GPR:$Rd, (ARMsmmlar GPR:$Rn, GPR:$Rm, GPR:$Ra))]>,
             Requires<[IsARM, HasV6]>,
              Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>;
 
@@ -4170,7 +4180,8 @@ def SMMLS : AMul2Ia <0b0111010, 0b1101,
 
 def SMMLSR : AMul2Ia <0b0111010, 0b1111, (outs GPR:$Rd),
                (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
-               IIC_iMAC32, "smmlsr", "\t$Rd, $Rn, $Rm, $Ra", []>,
+               IIC_iMAC32, "smmlsr", "\t$Rd, $Rn, $Rm, $Ra",
+               [(set GPR:$Rd, (ARMsmmlsr GPR:$Rn, GPR:$Rm, GPR:$Ra))]>,
             Requires<[IsARM, HasV6]>,
              Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>;
 

Modified: llvm/trunk/lib/Target/ARM/ARMInstrThumb2.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMInstrThumb2.td?rev=322361&r1=322360&r2=322361&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMInstrThumb2.td (original)
+++ llvm/trunk/lib/Target/ARM/ARMInstrThumb2.td Fri Jan 12 01:24:41 2018
@@ -2661,7 +2661,9 @@ class T2SMMUL<bits<4> op7_4, string opc,
 }
 def t2SMMUL : T2SMMUL<0b0000, "smmul", [(set rGPR:$Rd, (mulhs rGPR:$Rn,
                                                               rGPR:$Rm))]>;
-def t2SMMULR : T2SMMUL<0b0001, "smmulr", []>;
+def t2SMMULR :
+  T2SMMUL<0b0001, "smmulr",
+          [(set rGPR:$Rd, (ARMsmmlar rGPR:$Rn, rGPR:$Rm, (i32 0)))]>;
 
 class T2FourRegSMMLA<bits<3> op22_20, bits<4> op7_4, string opc,
                      list<dag> pattern>
@@ -2677,9 +2679,11 @@ class T2FourRegSMMLA<bits<3> op22_20, bi
 
 def t2SMMLA :   T2FourRegSMMLA<0b101, 0b0000, "smmla",
                 [(set rGPR:$Rd, (add (mulhs rGPR:$Rm, rGPR:$Rn), rGPR:$Ra))]>;
-def t2SMMLAR:   T2FourRegSMMLA<0b101, 0b0001, "smmlar", []>;
+def t2SMMLAR:   T2FourRegSMMLA<0b101, 0b0001, "smmlar",
+                [(set rGPR:$Rd, (ARMsmmlar rGPR:$Rn, rGPR:$Rm, rGPR:$Ra))]>;
 def t2SMMLS:    T2FourRegSMMLA<0b110, 0b0000, "smmls", []>;
-def t2SMMLSR:   T2FourRegSMMLA<0b110, 0b0001, "smmlsr", []>;
+def t2SMMLSR:   T2FourRegSMMLA<0b110, 0b0001, "smmlsr",
+                [(set rGPR:$Rd, (ARMsmmlsr rGPR:$Rn, rGPR:$Rm, rGPR:$Ra))]>;
 
 class T2ThreeRegSMUL<bits<3> op22_20, bits<2> op5_4, string opc,
                      list<dag> pattern>

Added: llvm/trunk/test/CodeGen/ARM/dsp-mlal.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/dsp-mlal.ll?rev=322361&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/dsp-mlal.ll (added)
+++ llvm/trunk/test/CodeGen/ARM/dsp-mlal.ll Fri Jan 12 01:24:41 2018
@@ -0,0 +1,171 @@
+; RUN: llc -mtriple=thumbv7m -mattr=+dsp %s -o - | FileCheck %s
+; RUN: llc -mtriple=armv7a %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv7m -mattr=-dsp %s -o - | FileCheck --check-prefix=NODSP %s
+
+define hidden i32 @SMMULR_SMMLAR(i32 %a, i32 %b0, i32 %b1, i32 %Xn, i32 %Xn1) local_unnamed_addr {
+entry:
+; CHECK-LABEL: SMMULR_SMMLAR:
+; CHECK: ldr r0, [sp]
+; CHECK-NEXT: smmulr r0, {{(r0, r2|r2, r0)}}
+; CHECK-NEXT: smmlar r0, {{(r1, r3|r3, r1)}}, r0
+; NODSP-LABEL: SMMULR_SMMLAR:
+; NODSP-NOT: smmulr
+; NODSP-NOT: smmlar
+  %conv = sext i32 %b1 to i64
+  %conv1 = sext i32 %Xn1 to i64
+  %mul = mul nsw i64 %conv1, %conv
+  %add = add nsw i64 %mul, 2147483648
+  %0 = and i64 %add, -4294967296
+  %conv4 = sext i32 %b0 to i64
+  %conv5 = sext i32 %Xn to i64
+  %mul6 = mul nsw i64 %conv5, %conv4
+  %add7 = add i64 %mul6, 2147483648
+  %add8 = add i64 %add7, %0
+  %1 = lshr i64 %add8, 32
+  %conv10 = trunc i64 %1 to i32
+  ret i32 %conv10
+}
+
+define hidden i32 @SMMULR(i32 %a, i32 %b) local_unnamed_addr {
+entry:
+; CHECK-LABEL: SMMULR:
+; CHECK: smmulr r0, {{(r0, r1|r1, r0)}}
+; NODSP-LABEL: SMMULR:
+; NODSP-NOT: smmulr
+  %conv = sext i32 %a to i64
+  %conv1 = sext i32 %b to i64
+  %mul = mul nsw i64 %conv1, %conv
+  %add = add nsw i64 %mul, 2147483648
+  %0 = lshr i64 %add, 32
+  %conv2 = trunc i64 %0 to i32
+  ret i32 %conv2
+}
+
+define hidden i32 @SMMUL(i32 %a, i32 %b) local_unnamed_addr {
+entry:
+; CHECK-LABEL: SMMUL:
+; CHECK: smmul r0, {{(r0, r1|r1, r0)}}
+; NODSP-LABEL: SMMUL:
+; NODSP-NOT: smmul
+  %conv = sext i32 %a to i64
+  %conv1 = sext i32 %b to i64
+  %mul = mul nsw i64 %conv1, %conv
+  %0 = lshr i64 %mul, 32
+  %conv2 = trunc i64 %0 to i32
+  ret i32 %conv2
+}
+
+define hidden i32 @SMMLSR(i32 %a, i32 %b, i32 %c) local_unnamed_addr {
+entry:
+; CHECK-LABEL: SMMLSR:
+; CHECK: smmlsr r0, {{(r1, r2|r2, r1)}}, r0
+; NODSP-LABEL: SMMLSR:
+; NODSP-NOT: smmlsr
+  %conv6 = zext i32 %a to i64
+  %shl = shl nuw i64 %conv6, 32
+  %conv1 = sext i32 %b to i64
+  %conv2 = sext i32 %c to i64
+  %mul = mul nsw i64 %conv2, %conv1
+  %sub = or i64 %shl, 2147483648
+  %add = sub i64 %sub, %mul
+  %0 = lshr i64 %add, 32
+  %conv3 = trunc i64 %0 to i32
+  ret i32 %conv3
+}
+
+define hidden i32 @NOT_SMMLSR(i32 %a, i32 %b, i32 %c) local_unnamed_addr {
+entry:
+; CHECK-LABEL: NOT_SMMLSR:
+; CHECK-NOT: smmlsr
+; NODSP-LABEL: NOT_SMMLSR:
+; NODSP-NOT: smmlsr
+  %conv = sext i32 %b to i64
+  %conv1 = sext i32 %c to i64
+  %mul = mul nsw i64 %conv1, %conv
+  %add = add nsw i64 %mul, 2147483648
+  %0 = lshr i64 %add, 32
+  %conv2 = trunc i64 %0 to i32
+  %sub = sub nsw i32 %a, %conv2
+  ret i32 %sub
+}
+
+define hidden i32 @SMMLS(i32 %a, i32 %b, i32 %c) local_unnamed_addr {
+entry:
+; CHECK-LABEL: SMMLS:
+; CHECK: smmls r0, {{(r1, r2|r2, r1)}}, r0
+; NODSP-LABEL: SMMLS:
+; NODSP-NOT: smmls
+  %conv5 = zext i32 %a to i64
+  %shl = shl nuw i64 %conv5, 32
+  %conv1 = sext i32 %b to i64
+  %conv2 = sext i32 %c to i64
+  %mul = mul nsw i64 %conv2, %conv1
+  %sub = sub nsw i64 %shl, %mul
+  %0 = lshr i64 %sub, 32
+  %conv3 = trunc i64 %0 to i32
+  ret i32 %conv3
+}
+
+define hidden i32 @NOT_SMMLS(i32 %a, i32 %b, i32 %c) local_unnamed_addr {
+entry:
+; CHECK-LABEL: NOT_SMMLS:
+; CHECK-NOT: smmls
+; NODSP-LABEL: NOT_SMMLS:
+; NODSP-NOT: smmls
+  %conv = sext i32 %b to i64
+  %conv1 = sext i32 %c to i64
+  %mul = mul nsw i64 %conv1, %conv
+  %0 = lshr i64 %mul, 32
+  %conv2 = trunc i64 %0 to i32
+  %sub = sub nsw i32 %a, %conv2
+  ret i32 %sub
+}
+
+define hidden i32 @SMMLA(i32 %a, i32 %b, i32 %c) local_unnamed_addr {
+entry:
+; CHECK-LABEL: SMMLA:
+; CHECK: smmla r0, {{(r1, r2|r2, r1)}}, r0
+; NODSP-LABEL: SMMLA:
+; NODSP-NOT: smmla
+  %conv = sext i32 %b to i64
+  %conv1 = sext i32 %c to i64
+  %mul = mul nsw i64 %conv1, %conv
+  %0 = lshr i64 %mul, 32
+  %conv2 = trunc i64 %0 to i32
+  %add = add nsw i32 %conv2, %a
+  ret i32 %add
+}
+
+define hidden i32 @SMMLAR(i32 %a, i32 %b, i32 %c) local_unnamed_addr {
+entry:
+; CHECK-LABEL: SMMLAR:
+; CHECK: smmlar r0, {{(r1, r2|r2, r1)}}, r0
+; NODSP-LABEL: SMMLAR:
+; NODSP-NOT: smmlar
+  %conv7 = zext i32 %a to i64
+  %shl = shl nuw i64 %conv7, 32
+  %conv1 = sext i32 %b to i64
+  %conv2 = sext i32 %c to i64
+  %mul = mul nsw i64 %conv2, %conv1
+  %add = or i64 %shl, 2147483648
+  %add3 = add i64 %add, %mul
+  %0 = lshr i64 %add3, 32
+  %conv4 = trunc i64 %0 to i32
+  ret i32 %conv4
+}
+
+define hidden i32 @NOT_SMMLA(i32 %a, i32 %b, i32 %c) local_unnamed_addr {
+entry:
+; CHECK-LABEL: NOT_SMMLA:
+; CHECK-NOT: smmla
+; NODSP-LABEL: NOT_SMMLA:
+; NODSP-NOT: smmla
+  %conv = sext i32 %b to i64
+  %conv1 = sext i32 %c to i64
+  %mul = mul nsw i64 %conv1, %conv
+  %0 = lshr i64 %mul, 32
+  %conv2 = trunc i64 %0 to i32
+  %add = xor i32 %conv2, -2147483648
+  %add3 = add i32 %add, %a
+  ret i32 %add3
+}




More information about the llvm-commits mailing list