[llvm] [AArch64] Optimize vector multiplications by certain constants for v2i64 (PR #183827)

Wed Mar 25 13:56:37 PDT 2026

https://github.com/AlyElashram updated https://github.com/llvm/llvm-project/pull/183827

>From 825a454ad3788d17831361fc5e02d51181186355 Mon Sep 17 00:00:00 2001
From: AlyElashram <alyahelashram at gmail.com>
Date: Fri, 27 Feb 2026 22:21:34 +0200
Subject: [PATCH 1/9] Optimize mul by a splat of (2^n +/- 1) into shl plus
 add/sub

---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 42 ++++++++++++++++
 llvm/test/CodeGen/AArch64/neon-mul-shl.ll     | 48 +++++++++++++++++++
 2 files changed, 90 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/neon-mul-shl.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index eb6e9146e3839..0b64a201efde5 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -20434,6 +20434,44 @@ static SDValue performVectorExtCombine(SDNode *N, SelectionDAG &DAG) {
   return SDValue();
 }
 
+// Transform mul<v2i64, splat(2^n +-1)> into a SHL and ADD/SUB
+// this transormation is much faster when vector mul is not supported
+static SDValue convertMulToShlAdd(SDNode *N, SelectionDAG &DAG) {
+  const SDNode *Operand = N->getOperand(1).getNode();
+  APInt SplatValue;
+  ISD::isConstantSplatVector(Operand, SplatValue);
+
+  // Not a constant splat so should just stay as a mulitplcation operation
+  if (!SplatValue.getBoolValue())
+    return SDValue();
+
+  // If (Value - 1) is a power of 2, we need an ADD (e.g., 257)
+  bool NeedsAdd = (SplatValue - 1).isPowerOf2();
+  bool NeedsSub = (SplatValue + 1).isPowerOf2();
+
+  // If the constant is not (2^n + 1) or (2^n - 1), it would require
+  // more than one addition/subtraction. For v2i64, the cost of
+  // multiple vector adds/shifts often exceeds the cost of
+  // scalarization (moving to GPRs to use a single MUL).
+  if (!NeedsSub && !NeedsAdd)
+    return SDValue();
+
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+  SDValue LHS = N->getOperand(0);
+
+  unsigned ShiftAmt =
+      NeedsAdd ? (SplatValue - 1).logBase2() : (SplatValue + 1).logBase2();
+  SDValue VecShiftAmt = DAG.getConstant(ShiftAmt, DL, VT);
+  SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, VT, LHS, VecShiftAmt);
+
+  // Emit: (LHS << ShiftAmt) +- LHS
+  if (NeedsAdd) {
+    return DAG.getNode(ISD::ADD, DL, VT, ShiftNode, LHS);
+  }
+  return DAG.getNode(ISD::SUB, DL, VT, ShiftNode, LHS);
+}
+
 static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const AArch64Subtarget *Subtarget) {
@@ -20444,6 +20482,10 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
     return Ext;
   if (SDValue Ext = performVectorExtCombine(N, DAG))
     return Ext;
+  if(Subtarget->isNeonAvailable()) {
+    if (SDValue Ext = convertMulToShlAdd(N, DAG))
+        return Ext;
+  }
 
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
diff --git a/llvm/test/CodeGen/AArch64/neon-mul-shl.ll b/llvm/test/CodeGen/AArch64/neon-mul-shl.ll
new file mode 100644
index 0000000000000..69890260b840e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-mul-shl.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64 -mattr=+neon | FileCheck %s
+
+define <2 x i64> @mul_v2i64_257(<2 x i64> %v) {
+; CHECK-LABEL: mul_v2i64_257:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v1.2d, v0.2d, #8
+; CHECK-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-NEXT:    ret
+  %mul = mul <2 x i64> %v, splat (i64 257)
+  ret <2 x i64> %mul
+}
+
+define <2 x i64> @mul_v2i64_255(<2 x i64> %v) {
+; CHECK-LABEL: mul_v2i64_255:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v1.2d, v0.2d, #8
+; CHECK-NEXT:    sub v0.2d, v1.2d, v0.2d
+; CHECK-NEXT:    ret
+  %mul = mul <2 x i64> %v, splat (i64 255)
+  ret <2 x i64> %mul
+}
+
+define <2 x i64> @mul_v2i64_9(<2 x i64> %v) {
+; CHECK-LABEL: mul_v2i64_9:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v1.2d, v0.2d, #3
+; CHECK-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-NEXT:    ret
+  %mul = mul <2 x i64> %v, splat (i64 9)
+  ret <2 x i64> %mul
+}
+
+;; This should not be optimized
+define <2 x i64> @mul_v2i64_13_no_opt(<2 x i64> %v) {
+; CHECK-LABEL: mul_v2i64_13_no_opt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov x10, d0
+; CHECK-NEXT:    mov w8, #13 // =0xd
+; CHECK-NEXT:    mov x9, v0.d[1]
+; CHECK-NEXT:    mul x10, x10, x8
+; CHECK-NEXT:    mul x8, x9, x8
+; CHECK-NEXT:    fmov d0, x10
+; CHECK-NEXT:    mov v0.d[1], x8
+; CHECK-NEXT:    ret
+  %mul = mul <2 x i64> %v, splat (i64 13)
+  ret <2 x i64> %mul
+}

>From 11940e41cd33e55ba8200bb6d53db9d34ffe4fa4 Mon Sep 17 00:00:00 2001
From: AlyElashram <alyahelashram at gmail.com>
Date: Fri, 27 Feb 2026 22:23:39 +0200
Subject: [PATCH 2/9] Add missed vector size check

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 0b64a201efde5..fe18c853138b0 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -20482,7 +20482,7 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
     return Ext;
   if (SDValue Ext = performVectorExtCombine(N, DAG))
     return Ext;
-  if(Subtarget->isNeonAvailable()) {
+  if(Subtarget->isNeonAvailable() && N->getValueType(0) == MVT::v2i64) {
     if (SDValue Ext = convertMulToShlAdd(N, DAG))
         return Ext;
   }

>From 8fa911c96875a1c1464a54924aaa3eeb69f7dd03 Mon Sep 17 00:00:00 2001
From: AlyElashram <alyahelashram at gmail.com>
Date: Fri, 27 Feb 2026 22:28:58 +0200
Subject: [PATCH 3/9] Formatting

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index fe18c853138b0..a47db2a010892 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -20482,7 +20482,7 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
     return Ext;
   if (SDValue Ext = performVectorExtCombine(N, DAG))
     return Ext;
-  if(Subtarget->isNeonAvailable() && N->getValueType(0) == MVT::v2i64) {
+  if (Subtarget->isNeonAvailable() && N->getValueType(0) == MVT::v2i64) {
     if (SDValue Ext = convertMulToShlAdd(N, DAG))
         return Ext;
   }

>From 9afa4be127dee034a10e7ce6156766230b119f48 Mon Sep 17 00:00:00 2001
From: AlyElashram <alyahelashram at gmail.com>
Date: Thu, 5 Mar 2026 20:02:44 +0200
Subject: [PATCH 4/9] Update the optimization so that it happens after the
 s/umull selection

---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 84 +++++++++----------
 1 file changed, 40 insertions(+), 44 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a47db2a010892..c56c9c7aacc1b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5919,6 +5919,44 @@ static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
   return 0;
 }
 
+// Transform mul<v2i64, splat(2^n +-1)> into a SHL and ADD/SUB
+// this transormation is much faster when vector mul is not supported
+static SDValue convertMulToShlAdd(SDNode *N, SelectionDAG &DAG) {
+  const SDNode *Operand = N->getOperand(1).getNode();
+  APInt SplatValue;
+  ISD::isConstantSplatVector(Operand, SplatValue);
+
+  // Not a constant splat so should just stay as a mulitplcation operation
+  if (!SplatValue.getBoolValue())
+    return SDValue();
+
+  // If (Value - 1) is a power of 2, we need an ADD (e.g., 257)
+  bool NeedsAdd = (SplatValue - 1).isPowerOf2();
+  bool NeedsSub = (SplatValue + 1).isPowerOf2();
+
+  // If the constant is not (2^n + 1) or (2^n - 1), it would require
+  // more than one addition/subtraction. For v2i64, the cost of
+  // multiple vector adds/shifts often exceeds the cost of
+  // scalarization (moving to GPRs to use a single MUL).
+  if (!NeedsSub && !NeedsAdd)
+    return SDValue();
+
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+  SDValue LHS = N->getOperand(0);
+
+  unsigned ShiftAmt =
+      NeedsAdd ? (SplatValue - 1).logBase2() : (SplatValue + 1).logBase2();
+  SDValue VecShiftAmt = DAG.getConstant(ShiftAmt, DL, VT);
+  SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, VT, LHS, VecShiftAmt);
+
+  // Emit: (LHS << ShiftAmt) +- LHS
+  if (NeedsAdd) {
+    return DAG.getNode(ISD::ADD, DL, VT, ShiftNode, LHS);
+  }
+  return DAG.getNode(ISD::SUB, DL, VT, ShiftNode, LHS);
+}
+
 SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
 
@@ -5965,7 +6003,8 @@ SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
       // legal.
       if (Subtarget->hasSVE())
         return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
-      // Fall through to expand this.  It is not legal.
+      if (SDValue ShlAdd = convertMulToShlAdd(Op.getNode(), DAG))
+        return ShlAdd;
       return SDValue();
     } else
       // Other vector multiplications are legal.
@@ -20434,44 +20473,6 @@ static SDValue performVectorExtCombine(SDNode *N, SelectionDAG &DAG) {
   return SDValue();
 }
 
-// Transform mul<v2i64, splat(2^n +-1)> into a SHL and ADD/SUB
-// this transormation is much faster when vector mul is not supported
-static SDValue convertMulToShlAdd(SDNode *N, SelectionDAG &DAG) {
-  const SDNode *Operand = N->getOperand(1).getNode();
-  APInt SplatValue;
-  ISD::isConstantSplatVector(Operand, SplatValue);
-
-  // Not a constant splat so should just stay as a mulitplcation operation
-  if (!SplatValue.getBoolValue())
-    return SDValue();
-
-  // If (Value - 1) is a power of 2, we need an ADD (e.g., 257)
-  bool NeedsAdd = (SplatValue - 1).isPowerOf2();
-  bool NeedsSub = (SplatValue + 1).isPowerOf2();
-
-  // If the constant is not (2^n + 1) or (2^n - 1), it would require
-  // more than one addition/subtraction. For v2i64, the cost of
-  // multiple vector adds/shifts often exceeds the cost of
-  // scalarization (moving to GPRs to use a single MUL).
-  if (!NeedsSub && !NeedsAdd)
-    return SDValue();
-
-  SDLoc DL(N);
-  EVT VT = N->getValueType(0);
-  SDValue LHS = N->getOperand(0);
-
-  unsigned ShiftAmt =
-      NeedsAdd ? (SplatValue - 1).logBase2() : (SplatValue + 1).logBase2();
-  SDValue VecShiftAmt = DAG.getConstant(ShiftAmt, DL, VT);
-  SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, VT, LHS, VecShiftAmt);
-
-  // Emit: (LHS << ShiftAmt) +- LHS
-  if (NeedsAdd) {
-    return DAG.getNode(ISD::ADD, DL, VT, ShiftNode, LHS);
-  }
-  return DAG.getNode(ISD::SUB, DL, VT, ShiftNode, LHS);
-}
-
 static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const AArch64Subtarget *Subtarget) {
@@ -20482,11 +20483,6 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
     return Ext;
   if (SDValue Ext = performVectorExtCombine(N, DAG))
     return Ext;
-  if (Subtarget->isNeonAvailable() && N->getValueType(0) == MVT::v2i64) {
-    if (SDValue Ext = convertMulToShlAdd(N, DAG))
-        return Ext;
-  }
-
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 

>From 46a09cb7ae6881bf8bdedbd2bcc756b6696dd6ca Mon Sep 17 00:00:00 2001
From: AlyElashram <alyahelashram at gmail.com>
Date: Sat, 7 Mar 2026 00:59:26 +0200
Subject: [PATCH 5/9] Address PR Comments

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c56c9c7aacc1b..f1c8ad24826d2 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5920,14 +5920,14 @@ static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
 }
 
 // Transform mul<v2i64, splat(2^n +-1)> into a SHL and ADD/SUB
-// this transormation is much faster when vector mul is not supported
+// this transformation is much faster when vector mul is not supported
 static SDValue convertMulToShlAdd(SDNode *N, SelectionDAG &DAG) {
   const SDNode *Operand = N->getOperand(1).getNode();
   APInt SplatValue;
-  ISD::isConstantSplatVector(Operand, SplatValue);
 
-  // Not a constant splat so should just stay as a mulitplcation operation
-  if (!SplatValue.getBoolValue())
+  // Not a constant splat so should just stay as a multiplication operation
+  if (!ISD::isConstantSplatVector(Operand, SplatValue) ||
+      !SplatValue.getBoolValue())
     return SDValue();
 
   // If (Value - 1) is a power of 2, we need an ADD (e.g., 257)
@@ -5951,10 +5951,7 @@ static SDValue convertMulToShlAdd(SDNode *N, SelectionDAG &DAG) {
   SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, VT, LHS, VecShiftAmt);
 
   // Emit: (LHS << ShiftAmt) +- LHS
-  if (NeedsAdd) {
-    return DAG.getNode(ISD::ADD, DL, VT, ShiftNode, LHS);
-  }
-  return DAG.getNode(ISD::SUB, DL, VT, ShiftNode, LHS);
+  return DAG.getNode(NeedsAdd ? ISD::ADD : ISD::SUB, DL, VT, ShiftNode, LHS);
 }
 
 SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
@@ -6003,8 +6000,10 @@ SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
       // legal.
       if (Subtarget->hasSVE())
         return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
+      // Try to optimize the mul to a shift left and add instead of scalarizing.
       if (SDValue ShlAdd = convertMulToShlAdd(Op.getNode(), DAG))
         return ShlAdd;
+      // Fall through to expanding as the mul is not legal.
       return SDValue();
     } else
       // Other vector multiplications are legal.

>From 01505b3614e75844376da3f032e533bf4cd154dc Mon Sep 17 00:00:00 2001
From: AlyElashram <alyahelashram at gmail.com>
Date: Sun, 15 Mar 2026 20:01:53 +0200
Subject: [PATCH 6/9] Add the two-SHL optimization for constants of the form
 2^N +/- 2^M

---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 72 ++++++++++++++-----
 1 file changed, 53 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index f1c8ad24826d2..0a41e4246745c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5919,8 +5919,21 @@ static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
   return 0;
 }
 
-// Transform mul<v2i64, splat(2^n +-1)> into a SHL and ADD/SUB
-// this transformation is much faster when vector mul is not supported
+// Transform mul<v2i64, splat(const)> into a SHL and ADD/SUB
+// (1) multiply-by-(power-of-2 +/- 1) into shift and add/sub.
+// mul x, (2^N + 1) --> add (shl x, N), x
+// mul x, (2^N - 1) --> sub (shl x, N), x
+// Examples: x * 33 --> (x << 5) + x
+//           x * 15 --> (x << 4) - x
+//           x * -33 --> -((x << 5) + x)
+//           x * -15 --> -((x << 4) - x) ; this reduces --> x - (x << 4)
+// (2) multiply-by-(power-of-2 +/- power-of-2) into shifts and add/sub.
+// mul x, (2^N + 2^M) --> (add (shl x, N), (shl x, M))
+// mul x, (2^N - 2^M) --> (sub (shl x, N), (shl x, M))
+// Examples: x * 0x8800 --> (x << 15) + (x << 11)
+//           x * 0xf800 --> (x << 16) - (x << 11)
+//           x * -0x8800 --> -((x << 15) + (x << 11))
+//           x * -0xf800 --> -((x << 16) - (x << 11)) ; (x << 11) - (x << 16)
 static SDValue convertMulToShlAdd(SDNode *N, SelectionDAG &DAG) {
   const SDNode *Operand = N->getOperand(1).getNode();
   APInt SplatValue;
@@ -5930,28 +5943,49 @@ static SDValue convertMulToShlAdd(SDNode *N, SelectionDAG &DAG) {
       !SplatValue.getBoolValue())
     return SDValue();
 
-  // If (Value - 1) is a power of 2, we need an ADD (e.g., 257)
-  bool NeedsAdd = (SplatValue - 1).isPowerOf2();
-  bool NeedsSub = (SplatValue + 1).isPowerOf2();
+  bool IsNegative = SplatValue.isNegative();
+  SplatValue = SplatValue.abs();
+  // Placeholder for MathOp
+  unsigned MathOp = ISD::DELETED_NODE;
+  // The constant `2` should be treated as (2^0 + 1).
+  unsigned TZeros = SplatValue == 2 ? 0 : SplatValue.countr_zero();
+
+  // Shift the splat value by all the zeros , this won't affect the parrity
+  // this will help us find the first and second multiple to use.
+  SplatValue.lshrInPlace(TZeros);
+
+  if ((SplatValue - 1).isPowerOf2())
+    MathOp = ISD::ADD;
+  else if ((SplatValue + 1).isPowerOf2())
+    MathOp = ISD::SUB;
 
   // If the constant is not (2^n + 1) or (2^n - 1), it would require
   // more than one addition/subtraction. For v2i64, the cost of
   // multiple vector adds/shifts often exceeds the cost of
   // scalarization (moving to GPRs to use a single MUL).
-  if (!NeedsSub && !NeedsAdd)
-    return SDValue();
-
-  SDLoc DL(N);
-  EVT VT = N->getValueType(0);
-  SDValue LHS = N->getOperand(0);
-
-  unsigned ShiftAmt =
-      NeedsAdd ? (SplatValue - 1).logBase2() : (SplatValue + 1).logBase2();
-  SDValue VecShiftAmt = DAG.getConstant(ShiftAmt, DL, VT);
-  SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, VT, LHS, VecShiftAmt);
-
-  // Emit: (LHS << ShiftAmt) +- LHS
-  return DAG.getNode(NeedsAdd ? ISD::ADD : ISD::SUB, DL, VT, ShiftNode, LHS);
+  if (MathOp != ISD::DELETED_NODE) {
+    SDLoc DL(N);
+    EVT VT = N->getValueType(0);
+    SDValue LHS = N->getOperand(0);
+
+    unsigned ShiftAmt = MathOp == ISD::ADD ? (SplatValue - 1).logBase2()
+                                           : (SplatValue + 1).logBase2();
+    ShiftAmt += TZeros;
+
+    // assert(ShiftAmt < BitWidth &&
+    //        "multiply-by-constant generated out of bounds shift");
+    SDValue Shl =
+        DAG.getNode(ISD::SHL, DL, VT, LHS, DAG.getConstant(ShiftAmt, DL, VT));
+    SDValue Combined =
+        TZeros ? DAG.getNode(MathOp, DL, VT, Shl,
+                             DAG.getNode(ISD::SHL, DL, VT, LHS,
+                                         DAG.getConstant(TZeros, DL, VT)))
+               : DAG.getNode(MathOp, DL, VT, Shl, LHS);
+    if (IsNegative)
+      Combined = DAG.getNegative(Combined, DL, VT);
+    return Combined;
+  }
+  return SDValue();
 }
 
 SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {

>From 5ba22f58408841d240180994c6c49031ff7d4af4 Mon Sep 17 00:00:00 2001
From: AlyElashram <alyahelashram at gmail.com>
Date: Sun, 15 Mar 2026 20:08:15 +0200
Subject: [PATCH 7/9] Amend the tests to cover the new cases as well as the
 negatives.

---
 llvm/test/CodeGen/AArch64/neon-mul-shl.ll | 52 +++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/llvm/test/CodeGen/AArch64/neon-mul-shl.ll b/llvm/test/CodeGen/AArch64/neon-mul-shl.ll
index 69890260b840e..95d1da84ff5d4 100644
--- a/llvm/test/CodeGen/AArch64/neon-mul-shl.ll
+++ b/llvm/test/CodeGen/AArch64/neon-mul-shl.ll
@@ -31,6 +31,58 @@ define <2 x i64> @mul_v2i64_9(<2 x i64> %v) {
   ret <2 x i64> %mul
 }
 
+define <2 x i64> @mul_v2i64_neg_33(<2 x i64> %v) {
+; CHECK-LABEL: mul_v2i64_neg_33:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v1.2d, v0.2d, #5
+; CHECK-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-NEXT:    neg v0.2d, v0.2d
+; CHECK-NEXT:    ret
+  %mul = mul <2 x i64> %v, splat (i64 -33)
+  ret <2 x i64> %mul
+}
+
+define <2 x i64> @mul_v2i64_neg_15(<2 x i64> %v) {
+; CHECK-LABEL: mul_v2i64_neg_15:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v1.2d, v0.2d, #4
+; CHECK-NEXT:    sub v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    ret
+  %mul = mul <2 x i64> %v, splat (i64 -15)
+  ret <2 x i64> %mul
+}
+define <2 x i64> @mul_v2i64_8800(<2 x i64> %v) {
+; CHECK-LABEL: mul_v2i64_8800:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v1.2d, v0.2d, #11
+; CHECK-NEXT:    shl v0.2d, v0.2d, #15
+; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    ret
+  %mul = mul <2 x i64> %v, splat (i64 34816) ; 0x8800
+  ret <2 x i64> %mul
+}
+
+define <2 x i64> @mul_v2i64_f800(<2 x i64> %v) {
+; CHECK-LABEL: mul_v2i64_f800:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v1.2d, v0.2d, #11
+; CHECK-NEXT:    shl v0.2d, v0.2d, #16
+; CHECK-NEXT:    sub v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    ret
+  %mul = mul <2 x i64> %v, splat (i64 63488) ; 0xf800
+  ret <2 x i64> %mul
+}
+
+;; A power-of-2 multiply should stay a single shl; no add/sub should be emitted.
+define <2 x i64> @mul_v2i64_256(<2 x i64> %v) {
+; CHECK-LABEL: mul_v2i64_256:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v0.2d, v0.2d, #8
+; CHECK-NEXT:    ret
+  %mul = mul <2 x i64> %v, splat (i64 256)
+  ret <2 x i64> %mul
+}
+
 ;; This should not be optimized
 define <2 x i64> @mul_v2i64_13_no_opt(<2 x i64> %v) {
 ; CHECK-LABEL: mul_v2i64_13_no_opt:

>From deaaad3377dec55d5f1f18efcfdebf6bab0d90fa Mon Sep 17 00:00:00 2001
From: AlyElashram <alyahelashram at gmail.com>
Date: Tue, 17 Mar 2026 02:25:17 +0200
Subject: [PATCH 8/9] Address PR Comments

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 0a41e4246745c..53eca84591ed6 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5947,10 +5947,9 @@ static SDValue convertMulToShlAdd(SDNode *N, SelectionDAG &DAG) {
   SplatValue = SplatValue.abs();
   // Placeholder for MathOp
   unsigned MathOp = ISD::DELETED_NODE;
-  // The constant `2` should be treated as (2^0 + 1).
-  unsigned TZeros = SplatValue == 2 ? 0 : SplatValue.countr_zero();
+  unsigned TZeros = SplatValue.countr_zero();
 
-  // Shift the splat value by all the zeros , this won't affect the parrity
+  // Shift the splat value by all the zeros, this won't affect the parrity
   // this will help us find the first and second multiple to use.
   SplatValue.lshrInPlace(TZeros);
 
@@ -5972,15 +5971,14 @@ static SDValue convertMulToShlAdd(SDNode *N, SelectionDAG &DAG) {
                                            : (SplatValue + 1).logBase2();
     ShiftAmt += TZeros;
 
-    // assert(ShiftAmt < BitWidth &&
-    //        "multiply-by-constant generated out of bounds shift");
     SDValue Shl =
         DAG.getNode(ISD::SHL, DL, VT, LHS, DAG.getConstant(ShiftAmt, DL, VT));
+
+    SDValue DoubleShl = DAG.getNode(
+        MathOp, DL, VT, Shl,
+        DAG.getNode(ISD::SHL, DL, VT, LHS, DAG.getConstant(TZeros, DL, VT)));
     SDValue Combined =
-        TZeros ? DAG.getNode(MathOp, DL, VT, Shl,
-                             DAG.getNode(ISD::SHL, DL, VT, LHS,
-                                         DAG.getConstant(TZeros, DL, VT)))
-               : DAG.getNode(MathOp, DL, VT, Shl, LHS);
+        TZeros ? DoubleShl : DAG.getNode(MathOp, DL, VT, Shl, LHS);
     if (IsNegative)
       Combined = DAG.getNegative(Combined, DL, VT);
     return Combined;

>From ed41fe2e7aa6bfc1a28e1f2682d5a3ecc5a564f9 Mon Sep 17 00:00:00 2001
From: AlyElashram <alyahelashram at gmail.com>
Date: Wed, 25 Mar 2026 22:55:21 +0200
Subject: [PATCH 9/9] Format as newLHS and combined

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 9a49309212121..8602422e35974 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5974,7 +5974,7 @@ static SDValue convertMulToShlAdd(SDNode *N, SelectionDAG &DAG) {
   unsigned MathOp = ISD::DELETED_NODE;
   unsigned TZeros = SplatValue.countr_zero();
 
-  // Shift the splat value by all the zeros, this won't affect the parrity
+  // Shift the splat value by all the zeros, this won't affect the parity
   // this will help us find the first and second multiple to use.
   SplatValue.lshrInPlace(TZeros);
 
@@ -5999,11 +5999,10 @@ static SDValue convertMulToShlAdd(SDNode *N, SelectionDAG &DAG) {
     SDValue Shl =
         DAG.getNode(ISD::SHL, DL, VT, LHS, DAG.getConstant(ShiftAmt, DL, VT));
 
-    SDValue DoubleShl = DAG.getNode(
-        MathOp, DL, VT, Shl,
-        DAG.getNode(ISD::SHL, DL, VT, LHS, DAG.getConstant(TZeros, DL, VT)));
-    SDValue Combined =
-        TZeros ? DoubleShl : DAG.getNode(MathOp, DL, VT, Shl, LHS);
+    SDValue NewLHS = TZeros ? DAG.getNode(ISD::SHL, DL, VT, LHS,
+                                          DAG.getConstant(TZeros, DL, VT))
+                            : LHS;
+    SDValue Combined = DAG.getNode(MathOp, DL, VT, Shl, NewLHS);
     if (IsNegative)
       Combined = DAG.getNegative(Combined, DL, VT);
     return Combined;