[llvm] [AArch64] Optimized rdsvl followed by constant mul (PR #162853)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 22 04:15:02 PDT 2025
https://github.com/Lukacma updated https://github.com/llvm/llvm-project/pull/162853
>From 0ff95f528147c7662bc97f3f738ae4d9fccbb2fa Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac at arm.com>
Date: Fri, 10 Oct 2025 14:14:25 +0000
Subject: [PATCH 1/2] [AArch64] Optimized rdsvl followed by constant mul
---
.../Target/AArch64/AArch64ISelLowering.cpp | 41 ++++++++
.../CodeGen/AArch64/sme-intrinsics-rdsvl.ll | 94 +++++++++++++++++++
2 files changed, 135 insertions(+)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index dc8e7c84f5e2c..1877b13a27c30 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -19579,6 +19579,47 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
if (ConstValue.sge(1) && ConstValue.sle(16))
return SDValue();
+ // Multiplying an RDSVL value by a constant can sometimes be done cheaper by
+ // folding a power-of-two factor of the constant into the RDSVL immediate and
+ // compensating with an extra shift.
+ //
+ // We rewrite:
+ // (mul (srl (rdsvl 1), 3), x)
+ // to one of:
+ // (shl (rdsvl y), z) if z > 0
+ // (srl (rdsvl y), abs(z)) if z < 0
+ // where integers y, z satisfy x = y * 2^(3 + z) and y ∈ [-32, 31].
+ if ((N0->getOpcode() == ISD::SRL) &&
+ (N0->getOperand(0).getOpcode() == AArch64ISD::RDSVL)) {
+ unsigned AbsConstValue = ConstValue.abs().getZExtValue();
+
+ // z ≤ ctz(|x|) - 3 (largest extra shift we can take while keeping y
+ // integral)
+ int UpperBound = llvm::countr_zero(AbsConstValue) - 3;
+
+ // To keep y in range, with B = 31 for x > 0 and B = 32 for x < 0, we need:
+ // 2^(3 + z) ≥ ceil(x / B) ⇒ z ≥ ceil_log2(ceil(x / B)) - 3 (LowerBound).
+ unsigned B = ConstValue.isNegative() ? 32 : 31;
+ unsigned CeilAxOverB = (AbsConstValue + (B - 1)) / B; // ceil(|x|/B)
+ int LowerBound = llvm::Log2_32_Ceil(CeilAxOverB) - 3;
+
+ // If solution exists, apply optimization.
+ if (LowerBound <= UpperBound) {
+
+ int Shift = std::min(std::max(/*prefer*/ 0, LowerBound), UpperBound);
+ int32_t RdsvlMul =
+ (AbsConstValue >> (3 + Shift)) * (ConstValue.isNegative() ? -1 : 1);
+ auto Rdsvl = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
+ DAG.getSignedConstant(RdsvlMul, DL, MVT::i32));
+
+ if (Shift == 0)
+ return Rdsvl;
+ return DAG.getNode(Shift < 0 ? ISD::SRL : ISD::SHL, DL, VT, Rdsvl,
+ DAG.getConstant(abs(Shift), DL, MVT::i32),
+ SDNodeFlags::Exact);
+ }
+ }
+
// Multiplication of a power of two plus/minus one can be done more
// cheaply as shift+add/sub. For now, this is true unilaterally. If
// future CPUs have a cheaper MADD instruction, this may need to be
diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-rdsvl.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-rdsvl.ll
index 06c53d8070781..ea0057e4cfdef 100644
--- a/llvm/test/CodeGen/AArch64/sme-intrinsics-rdsvl.ll
+++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-rdsvl.ll
@@ -86,4 +86,98 @@ define i64 @sme_cntsd_mul() {
ret i64 %res
}
+define i64 @sme_cntsb_mul_pos() {
+; CHECK-LABEL: sme_cntsb_mul_pos:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #24
+; CHECK-NEXT: lsl x0, x8, #2
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %shl = shl nuw nsw i64 %v, 3
+ %res = mul nuw nsw i64 %shl, 96
+ ret i64 %res
+}
+
+define i64 @sme_cntsh_mul_pos() {
+; CHECK-LABEL: sme_cntsh_mul_pos:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #3
+; CHECK-NEXT: lsr x0, x8, #1
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %shl = shl nuw nsw i64 %v, 2
+ %res = mul nuw nsw i64 %shl, 3
+ ret i64 %res
+}
+
+define i64 @sme_cntsw_mul_pos() {
+; CHECK-LABEL: sme_cntsw_mul_pos:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #31
+; CHECK-NEXT: lsr x0, x8, #1
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %shl = shl nuw nsw i64 %v, 1
+ %res = mul nuw nsw i64 %shl, 62
+ ret i64 %res
+}
+
+define i64 @sme_cntsd_mul_pos() {
+; CHECK-LABEL: sme_cntsd_mul_pos:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #31
+; CHECK-NEXT: lsl x0, x8, #2
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %res = mul nuw nsw i64 %v, 992
+ ret i64 %res
+}
+
+define i64 @sme_cntsb_mul_neg() {
+; CHECK-LABEL: sme_cntsb_mul_neg:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #-24
+; CHECK-NEXT: lsl x0, x8, #2
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %shl = shl nuw nsw i64 %v, 3
+ %res = mul nuw nsw i64 %shl, -96
+ ret i64 %res
+}
+
+define i64 @sme_cntsh_mul_neg() {
+; CHECK-LABEL: sme_cntsh_mul_neg:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #-3
+; CHECK-NEXT: lsr x0, x8, #1
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %shl = shl nuw nsw i64 %v, 2
+ %res = mul nuw nsw i64 %shl, -3
+ ret i64 %res
+}
+
+define i64 @sme_cntsw_mul_neg() {
+; CHECK-LABEL: sme_cntsw_mul_neg:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #-31
+; CHECK-NEXT: lsl x0, x8, #3
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %shl = shl nuw nsw i64 %v, 1
+ %res = mul nuw nsw i64 %shl, -992
+ ret i64 %res
+}
+
+define i64 @sme_cntsd_mul_neg() {
+; CHECK-LABEL: sme_cntsd_mul_neg:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #-3
+; CHECK-NEXT: lsr x0, x8, #3
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %res = mul nuw nsw i64 %v, -3
+ ret i64 %res
+}
+
declare i64 @llvm.aarch64.sme.cntsd()
>From be91b87f1add08b15640fbd204bafdc39c4f72bb Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac at arm.com>
Date: Wed, 22 Oct 2025 11:14:43 +0000
Subject: [PATCH 2/2] Make the optimization more general and refactor it
---
.../Target/AArch64/AArch64ISelLowering.cpp | 99 +++++++++++--------
1 file changed, 58 insertions(+), 41 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 1877b13a27c30..ead7dc8be82cc 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -19440,6 +19440,61 @@ static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {
Op1 ? Op1 : Mul->getOperand(1));
}
+// Multiplying an RDSVL value by a constant can sometimes be done cheaper by
+// folding a power-of-two factor of the constant into the RDSVL immediate and
+// compensating with an extra shift.
+//
+// We rewrite:
+// (mul (srl (rdsvl 1), w), x)
+// to one of:
+// (shl (rdsvl y), z) if z > 0
+// (srl (rdsvl y), abs(z)) if z < 0
+// where integers y, z satisfy x = y * 2^(w + z) and y ∈ [-32, 31].
+static SDValue performMulRdsvlCombine(SDNode *Mul, SelectionDAG &DAG) {
+ SDLoc DL(Mul);
+ EVT VT = Mul->getValueType(0);
+ SDValue MulOp0 = Mul->getOperand(0);
+ int ConstMultiplier =
+ cast<ConstantSDNode>(Mul->getOperand(1))->getSExtValue();
+ if ((MulOp0->getOpcode() != ISD::SRL) ||
+ (MulOp0->getOperand(0).getOpcode() != AArch64ISD::RDSVL))
+ return SDValue();
+
+ unsigned AbsConstValue = abs(ConstMultiplier);
+ unsigned OperandShift =
+ cast<ConstantSDNode>(MulOp0->getOperand(1))->getZExtValue();
+
+ // z ≤ ctz(|x|) - w (largest extra shift we can take while keeping y
+ // integral)
+ int UpperBound = llvm::countr_zero(AbsConstValue) - OperandShift;
+
+  // To keep y in range, with B = 31 for x > 0 and B = 32 for x < 0, we need:
+  // 2^(w + z) ≥ ceil(|x| / B) ⇒ z ≥ ceil_log2(ceil(|x| / B)) - w (LowerBound).
+ unsigned B = ConstMultiplier < 0 ? 32 : 31;
+ unsigned CeilAxOverB = (AbsConstValue + (B - 1)) / B; // ceil(|x|/B)
+ int LowerBound = llvm::Log2_32_Ceil(CeilAxOverB) - OperandShift;
+
+ // No valid solution found.
+ if (LowerBound > UpperBound)
+ return SDValue();
+
+ // Any value of z in [LowerBound, UpperBound] is valid. Prefer no extra
+ // shift if possible.
+ int Shift = std::min(std::max(/*prefer*/ 0, LowerBound), UpperBound);
+
+ // y = x / 2^(w + z)
+ int32_t RdsvlMul = (AbsConstValue >> (OperandShift + Shift)) *
+ (ConstMultiplier < 0 ? -1 : 1);
+ auto Rdsvl = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
+ DAG.getSignedConstant(RdsvlMul, DL, MVT::i32));
+
+ if (Shift == 0)
+ return Rdsvl;
+ return DAG.getNode(Shift < 0 ? ISD::SRL : ISD::SHL, DL, VT, Rdsvl,
+ DAG.getConstant(abs(Shift), DL, MVT::i32),
+ SDNodeFlags::Exact);
+}
+
// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
// Same for other types with equivalent constants.
static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) {
@@ -19568,6 +19623,9 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
if (!isa<ConstantSDNode>(N1))
return SDValue();
+ if (SDValue Ext = performMulRdsvlCombine(N, DAG))
+ return Ext;
+
ConstantSDNode *C = cast<ConstantSDNode>(N1);
const APInt &ConstValue = C->getAPIntValue();
@@ -19579,47 +19637,6 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
if (ConstValue.sge(1) && ConstValue.sle(16))
return SDValue();
- // Multiplying an RDSVL value by a constant can sometimes be done cheaper by
- // folding a power-of-two factor of the constant into the RDSVL immediate and
- // compensating with an extra shift.
- //
- // We rewrite:
- // (mul (srl (rdsvl 1), 3), x)
- // to one of:
- // (shl (rdsvl y), z) if z > 0
- // (srl (rdsvl y), abs(z)) if z < 0
- // where integers y, z satisfy x = y * 2^(3 + z) and y ∈ [-32, 31].
- if ((N0->getOpcode() == ISD::SRL) &&
- (N0->getOperand(0).getOpcode() == AArch64ISD::RDSVL)) {
- unsigned AbsConstValue = ConstValue.abs().getZExtValue();
-
- // z ≤ ctz(|x|) - 3 (largest extra shift we can take while keeping y
- // integral)
- int UpperBound = llvm::countr_zero(AbsConstValue) - 3;
-
- // To keep y in range, with B = 31 for x > 0 and B = 32 for x < 0, we need:
- // 2^(3 + z) ≥ ceil(x / B) ⇒ z ≥ ceil_log2(ceil(x / B)) - 3 (LowerBound).
- unsigned B = ConstValue.isNegative() ? 32 : 31;
- unsigned CeilAxOverB = (AbsConstValue + (B - 1)) / B; // ceil(|x|/B)
- int LowerBound = llvm::Log2_32_Ceil(CeilAxOverB) - 3;
-
- // If solution exists, apply optimization.
- if (LowerBound <= UpperBound) {
-
- int Shift = std::min(std::max(/*prefer*/ 0, LowerBound), UpperBound);
- int32_t RdsvlMul =
- (AbsConstValue >> (3 + Shift)) * (ConstValue.isNegative() ? -1 : 1);
- auto Rdsvl = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
- DAG.getSignedConstant(RdsvlMul, DL, MVT::i32));
-
- if (Shift == 0)
- return Rdsvl;
- return DAG.getNode(Shift < 0 ? ISD::SRL : ISD::SHL, DL, VT, Rdsvl,
- DAG.getConstant(abs(Shift), DL, MVT::i32),
- SDNodeFlags::Exact);
- }
- }
-
// Multiplication of a power of two plus/minus one can be done more
// cheaply as shift+add/sub. For now, this is true unilaterally. If
// future CPUs have a cheaper MADD instruction, this may need to be
More information about the llvm-commits
mailing list