[llvm] [AArch64] Optimized rdsvl followed by constant mul (PR #162853)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 22 04:15:02 PDT 2025
https://github.com/Lukacma updated https://github.com/llvm/llvm-project/pull/162853
>From 0ff95f528147c7662bc97f3f738ae4d9fccbb2fa Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac at arm.com>
Date: Fri, 10 Oct 2025 14:14:25 +0000
Subject: [PATCH 1/2] [AArch64] Optimized rdsvl followed by constant mul
---
.../Target/AArch64/AArch64ISelLowering.cpp | 41 ++++++++
.../CodeGen/AArch64/sme-intrinsics-rdsvl.ll | 94 +++++++++++++++++++
2 files changed, 135 insertions(+)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index dc8e7c84f5e2c..1877b13a27c30 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -19579,6 +19579,47 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
if (ConstValue.sge(1) && ConstValue.sle(16))
return SDValue();
+ // Multiplying an RDSVL value by a constant can sometimes be done cheaper by
+ // folding a power-of-two factor of the constant into the RDSVL immediate and
+ // compensating with an extra shift.
+ //
+ // We rewrite:
+ // (mul (srl (rdsvl 1), 3), x)
+ // to one of:
+ // (shl (rdsvl y), z) if z > 0
+ // (srl (rdsvl y), abs(z)) if z < 0
+ // where integers y, z satisfy x = y * 2^(3 + z) and y ∈ [-32, 31].
+ if ((N0->getOpcode() == ISD::SRL) &&
+ (N0->getOperand(0).getOpcode() == AArch64ISD::RDSVL)) {
+ unsigned AbsConstValue = ConstValue.abs().getZExtValue();
+
+ // z ≤ ctz(|x|) - 3 (largest extra shift we can take while keeping y
+ // integral)
+ int UpperBound = llvm::countr_zero(AbsConstValue) - 3;
+
+ // To keep y in range, with B = 31 for x > 0 and B = 32 for x < 0, we need:
+ // 2^(3 + z) ≥ ceil(x / B) ⇒ z ≥ ceil_log2(ceil(x / B)) - 3 (LowerBound).
+ unsigned B = ConstValue.isNegative() ? 32 : 31;
+ unsigned CeilAxOverB = (AbsConstValue + (B - 1)) / B; // ceil(|x|/B)
+ int LowerBound = llvm::Log2_32_Ceil(CeilAxOverB) - 3;
+
+ // If solution exists, apply optimization.
+ if (LowerBound <= UpperBound) {
+
+ int Shift = std::min(std::max(/*prefer*/ 0, LowerBound), UpperBound);
+ int32_t RdsvlMul =
+ (AbsConstValue >> (3 + Shift)) * (ConstValue.isNegative() ? -1 : 1);
+ auto Rdsvl = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
+ DAG.getSignedConstant(RdsvlMul, DL, MVT::i32));
+
+ if (Shift == 0)
+ return Rdsvl;
+ return DAG.getNode(Shift < 0 ? ISD::SRL : ISD::SHL, DL, VT, Rdsvl,
+ DAG.getConstant(abs(Shift), DL, MVT::i32),
+ SDNodeFlags::Exact);
+ }
+ }
+
// Multiplication of a power of two plus/minus one can be done more
// cheaply as shift+add/sub. For now, this is true unilaterally. If
// future CPUs have a cheaper MADD instruction, this may need to be
diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-rdsvl.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-rdsvl.ll
index 06c53d8070781..ea0057e4cfdef 100644
--- a/llvm/test/CodeGen/AArch64/sme-intrinsics-rdsvl.ll
+++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-rdsvl.ll
@@ -86,4 +86,98 @@ define i64 @sme_cntsd_mul() {
ret i64 %res
}
+define i64 @sme_cntsb_mul_pos() {
+; CHECK-LABEL: sme_cntsb_mul_pos:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #24
+; CHECK-NEXT: lsl x0, x8, #2
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %shl = shl nuw nsw i64 %v, 3
+ %res = mul nuw nsw i64 %shl, 96
+ ret i64 %res
+}
+
+define i64 @sme_cntsh_mul_pos() {
+; CHECK-LABEL: sme_cntsh_mul_pos:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #3
+; CHECK-NEXT: lsr x0, x8, #1
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %shl = shl nuw nsw i64 %v, 2
+ %res = mul nuw nsw i64 %shl, 3
+ ret i64 %res
+}
+
+define i64 @sme_cntsw_mul_pos() {
+; CHECK-LABEL: sme_cntsw_mul_pos:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #31
+; CHECK-NEXT: lsr x0, x8, #1
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %shl = shl nuw nsw i64 %v, 1
+ %res = mul nuw nsw i64 %shl, 62
+ ret i64 %res
+}
+
+define i64 @sme_cntsd_mul_pos() {
+; CHECK-LABEL: sme_cntsd_mul_pos:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #31
+; CHECK-NEXT: lsl x0, x8, #2
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %res = mul nuw nsw i64 %v, 992
+ ret i64 %res
+}
+
+define i64 @sme_cntsb_mul_neg() {
+; CHECK-LABEL: sme_cntsb_mul_neg:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #-24
+; CHECK-NEXT: lsl x0, x8, #2
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %shl = shl nuw nsw i64 %v, 3
+ %res = mul nuw nsw i64 %shl, -96
+ ret i64 %res
+}
+
+define i64 @sme_cntsh_mul_neg() {
+; CHECK-LABEL: sme_cntsh_mul_neg:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #-3
+; CHECK-NEXT: lsr x0, x8, #1
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %shl = shl nuw nsw i64 %v, 2
+ %res = mul nuw nsw i64 %shl, -3
+ ret i64 %res
+}
+
+define i64 @sme_cntsw_mul_neg() {
+; CHECK-LABEL: sme_cntsw_mul_neg:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #-31
+; CHECK-NEXT: lsl x0, x8, #3
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %shl = shl nuw nsw i64 %v, 1
+ %res = mul nuw nsw i64 %shl, -992
+ ret i64 %res
+}
+
+define i64 @sme_cntsd_mul_neg() {
+; CHECK-LABEL: sme_cntsd_mul_neg:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #-3
+; CHECK-NEXT: lsr x0, x8, #3
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %res = mul nuw nsw i64 %v, -3
+ ret i64 %res
+}
+
declare i64 @llvm.aarch64.sme.cntsd()
>From be91b87f1add08b15640fbd204bafdc39c4f72bb Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac at arm.com>
Date: Wed, 22 Oct 2025 11:14:43 +0000
Subject: [PATCH 2/2] Make the optimization more general and refactor it
---
.../Target/AArch64/AArch64ISelLowering.cpp | 99 +++++++++++--------
1 file changed, 58 insertions(+), 41 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 1877b13a27c30..ead7dc8be82cc 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -19440,6 +19440,61 @@ static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {
Op1 ? Op1 : Mul->getOperand(1));
}
+// Multiplying an RDSVL value by a constant can sometimes be done cheaper by
+// folding a power-of-two factor of the constant into the RDSVL immediate and
+// compensating with an extra shift.
+//
+// We rewrite:
+// (mul (srl (rdsvl 1), w), x)
+// to one of:
+// (shl (rdsvl y), z) if z > 0
+// (srl (rdsvl y), abs(z)) if z < 0
+// where integers y, z satisfy x = y * 2^(w + z) and y ∈ [-32, 31].
+static SDValue performMulRdsvlCombine(SDNode *Mul, SelectionDAG &DAG) {
+ SDLoc DL(Mul);
+ EVT VT = Mul->getValueType(0);
+ SDValue MulOp0 = Mul->getOperand(0);
+ int ConstMultiplier =
+ cast<ConstantSDNode>(Mul->getOperand(1))->getSExtValue();
+ if ((MulOp0->getOpcode() != ISD::SRL) ||
+ (MulOp0->getOperand(0).getOpcode() != AArch64ISD::RDSVL))
+ return SDValue();
+
+ unsigned AbsConstValue = abs(ConstMultiplier);
+ unsigned OperandShift =
+ cast<ConstantSDNode>(MulOp0->getOperand(1))->getZExtValue();
+
+ // z ≤ ctz(|x|) - w (largest extra shift we can take while keeping y
+ // integral)
+ int UpperBound = llvm::countr_zero(AbsConstValue) - OperandShift;
+
+  // To keep y in range, with B = 31 for x > 0 and B = 32 for x < 0, we need:
+  // 2^(w + z) ≥ ceil(|x| / B) ⇒ z ≥ ceil_log2(ceil(|x| / B)) - w (LowerBound).
+ unsigned B = ConstMultiplier < 0 ? 32 : 31;
+ unsigned CeilAxOverB = (AbsConstValue + (B - 1)) / B; // ceil(|x|/B)
+ int LowerBound = llvm::Log2_32_Ceil(CeilAxOverB) - OperandShift;
+
+ // No valid solution found.
+ if (LowerBound > UpperBound)
+ return SDValue();
+
+ // Any value of z in [LowerBound, UpperBound] is valid. Prefer no extra
+ // shift if possible.
+ int Shift = std::min(std::max(/*prefer*/ 0, LowerBound), UpperBound);
+
+ // y = x / 2^(w + z)
+ int32_t RdsvlMul = (AbsConstValue >> (OperandShift + Shift)) *
+ (ConstMultiplier < 0 ? -1 : 1);
+ auto Rdsvl = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
+ DAG.getSignedConstant(RdsvlMul, DL, MVT::i32));
+
+ if (Shift == 0)
+ return Rdsvl;
+ return DAG.getNode(Shift < 0 ? ISD::SRL : ISD::SHL, DL, VT, Rdsvl,
+ DAG.getConstant(abs(Shift), DL, MVT::i32),
+ SDNodeFlags::Exact);
+}
+
// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
// Same for other types with equivalent constants.
static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) {
@@ -19568,6 +19623,9 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
if (!isa<ConstantSDNode>(N1))
return SDValue();
+ if (SDValue Ext = performMulRdsvlCombine(N, DAG))
+ return Ext;
+
ConstantSDNode *C = cast<ConstantSDNode>(N1);
const APInt &ConstValue = C->getAPIntValue();
@@ -19579,47 +19637,6 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
if (ConstValue.sge(1) && ConstValue.sle(16))
return SDValue();
- // Multiplying an RDSVL value by a constant can sometimes be done cheaper by
- // folding a power-of-two factor of the constant into the RDSVL immediate and
- // compensating with an extra shift.
- //
- // We rewrite:
- // (mul (srl (rdsvl 1), 3), x)
- // to one of:
- // (shl (rdsvl y), z) if z > 0
- // (srl (rdsvl y), abs(z)) if z < 0
- // where integers y, z satisfy x = y * 2^(3 + z) and y ∈ [-32, 31].
- if ((N0->getOpcode() == ISD::SRL) &&
- (N0->getOperand(0).getOpcode() == AArch64ISD::RDSVL)) {
- unsigned AbsConstValue = ConstValue.abs().getZExtValue();
-
- // z ≤ ctz(|x|) - 3 (largest extra shift we can take while keeping y
- // integral)
- int UpperBound = llvm::countr_zero(AbsConstValue) - 3;
-
- // To keep y in range, with B = 31 for x > 0 and B = 32 for x < 0, we need:
- // 2^(3 + z) ≥ ceil(x / B) ⇒ z ≥ ceil_log2(ceil(x / B)) - 3 (LowerBound).
- unsigned B = ConstValue.isNegative() ? 32 : 31;
- unsigned CeilAxOverB = (AbsConstValue + (B - 1)) / B; // ceil(|x|/B)
- int LowerBound = llvm::Log2_32_Ceil(CeilAxOverB) - 3;
-
- // If solution exists, apply optimization.
- if (LowerBound <= UpperBound) {
-
- int Shift = std::min(std::max(/*prefer*/ 0, LowerBound), UpperBound);
- int32_t RdsvlMul =
- (AbsConstValue >> (3 + Shift)) * (ConstValue.isNegative() ? -1 : 1);
- auto Rdsvl = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
- DAG.getSignedConstant(RdsvlMul, DL, MVT::i32));
-
- if (Shift == 0)
- return Rdsvl;
- return DAG.getNode(Shift < 0 ? ISD::SRL : ISD::SHL, DL, VT, Rdsvl,
- DAG.getConstant(abs(Shift), DL, MVT::i32),
- SDNodeFlags::Exact);
- }
- }
-
// Multiplication of a power of two plus/minus one can be done more
// cheaply as shift+add/sub. For now, this is true unilaterally. If
// future CPUs have a cheaper MADD instruction, this may need to be
More information about the llvm-commits
mailing list