[llvm] [ARM] Replace manual CLS expansion with ISD::CTLS (PR #178430)
Hamza Hassanain via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 10 10:04:06 PST 2026
https://github.com/HamzaHassanain updated https://github.com/llvm/llvm-project/pull/178430
>From 2ee02553aadd98ab03ab6098ace102fad9006927 Mon Sep 17 00:00:00 2001
From: Hamza Hassanain <hamzahassanain067 at gmail.com>
Date: Wed, 28 Jan 2026 15:55:45 +0200
Subject: [PATCH 01/18] [ARM] Replace manual CLS expansion with ISD::CTLS
Converts ARM scalar CLS intrinsics to use the unified ISD::CTLS node
instead of custom manual expansion. This addresses issue #174337.
Changes:
- int_arm_cls and int_arm_cls64 now return ISD::CTLS nodes
- Added LowerCTLS to handle custom lowering for i32 and i64 CTLS operations
- i32 CTLS expansion generates the same CTLZ-based pattern as before
- i64 CTLS on 32-bit ARM splits into two 32-bit operations
- Added proper setOperationAction for CTLS operations
The assembly output remains identical to the previous manual expansion,
ensuring compatibility with existing code. This unifies the CLS handling
under the generic ISD::CTLS infrastructure for better maintainability.
---
llvm/lib/Target/ARM/ARMISelLowering.cpp | 95 ++++++++++++++++---------
llvm/lib/Target/ARM/ARMISelLowering.h | 2 +
llvm/test/CodeGen/ARM/mve-cls.ll | 45 ++++++++++++
llvm/test/CodeGen/ARM/neon-cls.ll | 81 +++++++++++++++++++++
4 files changed, 191 insertions(+), 32 deletions(-)
create mode 100644 llvm/test/CodeGen/ARM/mve-cls.ll
create mode 100644 llvm/test/CodeGen/ARM/neon-cls.ll
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 7b240462c66fb..aee3c50b4a476 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1004,6 +1004,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
setOperationAction(ISD::ROTR, VT, Expand);
}
setOperationAction(ISD::CTTZ, MVT::i32, Custom);
+ // CTLS (Count Leading Sign bits)
+ setOperationAction(ISD::CTLS, MVT::i32, Custom);
+ setOperationAction(ISD::CTLS, MVT::i64, Custom);
// TODO: These two should be set to LibCall, but this currently breaks
// the Linux kernel build. See #101786.
setOperationAction(ISD::CTPOP, MVT::i32, Expand);
@@ -3838,42 +3841,12 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
case Intrinsic::arm_cls: {
const SDValue &Operand = Op.getOperand(1);
const EVT VTy = Op.getValueType();
- SDValue SRA =
- DAG.getNode(ISD::SRA, dl, VTy, Operand, DAG.getConstant(31, dl, VTy));
- SDValue XOR = DAG.getNode(ISD::XOR, dl, VTy, SRA, Operand);
- SDValue SHL =
- DAG.getNode(ISD::SHL, dl, VTy, XOR, DAG.getConstant(1, dl, VTy));
- SDValue OR =
- DAG.getNode(ISD::OR, dl, VTy, SHL, DAG.getConstant(1, dl, VTy));
- SDValue Result = DAG.getNode(ISD::CTLZ, dl, VTy, OR);
- return Result;
+ return DAG.getNode(ISD::CTLS, dl, VTy, Operand);
}
case Intrinsic::arm_cls64: {
- // cls(x) = if cls(hi(x)) != 31 then cls(hi(x))
- // else 31 + clz(if hi(x) == 0 then lo(x) else not(lo(x)))
const SDValue &Operand = Op.getOperand(1);
const EVT VTy = Op.getValueType();
- SDValue Lo, Hi;
- std::tie(Lo, Hi) = DAG.SplitScalar(Operand, dl, VTy, VTy);
- SDValue Constant0 = DAG.getConstant(0, dl, VTy);
- SDValue Constant1 = DAG.getConstant(1, dl, VTy);
- SDValue Constant31 = DAG.getConstant(31, dl, VTy);
- SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VTy, Hi, Constant31);
- SDValue XORHi = DAG.getNode(ISD::XOR, dl, VTy, SRAHi, Hi);
- SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VTy, XORHi, Constant1);
- SDValue ORHi = DAG.getNode(ISD::OR, dl, VTy, SHLHi, Constant1);
- SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VTy, ORHi);
- SDValue CheckLo =
- DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::CondCode::SETEQ);
- SDValue HiIsZero =
- DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::CondCode::SETEQ);
- SDValue AdjustedLo =
- DAG.getSelect(dl, VTy, HiIsZero, Lo, DAG.getNOT(dl, Lo, VTy));
- SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VTy, AdjustedLo);
- SDValue Result =
- DAG.getSelect(dl, VTy, CheckLo,
- DAG.getNode(ISD::ADD, dl, VTy, CLZAdjustedLo, Constant31), CLSHi);
- return Result;
+ return DAG.getNode(ISD::CTLS, dl, VTy, Operand);
}
case Intrinsic::eh_sjlj_lsda: {
MachineFunction &MF = DAG.getMachineFunction();
@@ -6309,6 +6282,63 @@ static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
return Res;
}
+static SDValue LowerCTLS(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+ SDValue Operand = N->getOperand(0);
+
+ if (VT == MVT::i32) {
+ // ARM32 scalar CLS: CTLS(x) = CTLZ(OR(SHL(XOR(x, SRA(x, 31)), 1), 1))
+ SDValue SRA =
+ DAG.getNode(ISD::SRA, dl, VT, Operand, DAG.getConstant(31, dl, VT));
+ SDValue XOR = DAG.getNode(ISD::XOR, dl, VT, SRA, Operand);
+ SDValue SHL =
+ DAG.getNode(ISD::SHL, dl, VT, XOR, DAG.getConstant(1, dl, VT));
+ SDValue OR =
+ DAG.getNode(ISD::OR, dl, VT, SHL, DAG.getConstant(1, dl, VT));
+ SDValue Result = DAG.getNode(ISD::CTLZ, dl, VT, OR);
+ return Result;
+ }
+
+ if (VT == MVT::i64) {
+ // For 64-bit on 32-bit ARM, we need to split into two 32-bit operations
+ EVT VT32 = MVT::i32;
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitScalar(Operand, dl, VT32, VT32);
+
+ SDValue Constant0 = DAG.getConstant(0, dl, VT32);
+ SDValue Constant1 = DAG.getConstant(1, dl, VT32);
+ SDValue Constant31 = DAG.getConstant(31, dl, VT32);
+
+ // Compute CTLS of high part
+ SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VT32, Hi, Constant31);
+ SDValue XORHi = DAG.getNode(ISD::XOR, dl, VT32, SRAHi, Hi);
+ SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VT32, XORHi, Constant1);
+ SDValue ORHi = DAG.getNode(ISD::OR, dl, VT32, SHLHi, Constant1);
+ SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VT32, ORHi);
+
+ // Check if CLSHi == 31 (all high bits are sign bits)
+ SDValue IsAllSignBits =
+ DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::SETEQ);
+
+ // If all high bits are sign bits, compute for low part
+ SDValue HiIsZero =
+ DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::SETEQ);
+ SDValue AdjustedLo =
+ DAG.getSelect(dl, VT32, HiIsZero, Lo, DAG.getNOT(dl, Lo, VT32));
+ SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VT32, AdjustedLo);
+ SDValue Result =
+ DAG.getSelect(dl, VT32, IsAllSignBits,
+ DAG.getNode(ISD::ADD, dl, VT32, CLZAdjustedLo, Constant31), CLSHi);
+
+ return Result;
+ }
+
+ // Vector types should be handled elsewhere
+ return SDValue();
+}
+
/// Getvshiftimm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift operation, where all the elements of the
/// build_vector must have the same constant integer value.
@@ -10352,6 +10382,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
+ case ISD::CTLS: return LowerCTLS(Op.getNode(), DAG, Subtarget);
case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget);
case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index bc84654f8bd5a..b6dc9851b1d56 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -608,6 +608,8 @@ class VectorType;
SDValue LowerFP_TO_BF16(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCMP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerABS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerCTLS(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *ST) const;
Register getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const override;
diff --git a/llvm/test/CodeGen/ARM/mve-cls.ll b/llvm/test/CodeGen/ARM/mve-cls.ll
new file mode 100644
index 0000000000000..cbf708f637992
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/mve-cls.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=arm-none-eabi -mcpu=cortex-m85 -mattr=+mve %s -o - | FileCheck %s
+
+define <16 x i8> @test_cls_v16s8(<16 x i8> %a) nounwind {
+; CHECK-LABEL: test_cls_v16s8:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d1, r2, r3
+; CHECK-NEXT: vmov d0, r0, r1
+; CHECK-NEXT: vcls.s8 q0, q0
+; CHECK-NEXT: vmov r0, r1, d0
+; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: bx lr
+ %result = call <16 x i8> @llvm.arm.mve.vcls.v16s8(<16 x i8> %a)
+ ret <16 x i8> %result
+}
+
+define <8 x i16> @test_cls_v8s16(<8 x i16> %a) nounwind {
+; CHECK-LABEL: test_cls_v8s16:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d1, r2, r3
+; CHECK-NEXT: vmov d0, r0, r1
+; CHECK-NEXT: vcls.s16 q0, q0
+; CHECK-NEXT: vmov r0, r1, d0
+; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: bx lr
+ %result = call <8 x i16> @llvm.arm.mve.vcls.v8s16(<8 x i16> %a)
+ ret <8 x i16> %result
+}
+
+define <4 x i32> @test_cls_v4s32(<4 x i32> %a) nounwind {
+; CHECK-LABEL: test_cls_v4s32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d1, r2, r3
+; CHECK-NEXT: vmov d0, r0, r1
+; CHECK-NEXT: vcls.s32 q0, q0
+; CHECK-NEXT: vmov r0, r1, d0
+; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: bx lr
+ %result = call <4 x i32> @llvm.arm.mve.vcls.v4s32(<4 x i32> %a)
+ ret <4 x i32> %result
+}
+
+declare <16 x i8> @llvm.arm.mve.vcls.v16s8(<16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.mve.vcls.v8s16(<8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.mve.vcls.v4s32(<4 x i32>) nounwind readnone
diff --git a/llvm/test/CodeGen/ARM/neon-cls.ll b/llvm/test/CodeGen/ARM/neon-cls.ll
new file mode 100644
index 0000000000000..4667b87a6f7bd
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/neon-cls.ll
@@ -0,0 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
+
+define <8 x i8> @test_cls_v8i8(<8 x i8> %a) nounwind {
+; CHECK-LABEL: test_cls_v8i8:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d16, r0, r1
+; CHECK-NEXT: vcls.s8 d16, d16
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: mov pc, lr
+ %result = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %a)
+ ret <8 x i8> %result
+}
+
+define <16 x i8> @test_cls_v16i8(<16 x i8> %a) nounwind {
+; CHECK-LABEL: test_cls_v16i8:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d17, r2, r3
+; CHECK-NEXT: vmov d16, r0, r1
+; CHECK-NEXT: vcls.s8 q8, q8
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: mov pc, lr
+ %result = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %a)
+ ret <16 x i8> %result
+}
+
+define <4 x i16> @test_cls_v4i16(<4 x i16> %a) nounwind {
+; CHECK-LABEL: test_cls_v4i16:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d16, r0, r1
+; CHECK-NEXT: vcls.s16 d16, d16
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: mov pc, lr
+ %result = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %a)
+ ret <4 x i16> %result
+}
+
+define <8 x i16> @test_cls_v8i16(<8 x i16> %a) nounwind {
+; CHECK-LABEL: test_cls_v8i16:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d17, r2, r3
+; CHECK-NEXT: vmov d16, r0, r1
+; CHECK-NEXT: vcls.s16 q8, q8
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: mov pc, lr
+ %result = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %a)
+ ret <8 x i16> %result
+}
+
+define <2 x i32> @test_cls_v2i32(<2 x i32> %a) nounwind {
+; CHECK-LABEL: test_cls_v2i32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d16, r0, r1
+; CHECK-NEXT: vcls.s32 d16, d16
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: mov pc, lr
+ %result = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %a)
+ ret <2 x i32> %result
+}
+
+define <4 x i32> @test_cls_v4i32(<4 x i32> %a) nounwind {
+; CHECK-LABEL: test_cls_v4i32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d17, r2, r3
+; CHECK-NEXT: vmov d16, r0, r1
+; CHECK-NEXT: vcls.s32 q8, q8
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: mov pc, lr
+ %result = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %a)
+ ret <4 x i32> %result
+}
+
+declare <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32>) nounwind readnone
>From d9b2584136bbe60cb2fc3566485a13301fa94618 Mon Sep 17 00:00:00 2001
From: Hamza Hassanain <hamzahassanain067 at gmail.com>
Date: Wed, 28 Jan 2026 16:24:46 +0200
Subject: [PATCH 02/18] style: fix foramt errors
---
llvm/lib/Target/ARM/ARMISelLowering.cpp | 18 ++++++++----------
1 file changed, 8 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index aee3c50b4a476..c8356b386a68d 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -6282,8 +6282,7 @@ static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
return Res;
}
-static SDValue LowerCTLS(SDNode *N, SelectionDAG &DAG,
- const ARMSubtarget *ST) {
+static SDValue LowerCTLS(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
SDValue Operand = N->getOperand(0);
@@ -6295,8 +6294,7 @@ static SDValue LowerCTLS(SDNode *N, SelectionDAG &DAG,
SDValue XOR = DAG.getNode(ISD::XOR, dl, VT, SRA, Operand);
SDValue SHL =
DAG.getNode(ISD::SHL, dl, VT, XOR, DAG.getConstant(1, dl, VT));
- SDValue OR =
- DAG.getNode(ISD::OR, dl, VT, SHL, DAG.getConstant(1, dl, VT));
+ SDValue OR = DAG.getNode(ISD::OR, dl, VT, SHL, DAG.getConstant(1, dl, VT));
SDValue Result = DAG.getNode(ISD::CTLZ, dl, VT, OR);
return Result;
}
@@ -6323,14 +6321,13 @@ static SDValue LowerCTLS(SDNode *N, SelectionDAG &DAG,
DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::SETEQ);
// If all high bits are sign bits, compute for low part
- SDValue HiIsZero =
- DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::SETEQ);
+ SDValue HiIsZero = DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::SETEQ);
SDValue AdjustedLo =
DAG.getSelect(dl, VT32, HiIsZero, Lo, DAG.getNOT(dl, Lo, VT32));
SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VT32, AdjustedLo);
- SDValue Result =
- DAG.getSelect(dl, VT32, IsAllSignBits,
- DAG.getNode(ISD::ADD, dl, VT32, CLZAdjustedLo, Constant31), CLSHi);
+ SDValue Result = DAG.getSelect(
+ dl, VT32, IsAllSignBits,
+ DAG.getNode(ISD::ADD, dl, VT32, CLZAdjustedLo, Constant31), CLSHi);
return Result;
}
@@ -10382,7 +10379,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
- case ISD::CTLS: return LowerCTLS(Op.getNode(), DAG, Subtarget);
+ case ISD::CTLS:
+ return LowerCTLS(Op.getNode(), DAG, Subtarget);
case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget);
case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
>From 6d74c61b599ecb9a2fa9949ae169f980652008ae Mon Sep 17 00:00:00 2001
From: Hamza Hassanain <hamzahassanain067 at gmail.com>
Date: Thu, 29 Jan 2026 14:47:37 +0200
Subject: [PATCH 03/18] Fix: Change LowerCTLS from static to member function
The LowerCTLS function was declared as a member function in the header
but defined as a static file-scope function in the cpp file. This caused
a compilation error (unused function warning treated as error) because
the compiler couldn't resolve the function call properly.
Changed the definition to match the header declaration as a const member
function of ARMTargetLowering class.
---
llvm/lib/Target/ARM/ARMISelLowering.cpp | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index c8356b386a68d..794a8973c25b7 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -6282,7 +6282,8 @@ static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
return Res;
}
-static SDValue LowerCTLS(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) {
+SDValue ARMTargetLowering::LowerCTLS(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *ST) const {
SDLoc dl(N);
EVT VT = N->getValueType(0);
SDValue Operand = N->getOperand(0);
@@ -10379,8 +10380,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
- case ISD::CTLS:
- return LowerCTLS(Op.getNode(), DAG, Subtarget);
+ case ISD::CTLS: return LowerCTLS(Op.getNode(), DAG, Subtarget);
case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget);
case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
>From 37255bb3103918c54ce5d6ddcb27429ca2681da8 Mon Sep 17 00:00:00 2001
From: Hamza Hassanain <hamzahassanain067 at gmail.com>
Date: Thu, 29 Jan 2026 15:07:04 +0200
Subject: [PATCH 04/18] Remove vector CLS tests - vector implementation is in a
separate commit
The vector CLS intrinsics conversion tests should only be included after
the vector support has been fully implemented with proper ISD::CTLS Legal
actions for NEON and MVE types. Those changes are in commit e09ae3fa5fa2.
---
llvm/test/CodeGen/ARM/mve-cls.ll | 45 -----------------
llvm/test/CodeGen/ARM/neon-cls.ll | 81 -------------------------------
2 files changed, 126 deletions(-)
delete mode 100644 llvm/test/CodeGen/ARM/mve-cls.ll
delete mode 100644 llvm/test/CodeGen/ARM/neon-cls.ll
diff --git a/llvm/test/CodeGen/ARM/mve-cls.ll b/llvm/test/CodeGen/ARM/mve-cls.ll
deleted file mode 100644
index cbf708f637992..0000000000000
--- a/llvm/test/CodeGen/ARM/mve-cls.ll
+++ /dev/null
@@ -1,45 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=arm-none-eabi -mcpu=cortex-m85 -mattr=+mve %s -o - | FileCheck %s
-
-define <16 x i8> @test_cls_v16s8(<16 x i8> %a) nounwind {
-; CHECK-LABEL: test_cls_v16s8:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d1, r2, r3
-; CHECK-NEXT: vmov d0, r0, r1
-; CHECK-NEXT: vcls.s8 q0, q0
-; CHECK-NEXT: vmov r0, r1, d0
-; CHECK-NEXT: vmov r2, r3, d1
-; CHECK-NEXT: bx lr
- %result = call <16 x i8> @llvm.arm.mve.vcls.v16s8(<16 x i8> %a)
- ret <16 x i8> %result
-}
-
-define <8 x i16> @test_cls_v8s16(<8 x i16> %a) nounwind {
-; CHECK-LABEL: test_cls_v8s16:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d1, r2, r3
-; CHECK-NEXT: vmov d0, r0, r1
-; CHECK-NEXT: vcls.s16 q0, q0
-; CHECK-NEXT: vmov r0, r1, d0
-; CHECK-NEXT: vmov r2, r3, d1
-; CHECK-NEXT: bx lr
- %result = call <8 x i16> @llvm.arm.mve.vcls.v8s16(<8 x i16> %a)
- ret <8 x i16> %result
-}
-
-define <4 x i32> @test_cls_v4s32(<4 x i32> %a) nounwind {
-; CHECK-LABEL: test_cls_v4s32:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d1, r2, r3
-; CHECK-NEXT: vmov d0, r0, r1
-; CHECK-NEXT: vcls.s32 q0, q0
-; CHECK-NEXT: vmov r0, r1, d0
-; CHECK-NEXT: vmov r2, r3, d1
-; CHECK-NEXT: bx lr
- %result = call <4 x i32> @llvm.arm.mve.vcls.v4s32(<4 x i32> %a)
- ret <4 x i32> %result
-}
-
-declare <16 x i8> @llvm.arm.mve.vcls.v16s8(<16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm.mve.vcls.v8s16(<8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm.mve.vcls.v4s32(<4 x i32>) nounwind readnone
diff --git a/llvm/test/CodeGen/ARM/neon-cls.ll b/llvm/test/CodeGen/ARM/neon-cls.ll
deleted file mode 100644
index 4667b87a6f7bd..0000000000000
--- a/llvm/test/CodeGen/ARM/neon-cls.ll
+++ /dev/null
@@ -1,81 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
-
-define <8 x i8> @test_cls_v8i8(<8 x i8> %a) nounwind {
-; CHECK-LABEL: test_cls_v8i8:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d16, r0, r1
-; CHECK-NEXT: vcls.s8 d16, d16
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: mov pc, lr
- %result = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %a)
- ret <8 x i8> %result
-}
-
-define <16 x i8> @test_cls_v16i8(<16 x i8> %a) nounwind {
-; CHECK-LABEL: test_cls_v16i8:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d17, r2, r3
-; CHECK-NEXT: vmov d16, r0, r1
-; CHECK-NEXT: vcls.s8 q8, q8
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
-; CHECK-NEXT: mov pc, lr
- %result = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %a)
- ret <16 x i8> %result
-}
-
-define <4 x i16> @test_cls_v4i16(<4 x i16> %a) nounwind {
-; CHECK-LABEL: test_cls_v4i16:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d16, r0, r1
-; CHECK-NEXT: vcls.s16 d16, d16
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: mov pc, lr
- %result = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %a)
- ret <4 x i16> %result
-}
-
-define <8 x i16> @test_cls_v8i16(<8 x i16> %a) nounwind {
-; CHECK-LABEL: test_cls_v8i16:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d17, r2, r3
-; CHECK-NEXT: vmov d16, r0, r1
-; CHECK-NEXT: vcls.s16 q8, q8
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
-; CHECK-NEXT: mov pc, lr
- %result = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %a)
- ret <8 x i16> %result
-}
-
-define <2 x i32> @test_cls_v2i32(<2 x i32> %a) nounwind {
-; CHECK-LABEL: test_cls_v2i32:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d16, r0, r1
-; CHECK-NEXT: vcls.s32 d16, d16
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: mov pc, lr
- %result = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %a)
- ret <2 x i32> %result
-}
-
-define <4 x i32> @test_cls_v4i32(<4 x i32> %a) nounwind {
-; CHECK-LABEL: test_cls_v4i32:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d17, r2, r3
-; CHECK-NEXT: vmov d16, r0, r1
-; CHECK-NEXT: vcls.s32 q8, q8
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
-; CHECK-NEXT: mov pc, lr
- %result = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %a)
- ret <4 x i32> %result
-}
-
-declare <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16>) nounwind readnone
-declare <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32>) nounwind readnone
>From f59383cd5138b778e5bbb4394972e6a2e311253a Mon Sep 17 00:00:00 2001
From: Hamza Hassanain <hamzahassanain067 at gmail.com>
Date: Thu, 29 Jan 2026 19:36:45 +0200
Subject: [PATCH 05/18] [ARM] Convert CLS intrinsics to use ISD::CTLS
This patch converts ARM CLS intrinsics (arm_cls, arm_cls64, arm_neon_vcls,
arm_mve_vcls) to use the generic ISD::CTLS node.
- arm_cls: Expanded directly in LowerINTRINSIC_WO_CHAIN (no native scalar CLS)
- arm_cls64: Uses ISD::CTLS with TRUNCATE, relying on ExpandIntRes_CTLS
- arm_neon_vcls: Lowered to ISD::CTLS, pattern-matched to VCLS instruction
- arm_mve_vcls: Lowered to ISD::CTLS, pattern-matched to MVE VCLS instruction
Also adds generic CTLS expansion support:
- ExpandIntRes_CTLS in LegalizeIntegerTypes for i64->i32 type expansion
- expandCTLS in TargetLowering for targets without native CLS instruction
Part of: https://github.com/llvm/llvm-project/issues/174337
---
llvm/include/llvm/CodeGen/TargetLowering.h | 6 +
llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 4 +
.../SelectionDAG/LegalizeIntegerTypes.cpp | 29 ++++
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 1 +
.../CodeGen/SelectionDAG/TargetLowering.cpp | 18 +++
llvm/lib/Target/ARM/ARMISelLowering.cpp | 89 ++++-------
llvm/lib/Target/ARM/ARMISelLowering.h | 2 -
llvm/lib/Target/ARM/ARMInstrMVE.td | 6 +-
llvm/lib/Target/ARM/ARMInstrNEON.td | 2 +-
llvm/test/CodeGen/ARM/cls.ll | 145 ++----------------
llvm/test/CodeGen/ARM/mve-cls.ll | 48 ++++++
llvm/test/CodeGen/ARM/neon-cls.ll | 84 ++++++++++
12 files changed, 238 insertions(+), 196 deletions(-)
create mode 100644 llvm/test/CodeGen/ARM/mve-cls.ll
create mode 100644 llvm/test/CodeGen/ARM/neon-cls.ll
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 442225bdec01f..d6c615580256b 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5587,6 +5587,12 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase {
/// \returns The expansion result or SDValue() if it fails.
SDValue expandCTLZ(SDNode *N, SelectionDAG &DAG) const;
+ /// Expand CTLS (count leading sign bits) nodes.
+ /// CTLS(x) = CTLZ(OR(SHL(XOR(x, SRA(x, BW-1)), 1), 1))
+ /// \param N Node to expand
+ /// \returns The expansion result or SDValue() if it fails.
+ SDValue expandCTLS(SDNode *N, SelectionDAG &DAG) const;
+
/// Expand VP_CTLZ/VP_CTLZ_ZERO_UNDEF nodes.
/// \param N Node to expand
/// \returns The expansion result or SDValue() if it fails.
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index d9a2409b35e4c..df3ee52b42638 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -3218,6 +3218,10 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
if ((Tmp1 = TLI.expandCTLZ(Node, DAG)))
Results.push_back(Tmp1);
break;
+ case ISD::CTLS:
+ if ((Tmp1 = TLI.expandCTLS(Node, DAG)))
+ Results.push_back(Tmp1);
+ break;
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF:
if ((Tmp1 = TLI.expandCTTZ(Node, DAG)))
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 5b32c5f945a75..cd675c6cba786 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -3079,6 +3079,9 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::ABDU: ExpandIntRes_ABD(N, Lo, Hi); break;
case ISD::CTLZ_ZERO_UNDEF:
case ISD::CTLZ: ExpandIntRes_CTLZ(N, Lo, Hi); break;
+ case ISD::CTLS:
+ ExpandIntRes_CTLS(N, Lo, Hi);
+ break;
case ISD::CTPOP: ExpandIntRes_CTPOP(N, Lo, Hi); break;
case ISD::CTTZ_ZERO_UNDEF:
case ISD::CTTZ: ExpandIntRes_CTTZ(N, Lo, Hi); break;
@@ -4163,6 +4166,32 @@ void DAGTypeLegalizer::ExpandIntRes_CTLZ(SDNode *N,
Hi = DAG.getConstant(0, dl, NVT);
}
+void DAGTypeLegalizer::ExpandIntRes_CTLS(SDNode *N, SDValue &Lo, SDValue &Hi) {
+ SDLoc dl(N);
+ // ctls(HiLo) -> if (IsAllSignBits = (ctls(Hi) == BW-1)) then
+ // BW-1 + clz(IsNegative = (Hi < 0) ? ~Lo : Lo)
+ // else ctls(Hi)
+ GetExpandedInteger(N->getOperand(0), Lo, Hi);
+ EVT NVT = Lo.getValueType();
+ unsigned NVTBits = NVT.getSizeInBits();
+
+ SDValue Constant0 = DAG.getConstant(0, dl, NVT);
+ SDValue ConstantBWM1 = DAG.getConstant(NVTBits - 1, dl, NVT);
+
+ SDValue HiCTLS = DAG.getNode(ISD::CTLS, dl, NVT, Hi);
+ SDValue IsAllSignBits = DAG.getSetCC(dl, getSetCCResultType(NVT), HiCTLS,
+ ConstantBWM1, ISD::SETEQ);
+ SDValue IsNegative =
+ DAG.getSetCC(dl, getSetCCResultType(NVT), Hi, Constant0, ISD::SETLT);
+ SDValue AdjustedLo =
+ DAG.getSelect(dl, NVT, IsNegative, DAG.getNOT(dl, Lo, NVT), Lo);
+ SDValue LoCLZ = DAG.getNode(ISD::CTLZ, dl, NVT, AdjustedLo);
+ Lo = DAG.getSelect(dl, NVT, IsAllSignBits,
+ DAG.getNode(ISD::ADD, dl, NVT, LoCLZ, ConstantBWM1),
+ HiCTLS);
+ Hi = DAG.getConstant(0, dl, NVT);
+}
+
void DAGTypeLegalizer::ExpandIntRes_ABD(SDNode *N, SDValue &Lo, SDValue &Hi) {
SDValue Result = TLI.expandABD(N, DAG);
SplitInteger(Result, Lo, Hi);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index f10b6dfa902ec..dab983d8081ae 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -467,6 +467,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
void ExpandIntRes_ABS (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_ABD (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_CTLZ (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_CTLS(SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_CTPOP (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_CTTZ (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_LOAD (LoadSDNode *N, SDValue &Lo, SDValue &Hi);
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 1ee7085c67179..9416981060861 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -9578,6 +9578,24 @@ SDValue TargetLowering::expandCTLZ(SDNode *Node, SelectionDAG &DAG) const {
return DAG.getNode(ISD::CTPOP, dl, VT, Op);
}
+SDValue TargetLowering::expandCTLS(SDNode *Node, SelectionDAG &DAG) const {
+ SDLoc dl(Node);
+ EVT VT = Node->getValueType(0);
+ EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
+ SDValue Op = Node->getOperand(0);
+ unsigned NumBitsPerElt = VT.getScalarSizeInBits();
+
+ // CTLS(x) = CTLZ(OR(SHL(XOR(x, SRA(x, BW-1)), 1), 1))
+ // This transforms the sign bits into leading zeros that can be counted.
+ SDValue ShiftAmt = DAG.getConstant(NumBitsPerElt - 1, dl, ShVT);
+ SDValue One = DAG.getConstant(1, dl, VT);
+ SDValue SignBit = DAG.getNode(ISD::SRA, dl, VT, Op, ShiftAmt);
+ SDValue Xor = DAG.getNode(ISD::XOR, dl, VT, Op, SignBit);
+ SDValue Shl = DAG.getNode(ISD::SHL, dl, VT, Xor, One);
+ SDValue Or = DAG.getNode(ISD::OR, dl, VT, Shl, One);
+ return DAG.getNode(ISD::CTLZ, dl, VT, Or);
+}
+
SDValue TargetLowering::expandVPCTLZ(SDNode *Node, SelectionDAG &DAG) const {
SDLoc dl(Node);
EVT VT = Node->getValueType(0);
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 794a8973c25b7..7cf02c57ebde8 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -222,7 +222,7 @@ void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
for (auto Opcode : {ISD::ABS, ISD::ABDS, ISD::ABDU, ISD::SMIN, ISD::SMAX,
- ISD::UMIN, ISD::UMAX})
+ ISD::UMIN, ISD::UMAX, ISD::CTLS})
setOperationAction(Opcode, VT, Legal);
if (!VT.isFloatingPoint())
for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
@@ -276,6 +276,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
setOperationAction(ISD::UMIN, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
setOperationAction(ISD::ABS, VT, Legal);
+ setOperationAction(ISD::CTLS, VT, Legal);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::MLOAD, VT, Custom);
setOperationAction(ISD::MSTORE, VT, Legal);
@@ -1004,9 +1005,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
setOperationAction(ISD::ROTR, VT, Expand);
}
setOperationAction(ISD::CTTZ, MVT::i32, Custom);
- // CTLS (Count Leading Sign bits)
- setOperationAction(ISD::CTLS, MVT::i32, Custom);
- setOperationAction(ISD::CTLS, MVT::i64, Custom);
+ // Note: arm_cls and arm_cls64 intrinsics are expanded directly in
+ // LowerINTRINSIC_WO_CHAIN since there's no native scalar CLS instruction.
+ // Vector CTLS is Legal when NEON/MVE is available (set elsewhere).
// TODO: These two should be set to LibCall, but this currently breaks
// the Linux kernel build. See #101786.
setOperationAction(ISD::CTPOP, MVT::i32, Expand);
@@ -3839,11 +3840,30 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
}
case Intrinsic::arm_cls: {
+ // ARM32 scalar CLS: CTLS(x) = CTLZ(OR(SHL(XOR(x, SRA(x, 31)), 1), 1))
+ // We expand directly here instead of using ISD::CTLS since there's no
+ // native scalar CLS instruction on ARM.
const SDValue &Operand = Op.getOperand(1);
const EVT VTy = Op.getValueType();
- return DAG.getNode(ISD::CTLS, dl, VTy, Operand);
+ SDValue SRA =
+ DAG.getNode(ISD::SRA, dl, VTy, Operand, DAG.getConstant(31, dl, VTy));
+ SDValue XOR = DAG.getNode(ISD::XOR, dl, VTy, SRA, Operand);
+ SDValue SHL =
+ DAG.getNode(ISD::SHL, dl, VTy, XOR, DAG.getConstant(1, dl, VTy));
+ SDValue OR =
+ DAG.getNode(ISD::OR, dl, VTy, SHL, DAG.getConstant(1, dl, VTy));
+ return DAG.getNode(ISD::CTLZ, dl, VTy, OR);
}
case Intrinsic::arm_cls64: {
+ // arm_cls64 returns i32 but takes i64 input.
+ // Use ISD::CTLS for i64 and truncate the result.
+ const SDValue &Operand = Op.getOperand(1);
+ SDValue CTLS64 = DAG.getNode(ISD::CTLS, dl, MVT::i64, Operand);
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, CTLS64);
+ }
+ case Intrinsic::arm_neon_vcls:
+ case Intrinsic::arm_mve_vcls: {
+ // Lower vector CLS intrinsics to ISD::CTLS
const SDValue &Operand = Op.getOperand(1);
const EVT VTy = Op.getValueType();
return DAG.getNode(ISD::CTLS, dl, VTy, Operand);
@@ -6282,61 +6302,6 @@ static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
return Res;
}
-SDValue ARMTargetLowering::LowerCTLS(SDNode *N, SelectionDAG &DAG,
- const ARMSubtarget *ST) const {
- SDLoc dl(N);
- EVT VT = N->getValueType(0);
- SDValue Operand = N->getOperand(0);
-
- if (VT == MVT::i32) {
- // ARM32 scalar CLS: CTLS(x) = CTLZ(OR(SHL(XOR(x, SRA(x, 31)), 1), 1))
- SDValue SRA =
- DAG.getNode(ISD::SRA, dl, VT, Operand, DAG.getConstant(31, dl, VT));
- SDValue XOR = DAG.getNode(ISD::XOR, dl, VT, SRA, Operand);
- SDValue SHL =
- DAG.getNode(ISD::SHL, dl, VT, XOR, DAG.getConstant(1, dl, VT));
- SDValue OR = DAG.getNode(ISD::OR, dl, VT, SHL, DAG.getConstant(1, dl, VT));
- SDValue Result = DAG.getNode(ISD::CTLZ, dl, VT, OR);
- return Result;
- }
-
- if (VT == MVT::i64) {
- // For 64-bit on 32-bit ARM, we need to split into two 32-bit operations
- EVT VT32 = MVT::i32;
- SDValue Lo, Hi;
- std::tie(Lo, Hi) = DAG.SplitScalar(Operand, dl, VT32, VT32);
-
- SDValue Constant0 = DAG.getConstant(0, dl, VT32);
- SDValue Constant1 = DAG.getConstant(1, dl, VT32);
- SDValue Constant31 = DAG.getConstant(31, dl, VT32);
-
- // Compute CTLS of high part
- SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VT32, Hi, Constant31);
- SDValue XORHi = DAG.getNode(ISD::XOR, dl, VT32, SRAHi, Hi);
- SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VT32, XORHi, Constant1);
- SDValue ORHi = DAG.getNode(ISD::OR, dl, VT32, SHLHi, Constant1);
- SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VT32, ORHi);
-
- // Check if CLSHi == 31 (all high bits are sign bits)
- SDValue IsAllSignBits =
- DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::SETEQ);
-
- // If all high bits are sign bits, compute for low part
- SDValue HiIsZero = DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::SETEQ);
- SDValue AdjustedLo =
- DAG.getSelect(dl, VT32, HiIsZero, Lo, DAG.getNOT(dl, Lo, VT32));
- SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VT32, AdjustedLo);
- SDValue Result = DAG.getSelect(
- dl, VT32, IsAllSignBits,
- DAG.getNode(ISD::ADD, dl, VT32, CLZAdjustedLo, Constant31), CLSHi);
-
- return Result;
- }
-
- // Vector types should be handled elsewhere
- return SDValue();
-}
-
/// Getvshiftimm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift operation, where all the elements of the
/// build_vector must have the same constant integer value.
@@ -10379,8 +10344,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SRL_PARTS:
case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
case ISD::CTTZ:
- case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
- case ISD::CTLS: return LowerCTLS(Op.getNode(), DAG, Subtarget);
+ case ISD::CTTZ_ZERO_UNDEF:
+ return LowerCTTZ(Op.getNode(), DAG, Subtarget);
case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget);
case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index b6dc9851b1d56..bc84654f8bd5a 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -608,8 +608,6 @@ class VectorType;
SDValue LowerFP_TO_BF16(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCMP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerABS(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerCTLS(SDNode *N, SelectionDAG &DAG,
- const ARMSubtarget *ST) const;
Register getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const override;
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 1e9c141f13f83..85559c58ad825 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -2494,9 +2494,9 @@ multiclass MVE_VCLSCLZ_p<string opname, bit opcode, MVEVectorVTInfo VTI,
}
}
-defm MVE_VCLSs8 : MVE_VCLSCLZ_p<"cls", 0, MVE_v16s8, int_arm_mve_vcls>;
-defm MVE_VCLSs16 : MVE_VCLSCLZ_p<"cls", 0, MVE_v8s16, int_arm_mve_vcls>;
-defm MVE_VCLSs32 : MVE_VCLSCLZ_p<"cls", 0, MVE_v4s32, int_arm_mve_vcls>;
+defm MVE_VCLSs8 : MVE_VCLSCLZ_p<"cls", 0, MVE_v16s8, ctls>;
+defm MVE_VCLSs16 : MVE_VCLSCLZ_p<"cls", 0, MVE_v8s16, ctls>;
+defm MVE_VCLSs32 : MVE_VCLSCLZ_p<"cls", 0, MVE_v4s32, ctls>;
defm MVE_VCLZs8 : MVE_VCLSCLZ_p<"clz", 1, MVE_v16i8, ctlz>;
defm MVE_VCLZs16 : MVE_VCLSCLZ_p<"clz", 1, MVE_v8i16, ctlz>;
diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td
index dc62a09f942e2..69df2bf0efce2 100644
--- a/llvm/lib/Target/ARM/ARMInstrNEON.td
+++ b/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -6220,7 +6220,7 @@ defm VQNEG : N2VInt_QHS<0b11, 0b11, 0b00, 0b01111, 0,
// VCLS : Vector Count Leading Sign Bits
defm VCLS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01000, 0,
IIC_VCNTiD, IIC_VCNTiQ, "vcls", "s",
- int_arm_neon_vcls>;
+ ctls>;
// VCLZ : Vector Count Leading Zeros
defm VCLZ : N2VInt_QHS<0b11, 0b11, 0b00, 0b01001, 0,
IIC_VCNTiD, IIC_VCNTiQ, "vclz", "i",
diff --git a/llvm/test/CodeGen/ARM/cls.ll b/llvm/test/CodeGen/ARM/cls.ll
index 273460c1aa7a2..ab0244a2aeb04 100644
--- a/llvm/test/CodeGen/ARM/cls.ll
+++ b/llvm/test/CodeGen/ARM/cls.ll
@@ -1,138 +1,27 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc -mtriple=armv5 %s -o - | FileCheck %s --check-prefixes=CHECKV5
-; RUN: llc -mtriple=thumbv8.1-m.main %s -o - | FileCheck %s --check-prefix=CHECKV8
+; RUN: llc -mtriple=armv5 %s -o - | FileCheck %s
+; CHECK: eor [[T:r[0-9]+]], [[T]], [[T]], asr #31
+; CHECK-NEXT: mov [[C1:r[0-9]+]], #1
+; CHECK-NEXT: orr [[T]], [[C1]], [[T]], lsl #1
+; CHECK-NEXT: clz [[T]], [[T]]
define i32 @cls(i32 %t) {
-; CHECKV5-LABEL: cls:
-; CHECKV5: @ %bb.0:
-; CHECKV5-NEXT: eor r0, r0, r0, asr #31
-; CHECKV5-NEXT: mov r1, #1
-; CHECKV5-NEXT: orr r0, r1, r0, lsl #1
-; CHECKV5-NEXT: clz r0, r0
-; CHECKV5-NEXT: bx lr
-;
-; CHECKV8-LABEL: cls:
-; CHECKV8: @ %bb.0:
-; CHECKV8-NEXT: asrs r1, r0, #31
-; CHECKV8-NEXT: eors r1, r0
-; CHECKV8-NEXT: lsls r0, r1, #1
-; CHECKV8-NEXT: lsrs r1, r0, #1
-; CHECKV8-NEXT: orrs r1, r0
-; CHECKV8-NEXT: lsrs r0, r1, #2
-; CHECKV8-NEXT: orrs r0, r1
-; CHECKV8-NEXT: lsrs r1, r0, #4
-; CHECKV8-NEXT: orrs r1, r0
-; CHECKV8-NEXT: lsrs r0, r1, #8
-; CHECKV8-NEXT: orrs r0, r1
-; CHECKV8-NEXT: lsrs r1, r0, #16
-; CHECKV8-NEXT: orrs r1, r0
-; CHECKV8-NEXT: mvns r0, r1
-; CHECKV8-NEXT: movs r1, #1
-; CHECKV8-NEXT: lsrs r2, r0, #1
-; CHECKV8-NEXT: bics r0, r1
-; CHECKV8-NEXT: ldr r1, .LCPI0_0
-; CHECKV8-NEXT: ands r1, r2
-; CHECKV8-NEXT: subs r0, r0, r1
-; CHECKV8-NEXT: ldr r1, .LCPI0_1
-; CHECKV8-NEXT: lsrs r2, r0, #2
-; CHECKV8-NEXT: ands r0, r1
-; CHECKV8-NEXT: ands r2, r1
-; CHECKV8-NEXT: adds r0, r0, r2
-; CHECKV8-NEXT: lsrs r1, r0, #4
-; CHECKV8-NEXT: adds r0, r0, r1
-; CHECKV8-NEXT: ldr r1, .LCPI0_2
-; CHECKV8-NEXT: ands r1, r0
-; CHECKV8-NEXT: ldr r0, .LCPI0_3
-; CHECKV8-NEXT: muls r0, r1, r0
-; CHECKV8-NEXT: lsrs r0, r0, #24
-; CHECKV8-NEXT: bx lr
-; CHECKV8-NEXT: .p2align 2
-; CHECKV8-NEXT: @ %bb.1:
-; CHECKV8-NEXT: .LCPI0_0:
-; CHECKV8-NEXT: .long 1431655765 @ 0x55555555
-; CHECKV8-NEXT: .LCPI0_1:
-; CHECKV8-NEXT: .long 858993459 @ 0x33333333
-; CHECKV8-NEXT: .LCPI0_2:
-; CHECKV8-NEXT: .long 252645135 @ 0xf0f0f0f
-; CHECKV8-NEXT: .LCPI0_3:
-; CHECKV8-NEXT: .long 16843009 @ 0x1010101
%cls.i = call i32 @llvm.arm.cls(i32 %t)
ret i32 %cls.i
}
+; CHECK: cmp r1, #0
+; CHECK: mvnmi [[ADJUSTEDLO:r[0-9]+]], r0
+; CHECK: clz [[CLZLO:r[0-9]+]], [[ADJUSTEDLO]]
+; CHECK: eor [[A:r[0-9]+]], r1, r1, asr #31
+; CHECK: mov [[TMP:r[0-9]+]], #1
+; CHECK: orr [[A]], [[TMP]], [[A]], lsl #1
+; CHECK: clz [[CLSHI:r[0-9]+]], [[A]]
+; CHECK: cmp [[CLSHI]], #31
+; CHECK: addeq r0, [[CLZLO]], #31
define i32 @cls64(i64 %t) {
-; CHECKV5-LABEL: cls64:
-; CHECKV5: @ %bb.0:
-; CHECKV5-NEXT: cmp r1, #0
-; CHECKV5-NEXT: mvnne r0, r0
-; CHECKV5-NEXT: clz r2, r0
-; CHECKV5-NEXT: eor r0, r1, r1, asr #31
-; CHECKV5-NEXT: mov r1, #1
-; CHECKV5-NEXT: orr r0, r1, r0, lsl #1
-; CHECKV5-NEXT: clz r0, r0
-; CHECKV5-NEXT: cmp r0, #31
-; CHECKV5-NEXT: addeq r0, r2, #31
-; CHECKV5-NEXT: bx lr
-;
-; CHECKV8-LABEL: cls64:
-; CHECKV8: @ %bb.0:
-; CHECKV8-NEXT: push {r4, lr}
-; CHECKV8-NEXT: movs r4, r0
-; CHECKV8-NEXT: cmp r1, #0
-; CHECKV8-NEXT: beq .LBB1_2
-; CHECKV8-NEXT: @ %bb.1:
-; CHECKV8-NEXT: mvns r4, r4
-; CHECKV8-NEXT: .LBB1_2:
-; CHECKV8-NEXT: asrs r0, r1, #31
-; CHECKV8-NEXT: eors r0, r1
-; CHECKV8-NEXT: lsls r0, r0, #1
-; CHECKV8-NEXT: adds r0, r0, #1
-; CHECKV8-NEXT: bl __clzsi2
-; CHECKV8-NEXT: cmp r0, #31
-; CHECKV8-NEXT: bne .LBB1_4
-; CHECKV8-NEXT: @ %bb.3:
-; CHECKV8-NEXT: lsrs r0, r4, #1
-; CHECKV8-NEXT: orrs r0, r4
-; CHECKV8-NEXT: lsrs r1, r0, #2
-; CHECKV8-NEXT: orrs r1, r0
-; CHECKV8-NEXT: lsrs r0, r1, #4
-; CHECKV8-NEXT: orrs r0, r1
-; CHECKV8-NEXT: lsrs r1, r0, #8
-; CHECKV8-NEXT: orrs r1, r0
-; CHECKV8-NEXT: lsrs r0, r1, #16
-; CHECKV8-NEXT: orrs r0, r1
-; CHECKV8-NEXT: mvns r0, r0
-; CHECKV8-NEXT: lsrs r1, r0, #1
-; CHECKV8-NEXT: ldr r2, .LCPI1_0
-; CHECKV8-NEXT: ands r2, r1
-; CHECKV8-NEXT: subs r0, r0, r2
-; CHECKV8-NEXT: ldr r1, .LCPI1_1
-; CHECKV8-NEXT: lsrs r2, r0, #2
-; CHECKV8-NEXT: ands r0, r1
-; CHECKV8-NEXT: ands r2, r1
-; CHECKV8-NEXT: adds r0, r0, r2
-; CHECKV8-NEXT: lsrs r1, r0, #4
-; CHECKV8-NEXT: adds r0, r0, r1
-; CHECKV8-NEXT: ldr r1, .LCPI1_2
-; CHECKV8-NEXT: ands r1, r0
-; CHECKV8-NEXT: ldr r0, .LCPI1_3
-; CHECKV8-NEXT: muls r0, r1, r0
-; CHECKV8-NEXT: lsrs r0, r0, #24
-; CHECKV8-NEXT: adds r0, #31
-; CHECKV8-NEXT: .LBB1_4:
-; CHECKV8-NEXT: pop {r4}
-; CHECKV8-NEXT: pop {r1}
-; CHECKV8-NEXT: bx r1
-; CHECKV8-NEXT: .p2align 2
-; CHECKV8-NEXT: @ %bb.5:
-; CHECKV8-NEXT: .LCPI1_0:
-; CHECKV8-NEXT: .long 1431655765 @ 0x55555555
-; CHECKV8-NEXT: .LCPI1_1:
-; CHECKV8-NEXT: .long 858993459 @ 0x33333333
-; CHECKV8-NEXT: .LCPI1_2:
-; CHECKV8-NEXT: .long 252645135 @ 0xf0f0f0f
-; CHECKV8-NEXT: .LCPI1_3:
-; CHECKV8-NEXT: .long 16843009 @ 0x1010101
%cls.i = call i32 @llvm.arm.cls64(i64 %t)
ret i32 %cls.i
}
+
+declare i32 @llvm.arm.cls(i32) nounwind
+declare i32 @llvm.arm.cls64(i64) nounwind
diff --git a/llvm/test/CodeGen/ARM/mve-cls.ll b/llvm/test/CodeGen/ARM/mve-cls.ll
new file mode 100644
index 0000000000000..f4cb5bfa9d7f6
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/mve-cls.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=arm-none-eabi -mcpu=cortex-m85 -mattr=+mve %s -o - | FileCheck %s
+
+; Test MVE vector CLS (Count Leading Sign bits) operations
+; The intrinsics are lowered to ISD::CTLS and selected to VCLS instructions
+
+define <16 x i8> @test_cls_v16i8(<16 x i8> %a) nounwind {
+; CHECK-LABEL: test_cls_v16i8:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d1, r2, r3
+; CHECK-NEXT: vmov d0, r0, r1
+; CHECK-NEXT: vcls.s8 q0, q0
+; CHECK-NEXT: vmov r0, r1, d0
+; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: bx lr
+ %result = call <16 x i8> @llvm.arm.mve.vcls.v16i8(<16 x i8> %a)
+ ret <16 x i8> %result
+}
+
+define <8 x i16> @test_cls_v8i16(<8 x i16> %a) nounwind {
+; CHECK-LABEL: test_cls_v8i16:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d1, r2, r3
+; CHECK-NEXT: vmov d0, r0, r1
+; CHECK-NEXT: vcls.s16 q0, q0
+; CHECK-NEXT: vmov r0, r1, d0
+; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: bx lr
+ %result = call <8 x i16> @llvm.arm.mve.vcls.v8i16(<8 x i16> %a)
+ ret <8 x i16> %result
+}
+
+define <4 x i32> @test_cls_v4i32(<4 x i32> %a) nounwind {
+; CHECK-LABEL: test_cls_v4i32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d1, r2, r3
+; CHECK-NEXT: vmov d0, r0, r1
+; CHECK-NEXT: vcls.s32 q0, q0
+; CHECK-NEXT: vmov r0, r1, d0
+; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: bx lr
+ %result = call <4 x i32> @llvm.arm.mve.vcls.v4i32(<4 x i32> %a)
+ ret <4 x i32> %result
+}
+
+declare <16 x i8> @llvm.arm.mve.vcls.v16i8(<16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.mve.vcls.v8i16(<8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.mve.vcls.v4i32(<4 x i32>) nounwind readnone
diff --git a/llvm/test/CodeGen/ARM/neon-cls.ll b/llvm/test/CodeGen/ARM/neon-cls.ll
new file mode 100644
index 0000000000000..8113274440d86
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/neon-cls.ll
@@ -0,0 +1,84 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
+
+; Test ARM NEON vector CLS (Count Leading Sign bits) operations
+; The intrinsics are lowered to ISD::CTLS and selected to VCLS instructions
+
+define <8 x i8> @test_cls_v8i8(<8 x i8> %a) nounwind {
+; CHECK-LABEL: test_cls_v8i8:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d16, r0, r1
+; CHECK-NEXT: vcls.s8 d16, d16
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: mov pc, lr
+ %result = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %a)
+ ret <8 x i8> %result
+}
+
+define <16 x i8> @test_cls_v16i8(<16 x i8> %a) nounwind {
+; CHECK-LABEL: test_cls_v16i8:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d17, r2, r3
+; CHECK-NEXT: vmov d16, r0, r1
+; CHECK-NEXT: vcls.s8 q8, q8
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: mov pc, lr
+ %result = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %a)
+ ret <16 x i8> %result
+}
+
+define <4 x i16> @test_cls_v4i16(<4 x i16> %a) nounwind {
+; CHECK-LABEL: test_cls_v4i16:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d16, r0, r1
+; CHECK-NEXT: vcls.s16 d16, d16
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: mov pc, lr
+ %result = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %a)
+ ret <4 x i16> %result
+}
+
+define <8 x i16> @test_cls_v8i16(<8 x i16> %a) nounwind {
+; CHECK-LABEL: test_cls_v8i16:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d17, r2, r3
+; CHECK-NEXT: vmov d16, r0, r1
+; CHECK-NEXT: vcls.s16 q8, q8
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: mov pc, lr
+ %result = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %a)
+ ret <8 x i16> %result
+}
+
+define <2 x i32> @test_cls_v2i32(<2 x i32> %a) nounwind {
+; CHECK-LABEL: test_cls_v2i32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d16, r0, r1
+; CHECK-NEXT: vcls.s32 d16, d16
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: mov pc, lr
+ %result = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %a)
+ ret <2 x i32> %result
+}
+
+define <4 x i32> @test_cls_v4i32(<4 x i32> %a) nounwind {
+; CHECK-LABEL: test_cls_v4i32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d17, r2, r3
+; CHECK-NEXT: vmov d16, r0, r1
+; CHECK-NEXT: vcls.s32 q8, q8
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: mov pc, lr
+ %result = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %a)
+ ret <4 x i32> %result
+}
+
+declare <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32>) nounwind readnone
>From f64cce370696feb742f2495ccf7d5f93fa7e8088 Mon Sep 17 00:00:00 2001
From: Hamza Hassanain <hamzahassanain067 at gmail.com>
Date: Fri, 30 Jan 2026 15:52:49 +0200
Subject: [PATCH 06/18] [ARM] Simplify CLS intrinsic lowering to use ISD::CTLS
- Convert arm_cls to emit ISD::CTLS instead of manual expansion
- arm_cls64 emits ISD::CTLS for i64 and truncates to i32
- Both intrinsics now consistently route through ISD::CTLS infrastructure
- Actual expansion to CTLZ pattern handled by LowerCTLS custom operation
---
.../CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 2 +-
llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 6 +++---
llvm/lib/Target/ARM/ARMISelLowering.cpp | 12 +-----------
3 files changed, 5 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index cd675c6cba786..ea219b839e476 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -4173,7 +4173,7 @@ void DAGTypeLegalizer::ExpandIntRes_CTLS(SDNode *N, SDValue &Lo, SDValue &Hi) {
// else ctls(Hi)
GetExpandedInteger(N->getOperand(0), Lo, Hi);
EVT NVT = Lo.getValueType();
- unsigned NVTBits = NVT.getSizeInBits();
+ unsigned NVTBits = NVT.getScalarSizeInBits();
SDValue Constant0 = DAG.getConstant(0, dl, NVT);
SDValue ConstantBWM1 = DAG.getConstant(NVTBits - 1, dl, NVT);
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 9416981060861..4d5fe5df7bb08 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -9581,17 +9581,17 @@ SDValue TargetLowering::expandCTLZ(SDNode *Node, SelectionDAG &DAG) const {
SDValue TargetLowering::expandCTLS(SDNode *Node, SelectionDAG &DAG) const {
SDLoc dl(Node);
EVT VT = Node->getValueType(0);
- EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
SDValue Op = Node->getOperand(0);
unsigned NumBitsPerElt = VT.getScalarSizeInBits();
// CTLS(x) = CTLZ(OR(SHL(XOR(x, SRA(x, BW-1)), 1), 1))
// This transforms the sign bits into leading zeros that can be counted.
- SDValue ShiftAmt = DAG.getConstant(NumBitsPerElt - 1, dl, ShVT);
+ SDValue ShiftAmt = DAG.getShiftAmountConstant(NumBitsPerElt - 1, VT, dl);
+ SDValue OneShift = DAG.getShiftAmountConstant(1, VT, dl);
SDValue One = DAG.getConstant(1, dl, VT);
SDValue SignBit = DAG.getNode(ISD::SRA, dl, VT, Op, ShiftAmt);
SDValue Xor = DAG.getNode(ISD::XOR, dl, VT, Op, SignBit);
- SDValue Shl = DAG.getNode(ISD::SHL, dl, VT, Xor, One);
+ SDValue Shl = DAG.getNode(ISD::SHL, dl, VT, Xor, OneShift);
SDValue Or = DAG.getNode(ISD::OR, dl, VT, Shl, One);
return DAG.getNode(ISD::CTLZ, dl, VT, Or);
}
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 7cf02c57ebde8..acd44a9c7e282 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -3840,19 +3840,9 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
}
case Intrinsic::arm_cls: {
- // ARM32 scalar CLS: CTLS(x) = CTLZ(OR(SHL(XOR(x, SRA(x, 31)), 1), 1))
- // We expand directly here instead of using ISD::CTLS since there's no
- // native scalar CLS instruction on ARM.
const SDValue &Operand = Op.getOperand(1);
const EVT VTy = Op.getValueType();
- SDValue SRA =
- DAG.getNode(ISD::SRA, dl, VTy, Operand, DAG.getConstant(31, dl, VTy));
- SDValue XOR = DAG.getNode(ISD::XOR, dl, VTy, SRA, Operand);
- SDValue SHL =
- DAG.getNode(ISD::SHL, dl, VTy, XOR, DAG.getConstant(1, dl, VTy));
- SDValue OR =
- DAG.getNode(ISD::OR, dl, VTy, SHL, DAG.getConstant(1, dl, VTy));
- return DAG.getNode(ISD::CTLZ, dl, VTy, OR);
+ return DAG.getNode(ISD::CTLS, dl, VTy, Operand);
}
case Intrinsic::arm_cls64: {
// arm_cls64 returns i32 but takes i64 input.
>From a19c60dde6bd1456d3dbea886654acf62adcd72e Mon Sep 17 00:00:00 2001
From: Hamza Hassanain <hamzahassanain067 at gmail.com>
Date: Mon, 2 Feb 2026 16:01:23 +0200
Subject: [PATCH 07/18] [DAG] Add CTLS integer expansion support
- Fix indentation of ISD::CTLS case in ExpandIntRes to match CTLZ style
- Add DAG.getFreeze() in expandCTLS to preserve operand semantics when increased uses
This enables proper expansion of CTLS operations for wider integer types.
---
llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 4 +---
llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 2 +-
2 files changed, 2 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index ea219b839e476..734ce5614c4e7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -3079,9 +3079,7 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::ABDU: ExpandIntRes_ABD(N, Lo, Hi); break;
case ISD::CTLZ_ZERO_UNDEF:
case ISD::CTLZ: ExpandIntRes_CTLZ(N, Lo, Hi); break;
- case ISD::CTLS:
- ExpandIntRes_CTLS(N, Lo, Hi);
- break;
+ case ISD::CTLS: ExpandIntRes_CTLS(N, Lo, Hi); break;
case ISD::CTPOP: ExpandIntRes_CTPOP(N, Lo, Hi); break;
case ISD::CTTZ_ZERO_UNDEF:
case ISD::CTTZ: ExpandIntRes_CTTZ(N, Lo, Hi); break;
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 4d5fe5df7bb08..827d694f0345d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -9581,7 +9581,7 @@ SDValue TargetLowering::expandCTLZ(SDNode *Node, SelectionDAG &DAG) const {
SDValue TargetLowering::expandCTLS(SDNode *Node, SelectionDAG &DAG) const {
SDLoc dl(Node);
EVT VT = Node->getValueType(0);
- SDValue Op = Node->getOperand(0);
+ SDValue Op = DAG.getFreeze(Node->getOperand(0));
unsigned NumBitsPerElt = VT.getScalarSizeInBits();
// CTLS(x) = CTLZ(OR(SHL(XOR(x, SRA(x, BW-1)), 1), 1))
>From 44dc0ac609080bbdf436036df7a646569af8336e Mon Sep 17 00:00:00 2001
From: Hamza Hassanain <hamzahassanain067 at gmail.com>
Date: Mon, 2 Feb 2026 16:01:32 +0200
Subject: [PATCH 08/18] [ARM] Use ISD::CTLS for scalar CLS intrinsics
- Lower arm_cls and arm_cls64 intrinsics to ISD::CTLS
- Lower vector arm_neon_vcls and arm_mve_vcls intrinsics to ISD::CTLS
- Inline temporary Operand variables for cleaner code
- Move explanatory comment to the arm_cls intrinsic lowering
- Remove CTTZ_ZERO_UNDEF custom lowering (now handled by generic DAG expansion)
This unifies CLS handling under the generic ISD::CTLS infrastructure
for better maintainability and consistency. Addresses issue #174337.
---
llvm/lib/Target/ARM/ARMISelLowering.cpp | 16 ++++++----------
1 file changed, 6 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index acd44a9c7e282..48fb124decbda 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1005,9 +1005,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
setOperationAction(ISD::ROTR, VT, Expand);
}
setOperationAction(ISD::CTTZ, MVT::i32, Custom);
- // Note: arm_cls and arm_cls64 intrinsics are expanded directly in
- // LowerINTRINSIC_WO_CHAIN since there's no native scalar CLS instruction.
- // Vector CTLS is Legal when NEON/MVE is available (set elsewhere).
// TODO: These two should be set to LibCall, but this currently breaks
// the Linux kernel build. See #101786.
setOperationAction(ISD::CTPOP, MVT::i32, Expand);
@@ -3840,6 +3837,9 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
}
case Intrinsic::arm_cls: {
+ // Note: arm_cls and arm_cls64 intrinsics are expanded directly here
+ // in LowerINTRINSIC_WO_CHAIN since there's no native scalar CLS instruction.
+ // Vector CTLS is Legal when NEON/MVE is available (set elsewhere).
const SDValue &Operand = Op.getOperand(1);
const EVT VTy = Op.getValueType();
return DAG.getNode(ISD::CTLS, dl, VTy, Operand);
@@ -3847,16 +3847,14 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
case Intrinsic::arm_cls64: {
// arm_cls64 returns i32 but takes i64 input.
// Use ISD::CTLS for i64 and truncate the result.
- const SDValue &Operand = Op.getOperand(1);
- SDValue CTLS64 = DAG.getNode(ISD::CTLS, dl, MVT::i64, Operand);
+ SDValue CTLS64 = DAG.getNode(ISD::CTLS, dl, MVT::i64, Op.getOperand(1));
return DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, CTLS64);
}
case Intrinsic::arm_neon_vcls:
case Intrinsic::arm_mve_vcls: {
// Lower vector CLS intrinsics to ISD::CTLS
- const SDValue &Operand = Op.getOperand(1);
const EVT VTy = Op.getValueType();
- return DAG.getNode(ISD::CTLS, dl, VTy, Operand);
+ return DAG.getNode(ISD::CTLS, dl, VTy, Op.getOperand(1));
}
case Intrinsic::eh_sjlj_lsda: {
MachineFunction &MF = DAG.getMachineFunction();
@@ -10333,9 +10331,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG);
case ISD::SRL_PARTS:
case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
- case ISD::CTTZ:
- case ISD::CTTZ_ZERO_UNDEF:
- return LowerCTTZ(Op.getNode(), DAG, Subtarget);
+ case ISD::CTTZ: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget);
case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
>From a72dd367616517591eee64502b8182ae5d978bb2 Mon Sep 17 00:00:00 2001
From: Hamza Hassanain <hamzahassanain067 at gmail.com>
Date: Mon, 2 Feb 2026 16:01:38 +0200
Subject: [PATCH 09/18] [Test] Remove redundant CLS intrinsic tests
MVE CLS intrinsics are already tested in:
llvm/test/CodeGen/Thumb2/mve-intrinsics/vcls.ll
NEON CLS intrinsics are already tested in:
llvm/test/CodeGen/ARM/vcnt.ll
These new test files were redundant with existing coverage.
---
llvm/test/CodeGen/ARM/mve-cls.ll | 48 ------------------
llvm/test/CodeGen/ARM/neon-cls.ll | 84 -------------------------------
2 files changed, 132 deletions(-)
delete mode 100644 llvm/test/CodeGen/ARM/mve-cls.ll
delete mode 100644 llvm/test/CodeGen/ARM/neon-cls.ll
diff --git a/llvm/test/CodeGen/ARM/mve-cls.ll b/llvm/test/CodeGen/ARM/mve-cls.ll
deleted file mode 100644
index f4cb5bfa9d7f6..0000000000000
--- a/llvm/test/CodeGen/ARM/mve-cls.ll
+++ /dev/null
@@ -1,48 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=arm-none-eabi -mcpu=cortex-m85 -mattr=+mve %s -o - | FileCheck %s
-
-; Test MVE vector CLS (Count Leading Sign bits) operations
-; The intrinsics are lowered to ISD::CTLS and selected to VCLS instructions
-
-define <16 x i8> @test_cls_v16i8(<16 x i8> %a) nounwind {
-; CHECK-LABEL: test_cls_v16i8:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d1, r2, r3
-; CHECK-NEXT: vmov d0, r0, r1
-; CHECK-NEXT: vcls.s8 q0, q0
-; CHECK-NEXT: vmov r0, r1, d0
-; CHECK-NEXT: vmov r2, r3, d1
-; CHECK-NEXT: bx lr
- %result = call <16 x i8> @llvm.arm.mve.vcls.v16i8(<16 x i8> %a)
- ret <16 x i8> %result
-}
-
-define <8 x i16> @test_cls_v8i16(<8 x i16> %a) nounwind {
-; CHECK-LABEL: test_cls_v8i16:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d1, r2, r3
-; CHECK-NEXT: vmov d0, r0, r1
-; CHECK-NEXT: vcls.s16 q0, q0
-; CHECK-NEXT: vmov r0, r1, d0
-; CHECK-NEXT: vmov r2, r3, d1
-; CHECK-NEXT: bx lr
- %result = call <8 x i16> @llvm.arm.mve.vcls.v8i16(<8 x i16> %a)
- ret <8 x i16> %result
-}
-
-define <4 x i32> @test_cls_v4i32(<4 x i32> %a) nounwind {
-; CHECK-LABEL: test_cls_v4i32:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d1, r2, r3
-; CHECK-NEXT: vmov d0, r0, r1
-; CHECK-NEXT: vcls.s32 q0, q0
-; CHECK-NEXT: vmov r0, r1, d0
-; CHECK-NEXT: vmov r2, r3, d1
-; CHECK-NEXT: bx lr
- %result = call <4 x i32> @llvm.arm.mve.vcls.v4i32(<4 x i32> %a)
- ret <4 x i32> %result
-}
-
-declare <16 x i8> @llvm.arm.mve.vcls.v16i8(<16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm.mve.vcls.v8i16(<8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm.mve.vcls.v4i32(<4 x i32>) nounwind readnone
diff --git a/llvm/test/CodeGen/ARM/neon-cls.ll b/llvm/test/CodeGen/ARM/neon-cls.ll
deleted file mode 100644
index 8113274440d86..0000000000000
--- a/llvm/test/CodeGen/ARM/neon-cls.ll
+++ /dev/null
@@ -1,84 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
-
-; Test ARM NEON vector CLS (Count Leading Sign bits) operations
-; The intrinsics are lowered to ISD::CTLS and selected to VCLS instructions
-
-define <8 x i8> @test_cls_v8i8(<8 x i8> %a) nounwind {
-; CHECK-LABEL: test_cls_v8i8:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d16, r0, r1
-; CHECK-NEXT: vcls.s8 d16, d16
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: mov pc, lr
- %result = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %a)
- ret <8 x i8> %result
-}
-
-define <16 x i8> @test_cls_v16i8(<16 x i8> %a) nounwind {
-; CHECK-LABEL: test_cls_v16i8:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d17, r2, r3
-; CHECK-NEXT: vmov d16, r0, r1
-; CHECK-NEXT: vcls.s8 q8, q8
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
-; CHECK-NEXT: mov pc, lr
- %result = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %a)
- ret <16 x i8> %result
-}
-
-define <4 x i16> @test_cls_v4i16(<4 x i16> %a) nounwind {
-; CHECK-LABEL: test_cls_v4i16:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d16, r0, r1
-; CHECK-NEXT: vcls.s16 d16, d16
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: mov pc, lr
- %result = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %a)
- ret <4 x i16> %result
-}
-
-define <8 x i16> @test_cls_v8i16(<8 x i16> %a) nounwind {
-; CHECK-LABEL: test_cls_v8i16:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d17, r2, r3
-; CHECK-NEXT: vmov d16, r0, r1
-; CHECK-NEXT: vcls.s16 q8, q8
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
-; CHECK-NEXT: mov pc, lr
- %result = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %a)
- ret <8 x i16> %result
-}
-
-define <2 x i32> @test_cls_v2i32(<2 x i32> %a) nounwind {
-; CHECK-LABEL: test_cls_v2i32:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d16, r0, r1
-; CHECK-NEXT: vcls.s32 d16, d16
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: mov pc, lr
- %result = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %a)
- ret <2 x i32> %result
-}
-
-define <4 x i32> @test_cls_v4i32(<4 x i32> %a) nounwind {
-; CHECK-LABEL: test_cls_v4i32:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d17, r2, r3
-; CHECK-NEXT: vmov d16, r0, r1
-; CHECK-NEXT: vcls.s32 q8, q8
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
-; CHECK-NEXT: mov pc, lr
- %result = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %a)
- ret <4 x i32> %result
-}
-
-declare <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16>) nounwind readnone
-declare <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32>) nounwind readnone
>From 0b957d3741b86fec1f1cf11263a40c46002b87a6 Mon Sep 17 00:00:00 2001
From: Hamza Hassanain <hamzahassanain067 at gmail.com>
Date: Mon, 2 Feb 2026 16:16:21 +0200
Subject: [PATCH 10/18] [NFC] Fix clang-format issues
---
llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 4 +++-
llvm/lib/Target/ARM/ARMISelLowering.cpp | 8 +++++---
2 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 734ce5614c4e7..ea219b839e476 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -3079,7 +3079,9 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::ABDU: ExpandIntRes_ABD(N, Lo, Hi); break;
case ISD::CTLZ_ZERO_UNDEF:
case ISD::CTLZ: ExpandIntRes_CTLZ(N, Lo, Hi); break;
- case ISD::CTLS: ExpandIntRes_CTLS(N, Lo, Hi); break;
+ case ISD::CTLS:
+ ExpandIntRes_CTLS(N, Lo, Hi);
+ break;
case ISD::CTPOP: ExpandIntRes_CTPOP(N, Lo, Hi); break;
case ISD::CTTZ_ZERO_UNDEF:
case ISD::CTTZ: ExpandIntRes_CTTZ(N, Lo, Hi); break;
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 48fb124decbda..85dcb0d542e9d 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -3838,8 +3838,9 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
}
case Intrinsic::arm_cls: {
// Note: arm_cls and arm_cls64 intrinsics are expanded directly here
- // in LowerINTRINSIC_WO_CHAIN since there's no native scalar CLS instruction.
- // Vector CTLS is Legal when NEON/MVE is available (set elsewhere).
+ // in LowerINTRINSIC_WO_CHAIN since there's no native scalar CLS
+ // instruction. Vector CTLS is Legal when NEON/MVE is available (set
+ // elsewhere).
const SDValue &Operand = Op.getOperand(1);
const EVT VTy = Op.getValueType();
return DAG.getNode(ISD::CTLS, dl, VTy, Operand);
@@ -10331,7 +10332,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG);
case ISD::SRL_PARTS:
case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
- case ISD::CTTZ: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
+ case ISD::CTTZ:
+ return LowerCTTZ(Op.getNode(), DAG, Subtarget);
case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget);
case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
>From a606942aebb2f741abd9ab901063e44aed01fb44 Mon Sep 17 00:00:00 2001
From: Hamza Hassanain <hamzahassanain067 at gmail.com>
Date: Mon, 2 Feb 2026 16:55:36 +0200
Subject: [PATCH 11/18] [ARM] Fix CTTZ_ZERO_UNDEF custom lowering
---
llvm/lib/Target/ARM/ARMISelLowering.cpp | 1 +
1 file changed, 1 insertion(+)
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 85dcb0d542e9d..903c3c84670af 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -10333,6 +10333,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SRL_PARTS:
case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
case ISD::CTTZ:
+ case ISD::CTTZ_ZERO_UNDEF:
return LowerCTTZ(Op.getNode(), DAG, Subtarget);
case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget);
>From 4a5b1b121b80687e461b6aec43d76a7e23c13c31 Mon Sep 17 00:00:00 2001
From: Hamza Hassanain <hamzahassanain067 at gmail.com>
Date: Wed, 4 Feb 2026 05:10:05 +0200
Subject: [PATCH 12/18] [ARM] address reviwer commits
---
llvm/lib/Target/ARM/ARMISelLowering.cpp | 7 +-
llvm/test/CodeGen/ARM/cls.ll | 145 +++++++++++++++++++++---
2 files changed, 131 insertions(+), 21 deletions(-)
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 903c3c84670af..444448b140cc6 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -3839,8 +3839,7 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
case Intrinsic::arm_cls: {
// Note: arm_cls and arm_cls64 intrinsics are expanded directly here
// in LowerINTRINSIC_WO_CHAIN since there's no native scalar CLS
- // instruction. Vector CTLS is Legal when NEON/MVE is available (set
- // elsewhere).
+ // instruction.
const SDValue &Operand = Op.getOperand(1);
const EVT VTy = Op.getValueType();
return DAG.getNode(ISD::CTLS, dl, VTy, Operand);
@@ -3853,7 +3852,8 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
}
case Intrinsic::arm_neon_vcls:
case Intrinsic::arm_mve_vcls: {
- // Lower vector CLS intrinsics to ISD::CTLS
+ // Lower vector CLS intrinsics to ISD::CTLS.
+ // Vector CTLS is Legal when NEON/MVE is available (set elsewhere).
const EVT VTy = Op.getValueType();
return DAG.getNode(ISD::CTLS, dl, VTy, Op.getOperand(1));
}
@@ -10333,7 +10333,6 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SRL_PARTS:
case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
case ISD::CTTZ:
- case ISD::CTTZ_ZERO_UNDEF:
return LowerCTTZ(Op.getNode(), DAG, Subtarget);
case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget);
diff --git a/llvm/test/CodeGen/ARM/cls.ll b/llvm/test/CodeGen/ARM/cls.ll
index ab0244a2aeb04..273460c1aa7a2 100644
--- a/llvm/test/CodeGen/ARM/cls.ll
+++ b/llvm/test/CodeGen/ARM/cls.ll
@@ -1,27 +1,138 @@
-; RUN: llc -mtriple=armv5 %s -o - | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=armv5 %s -o - | FileCheck %s --check-prefixes=CHECKV5
+; RUN: llc -mtriple=thumbv8.1-m.main %s -o - | FileCheck %s --check-prefix=CHECKV8
-; CHECK: eor [[T:r[0-9]+]], [[T]], [[T]], asr #31
-; CHECK-NEXT: mov [[C1:r[0-9]+]], #1
-; CHECK-NEXT: orr [[T]], [[C1]], [[T]], lsl #1
-; CHECK-NEXT: clz [[T]], [[T]]
define i32 @cls(i32 %t) {
+; CHECKV5-LABEL: cls:
+; CHECKV5: @ %bb.0:
+; CHECKV5-NEXT: eor r0, r0, r0, asr #31
+; CHECKV5-NEXT: mov r1, #1
+; CHECKV5-NEXT: orr r0, r1, r0, lsl #1
+; CHECKV5-NEXT: clz r0, r0
+; CHECKV5-NEXT: bx lr
+;
+; CHECKV8-LABEL: cls:
+; CHECKV8: @ %bb.0:
+; CHECKV8-NEXT: asrs r1, r0, #31
+; CHECKV8-NEXT: eors r1, r0
+; CHECKV8-NEXT: lsls r0, r1, #1
+; CHECKV8-NEXT: lsrs r1, r0, #1
+; CHECKV8-NEXT: orrs r1, r0
+; CHECKV8-NEXT: lsrs r0, r1, #2
+; CHECKV8-NEXT: orrs r0, r1
+; CHECKV8-NEXT: lsrs r1, r0, #4
+; CHECKV8-NEXT: orrs r1, r0
+; CHECKV8-NEXT: lsrs r0, r1, #8
+; CHECKV8-NEXT: orrs r0, r1
+; CHECKV8-NEXT: lsrs r1, r0, #16
+; CHECKV8-NEXT: orrs r1, r0
+; CHECKV8-NEXT: mvns r0, r1
+; CHECKV8-NEXT: movs r1, #1
+; CHECKV8-NEXT: lsrs r2, r0, #1
+; CHECKV8-NEXT: bics r0, r1
+; CHECKV8-NEXT: ldr r1, .LCPI0_0
+; CHECKV8-NEXT: ands r1, r2
+; CHECKV8-NEXT: subs r0, r0, r1
+; CHECKV8-NEXT: ldr r1, .LCPI0_1
+; CHECKV8-NEXT: lsrs r2, r0, #2
+; CHECKV8-NEXT: ands r0, r1
+; CHECKV8-NEXT: ands r2, r1
+; CHECKV8-NEXT: adds r0, r0, r2
+; CHECKV8-NEXT: lsrs r1, r0, #4
+; CHECKV8-NEXT: adds r0, r0, r1
+; CHECKV8-NEXT: ldr r1, .LCPI0_2
+; CHECKV8-NEXT: ands r1, r0
+; CHECKV8-NEXT: ldr r0, .LCPI0_3
+; CHECKV8-NEXT: muls r0, r1, r0
+; CHECKV8-NEXT: lsrs r0, r0, #24
+; CHECKV8-NEXT: bx lr
+; CHECKV8-NEXT: .p2align 2
+; CHECKV8-NEXT: @ %bb.1:
+; CHECKV8-NEXT: .LCPI0_0:
+; CHECKV8-NEXT: .long 1431655765 @ 0x55555555
+; CHECKV8-NEXT: .LCPI0_1:
+; CHECKV8-NEXT: .long 858993459 @ 0x33333333
+; CHECKV8-NEXT: .LCPI0_2:
+; CHECKV8-NEXT: .long 252645135 @ 0xf0f0f0f
+; CHECKV8-NEXT: .LCPI0_3:
+; CHECKV8-NEXT: .long 16843009 @ 0x1010101
%cls.i = call i32 @llvm.arm.cls(i32 %t)
ret i32 %cls.i
}
-; CHECK: cmp r1, #0
-; CHECK: mvnmi [[ADJUSTEDLO:r[0-9]+]], r0
-; CHECK: clz [[CLZLO:r[0-9]+]], [[ADJUSTEDLO]]
-; CHECK: eor [[A:r[0-9]+]], r1, r1, asr #31
-; CHECK: mov [[TMP:r[0-9]+]], #1
-; CHECK: orr [[A]], [[TMP]], [[A]], lsl #1
-; CHECK: clz [[CLSHI:r[0-9]+]], [[A]]
-; CHECK: cmp [[CLSHI]], #31
-; CHECK: addeq r0, [[CLZLO]], #31
define i32 @cls64(i64 %t) {
+; CHECKV5-LABEL: cls64:
+; CHECKV5: @ %bb.0:
+; CHECKV5-NEXT: cmp r1, #0
+; CHECKV5-NEXT: mvnne r0, r0
+; CHECKV5-NEXT: clz r2, r0
+; CHECKV5-NEXT: eor r0, r1, r1, asr #31
+; CHECKV5-NEXT: mov r1, #1
+; CHECKV5-NEXT: orr r0, r1, r0, lsl #1
+; CHECKV5-NEXT: clz r0, r0
+; CHECKV5-NEXT: cmp r0, #31
+; CHECKV5-NEXT: addeq r0, r2, #31
+; CHECKV5-NEXT: bx lr
+;
+; CHECKV8-LABEL: cls64:
+; CHECKV8: @ %bb.0:
+; CHECKV8-NEXT: push {r4, lr}
+; CHECKV8-NEXT: movs r4, r0
+; CHECKV8-NEXT: cmp r1, #0
+; CHECKV8-NEXT: beq .LBB1_2
+; CHECKV8-NEXT: @ %bb.1:
+; CHECKV8-NEXT: mvns r4, r4
+; CHECKV8-NEXT: .LBB1_2:
+; CHECKV8-NEXT: asrs r0, r1, #31
+; CHECKV8-NEXT: eors r0, r1
+; CHECKV8-NEXT: lsls r0, r0, #1
+; CHECKV8-NEXT: adds r0, r0, #1
+; CHECKV8-NEXT: bl __clzsi2
+; CHECKV8-NEXT: cmp r0, #31
+; CHECKV8-NEXT: bne .LBB1_4
+; CHECKV8-NEXT: @ %bb.3:
+; CHECKV8-NEXT: lsrs r0, r4, #1
+; CHECKV8-NEXT: orrs r0, r4
+; CHECKV8-NEXT: lsrs r1, r0, #2
+; CHECKV8-NEXT: orrs r1, r0
+; CHECKV8-NEXT: lsrs r0, r1, #4
+; CHECKV8-NEXT: orrs r0, r1
+; CHECKV8-NEXT: lsrs r1, r0, #8
+; CHECKV8-NEXT: orrs r1, r0
+; CHECKV8-NEXT: lsrs r0, r1, #16
+; CHECKV8-NEXT: orrs r0, r1
+; CHECKV8-NEXT: mvns r0, r0
+; CHECKV8-NEXT: lsrs r1, r0, #1
+; CHECKV8-NEXT: ldr r2, .LCPI1_0
+; CHECKV8-NEXT: ands r2, r1
+; CHECKV8-NEXT: subs r0, r0, r2
+; CHECKV8-NEXT: ldr r1, .LCPI1_1
+; CHECKV8-NEXT: lsrs r2, r0, #2
+; CHECKV8-NEXT: ands r0, r1
+; CHECKV8-NEXT: ands r2, r1
+; CHECKV8-NEXT: adds r0, r0, r2
+; CHECKV8-NEXT: lsrs r1, r0, #4
+; CHECKV8-NEXT: adds r0, r0, r1
+; CHECKV8-NEXT: ldr r1, .LCPI1_2
+; CHECKV8-NEXT: ands r1, r0
+; CHECKV8-NEXT: ldr r0, .LCPI1_3
+; CHECKV8-NEXT: muls r0, r1, r0
+; CHECKV8-NEXT: lsrs r0, r0, #24
+; CHECKV8-NEXT: adds r0, #31
+; CHECKV8-NEXT: .LBB1_4:
+; CHECKV8-NEXT: pop {r4}
+; CHECKV8-NEXT: pop {r1}
+; CHECKV8-NEXT: bx r1
+; CHECKV8-NEXT: .p2align 2
+; CHECKV8-NEXT: @ %bb.5:
+; CHECKV8-NEXT: .LCPI1_0:
+; CHECKV8-NEXT: .long 1431655765 @ 0x55555555
+; CHECKV8-NEXT: .LCPI1_1:
+; CHECKV8-NEXT: .long 858993459 @ 0x33333333
+; CHECKV8-NEXT: .LCPI1_2:
+; CHECKV8-NEXT: .long 252645135 @ 0xf0f0f0f
+; CHECKV8-NEXT: .LCPI1_3:
+; CHECKV8-NEXT: .long 16843009 @ 0x1010101
%cls.i = call i32 @llvm.arm.cls64(i64 %t)
ret i32 %cls.i
}
-
-declare i32 @llvm.arm.cls(i32) nounwind
-declare i32 @llvm.arm.cls64(i64) nounwind
>From 594f273f36bac2fd2a34746854a55da0c2b762d6 Mon Sep 17 00:00:00 2001
From: Hamza Hassanain <hamzahassanain067 at gmail.com>
Date: Tue, 10 Feb 2026 15:16:55 +0200
Subject: [PATCH 13/18] [ARM] Fix custom lowering for CTLZ and CTTZ_ZERO_UNDEF
---
llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 2 +-
llvm/lib/Target/ARM/ARMISelLowering.cpp | 1 +
llvm/test/CodeGen/ARM/cls.ll | 2 +-
3 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 827d694f0345d..fddf0ba4fd5ba 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -9593,7 +9593,7 @@ SDValue TargetLowering::expandCTLS(SDNode *Node, SelectionDAG &DAG) const {
SDValue Xor = DAG.getNode(ISD::XOR, dl, VT, Op, SignBit);
SDValue Shl = DAG.getNode(ISD::SHL, dl, VT, Xor, OneShift);
SDValue Or = DAG.getNode(ISD::OR, dl, VT, Shl, One);
- return DAG.getNode(ISD::CTLZ, dl, VT, Or);
+ return DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, VT, Or);
}
SDValue TargetLowering::expandVPCTLZ(SDNode *Node, SelectionDAG &DAG) const {
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 444448b140cc6..3c36d4b52d8f7 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -10333,6 +10333,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SRL_PARTS:
case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
case ISD::CTTZ:
+ case ISD::CTTZ_ZERO_UNDEF:
return LowerCTTZ(Op.getNode(), DAG, Subtarget);
case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget);
diff --git a/llvm/test/CodeGen/ARM/cls.ll b/llvm/test/CodeGen/ARM/cls.ll
index 273460c1aa7a2..808b01c701217 100644
--- a/llvm/test/CodeGen/ARM/cls.ll
+++ b/llvm/test/CodeGen/ARM/cls.ll
@@ -64,7 +64,7 @@ define i32 @cls64(i64 %t) {
; CHECKV5-LABEL: cls64:
; CHECKV5: @ %bb.0:
; CHECKV5-NEXT: cmp r1, #0
-; CHECKV5-NEXT: mvnne r0, r0
+; CHECKV5-NEXT: mvnmi r0, r0
; CHECKV5-NEXT: clz r2, r0
; CHECKV5-NEXT: eor r0, r1, r1, asr #31
; CHECKV5-NEXT: mov r1, #1
>From 7fb35dd79e6e1aaa8f4c07ac002c1ff2578daa1a Mon Sep 17 00:00:00 2001
From: Hamza Hassanain <hamzahassanain067 at gmail.com>
Date: Tue, 10 Feb 2026 16:04:53 +0200
Subject: [PATCH 14/18] Updated CHECKV8 to match the prologue and branch
condition
---
llvm/test/CodeGen/ARM/cls.ll | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/test/CodeGen/ARM/cls.ll b/llvm/test/CodeGen/ARM/cls.ll
index 808b01c701217..3c3e2bd61c1fc 100644
--- a/llvm/test/CodeGen/ARM/cls.ll
+++ b/llvm/test/CodeGen/ARM/cls.ll
@@ -13,6 +13,7 @@ define i32 @cls(i32 %t) {
;
; CHECKV8-LABEL: cls:
; CHECKV8: @ %bb.0:
+; CHECKV8-NEXT: push {r7, lr}
; CHECKV8-NEXT: asrs r1, r0, #31
; CHECKV8-NEXT: eors r1, r0
; CHECKV8-NEXT: lsls r0, r1, #1
@@ -79,7 +80,7 @@ define i32 @cls64(i64 %t) {
; CHECKV8-NEXT: push {r4, lr}
; CHECKV8-NEXT: movs r4, r0
; CHECKV8-NEXT: cmp r1, #0
-; CHECKV8-NEXT: beq .LBB1_2
+; CHECKV8-NEXT: bpl .LBB1_2
; CHECKV8-NEXT: @ %bb.1:
; CHECKV8-NEXT: mvns r4, r4
; CHECKV8-NEXT: .LBB1_2:
>From a681a6b23a0f2aa0a1f65d42e640c07f9f7439f6 Mon Sep 17 00:00:00 2001
From: Hamza Hassanain <hamzahassanain067 at gmail.com>
Date: Tue, 10 Feb 2026 16:43:49 +0200
Subject: [PATCH 15/18] [ARM] Update cls.ll checks for v8.1-m
---
llvm/test/CodeGen/ARM/cls.ll | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/llvm/test/CodeGen/ARM/cls.ll b/llvm/test/CodeGen/ARM/cls.ll
index 3c3e2bd61c1fc..808b01c701217 100644
--- a/llvm/test/CodeGen/ARM/cls.ll
+++ b/llvm/test/CodeGen/ARM/cls.ll
@@ -13,7 +13,6 @@ define i32 @cls(i32 %t) {
;
; CHECKV8-LABEL: cls:
; CHECKV8: @ %bb.0:
-; CHECKV8-NEXT: push {r7, lr}
; CHECKV8-NEXT: asrs r1, r0, #31
; CHECKV8-NEXT: eors r1, r0
; CHECKV8-NEXT: lsls r0, r1, #1
@@ -80,7 +79,7 @@ define i32 @cls64(i64 %t) {
; CHECKV8-NEXT: push {r4, lr}
; CHECKV8-NEXT: movs r4, r0
; CHECKV8-NEXT: cmp r1, #0
-; CHECKV8-NEXT: bpl .LBB1_2
+; CHECKV8-NEXT: beq .LBB1_2
; CHECKV8-NEXT: @ %bb.1:
; CHECKV8-NEXT: mvns r4, r4
; CHECKV8-NEXT: .LBB1_2:
>From 6a9d40652a6982d6f0529aa66e32d89644f55938 Mon Sep 17 00:00:00 2001
From: Hamza Hassanain <hamzahassanain067 at gmail.com>
Date: Tue, 10 Feb 2026 16:43:59 +0200
Subject: [PATCH 16/18] [ARM] Expand DYNAMIC_STACKALLOC on non-Windows
---
llvm/lib/Target/ARM/ARMISelLowering.cpp | 35 ++++++++++++++++++++++++-
1 file changed, 34 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 3c36d4b52d8f7..eef5c12722f46 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -20496,9 +20496,42 @@ SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
SDValue
ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
- assert(Subtarget->isTargetWindows() && "unsupported target platform");
SDLoc DL(Op);
+ if (!Subtarget->isTargetWindows()) {
+ Register SPReg = getStackPointerRegisterToSaveRestore();
+ assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
+ " not tell us which reg is the stack pointer!");
+ EVT VT = Op.getValueType();
+ SDValue Chain = Op.getOperand(0);
+ SDValue Size = Op.getOperand(1);
+ SDValue AlignOp = Op.getOperand(2);
+
+ Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
+
+ SDValue SP = DAG.getCopyFromReg(Chain, DL, SPReg, VT);
+ Chain = SP.getValue(1);
+
+ Align Alignment = cast<ConstantSDNode>(AlignOp)->getAlignValue();
+ const TargetFrameLowering *TFL = DAG.getSubtarget().getFrameLowering();
+ unsigned Opc = TFL->getStackGrowthDirection() ==
+ TargetFrameLowering::StackGrowsUp
+ ? ISD::ADD
+ : ISD::SUB;
+
+ Align StackAlign = TFL->getStackAlign();
+ SDValue NewSP = DAG.getNode(Opc, DL, VT, SP, Size);
+ if (Alignment > StackAlign)
+ NewSP = DAG.getNode(ISD::AND, DL, VT, NewSP,
+ DAG.getSignedConstant(-Alignment.value(), DL, VT));
+
+ Chain = DAG.getCopyToReg(Chain, DL, SPReg, NewSP);
+ SDValue End = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), DL);
+
+ SDValue Ops[2] = {NewSP, End};
+ return DAG.getMergeValues(Ops, DL);
+ }
+
// Get the inputs.
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
>From 459fcf1e42e3ed02ad2820bb0a4b828d1b7f9f17 Mon Sep 17 00:00:00 2001
From: Hamza Hassanain <hamzahassanain067 at gmail.com>
Date: Tue, 10 Feb 2026 20:01:13 +0200
Subject: [PATCH 17/18] ARM: restore Windows-only DYNAMIC_STACKALLOC
---
llvm/lib/Target/ARM/ARMISelLowering.cpp | 35 +------------------------
1 file changed, 1 insertion(+), 34 deletions(-)
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index eef5c12722f46..3c36d4b52d8f7 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -20496,42 +20496,9 @@ SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
SDValue
ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
+ assert(Subtarget->isTargetWindows() && "unsupported target platform");
SDLoc DL(Op);
- if (!Subtarget->isTargetWindows()) {
- Register SPReg = getStackPointerRegisterToSaveRestore();
- assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
- " not tell us which reg is the stack pointer!");
- EVT VT = Op.getValueType();
- SDValue Chain = Op.getOperand(0);
- SDValue Size = Op.getOperand(1);
- SDValue AlignOp = Op.getOperand(2);
-
- Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
-
- SDValue SP = DAG.getCopyFromReg(Chain, DL, SPReg, VT);
- Chain = SP.getValue(1);
-
- Align Alignment = cast<ConstantSDNode>(AlignOp)->getAlignValue();
- const TargetFrameLowering *TFL = DAG.getSubtarget().getFrameLowering();
- unsigned Opc = TFL->getStackGrowthDirection() ==
- TargetFrameLowering::StackGrowsUp
- ? ISD::ADD
- : ISD::SUB;
-
- Align StackAlign = TFL->getStackAlign();
- SDValue NewSP = DAG.getNode(Opc, DL, VT, SP, Size);
- if (Alignment > StackAlign)
- NewSP = DAG.getNode(ISD::AND, DL, VT, NewSP,
- DAG.getSignedConstant(-Alignment.value(), DL, VT));
-
- Chain = DAG.getCopyToReg(Chain, DL, SPReg, NewSP);
- SDValue End = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), DL);
-
- SDValue Ops[2] = {NewSP, End};
- return DAG.getMergeValues(Ops, DL);
- }
-
// Get the inputs.
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
>From 59fedf6ae491ae77b7f628162bfc9a89f519486d Mon Sep 17 00:00:00 2001
From: Hamza Hassanain <hamzahassanain067 at gmail.com>
Date: Tue, 10 Feb 2026 20:03:29 +0200
Subject: [PATCH 18/18] ARM: restore CTTZ case formatting
---
llvm/lib/Target/ARM/ARMISelLowering.cpp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 3c36d4b52d8f7..69753f83e2161 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -10333,8 +10333,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SRL_PARTS:
case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
case ISD::CTTZ:
- case ISD::CTTZ_ZERO_UNDEF:
- return LowerCTTZ(Op.getNode(), DAG, Subtarget);
+ case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget);
case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
More information about the llvm-commits
mailing list