[llvm] [ARM][AArch64] Replace manual CLS expansion with ISD::CTLS (PR #178430)
Hamza Hassanain via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 30 05:14:32 PST 2026
https://github.com/HamzaHassanain updated https://github.com/llvm/llvm-project/pull/178430
>From 1d7668737a5eaffcd3fcf1ee76a25f33887667c6 Mon Sep 17 00:00:00 2001
From: Hamza Hassanain <hamzahassanain067 at gmail.com>
Date: Wed, 28 Jan 2026 15:55:45 +0200
Subject: [PATCH 1/5] [ARM] Replace manual CLS expansion with ISD::CTLS
Converts ARM scalar CLS intrinsics to use the unified ISD::CTLS node
instead of custom manual expansion. This addresses issue #174337.
Changes:
- int_arm_cls and int_arm_cls64 now return ISD::CTLS nodes
- Added LowerCTLS to handle custom lowering for i32 and i64 CTLS operations
- i32 CTLS expansion generates the same CTLZ-based pattern as before
- i64 CTLS on 32-bit ARM splits into two 32-bit operations
- Added proper setOperationAction for CTLS operations
The assembly output remains identical to the previous manual expansion,
ensuring compatibility with existing code. This unifies the CLS handling
under the generic ISD::CTLS infrastructure for better maintainability.
---
llvm/lib/Target/ARM/ARMISelLowering.cpp | 95 ++++++++++++++++---------
llvm/lib/Target/ARM/ARMISelLowering.h | 2 +
llvm/test/CodeGen/ARM/mve-cls.ll | 45 ++++++++++++
llvm/test/CodeGen/ARM/neon-cls.ll | 81 +++++++++++++++++++++
4 files changed, 191 insertions(+), 32 deletions(-)
create mode 100644 llvm/test/CodeGen/ARM/mve-cls.ll
create mode 100644 llvm/test/CodeGen/ARM/neon-cls.ll
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 7b240462c66fb..aee3c50b4a476 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1004,6 +1004,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
setOperationAction(ISD::ROTR, VT, Expand);
}
setOperationAction(ISD::CTTZ, MVT::i32, Custom);
+ // CTLS (Count Leading Sign bits)
+ setOperationAction(ISD::CTLS, MVT::i32, Custom);
+ setOperationAction(ISD::CTLS, MVT::i64, Custom);
// TODO: These two should be set to LibCall, but this currently breaks
// the Linux kernel build. See #101786.
setOperationAction(ISD::CTPOP, MVT::i32, Expand);
@@ -3838,42 +3841,12 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
case Intrinsic::arm_cls: {
const SDValue &Operand = Op.getOperand(1);
const EVT VTy = Op.getValueType();
- SDValue SRA =
- DAG.getNode(ISD::SRA, dl, VTy, Operand, DAG.getConstant(31, dl, VTy));
- SDValue XOR = DAG.getNode(ISD::XOR, dl, VTy, SRA, Operand);
- SDValue SHL =
- DAG.getNode(ISD::SHL, dl, VTy, XOR, DAG.getConstant(1, dl, VTy));
- SDValue OR =
- DAG.getNode(ISD::OR, dl, VTy, SHL, DAG.getConstant(1, dl, VTy));
- SDValue Result = DAG.getNode(ISD::CTLZ, dl, VTy, OR);
- return Result;
+ return DAG.getNode(ISD::CTLS, dl, VTy, Operand);
}
case Intrinsic::arm_cls64: {
- // cls(x) = if cls(hi(x)) != 31 then cls(hi(x))
- // else 31 + clz(if hi(x) == 0 then lo(x) else not(lo(x)))
const SDValue &Operand = Op.getOperand(1);
const EVT VTy = Op.getValueType();
- SDValue Lo, Hi;
- std::tie(Lo, Hi) = DAG.SplitScalar(Operand, dl, VTy, VTy);
- SDValue Constant0 = DAG.getConstant(0, dl, VTy);
- SDValue Constant1 = DAG.getConstant(1, dl, VTy);
- SDValue Constant31 = DAG.getConstant(31, dl, VTy);
- SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VTy, Hi, Constant31);
- SDValue XORHi = DAG.getNode(ISD::XOR, dl, VTy, SRAHi, Hi);
- SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VTy, XORHi, Constant1);
- SDValue ORHi = DAG.getNode(ISD::OR, dl, VTy, SHLHi, Constant1);
- SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VTy, ORHi);
- SDValue CheckLo =
- DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::CondCode::SETEQ);
- SDValue HiIsZero =
- DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::CondCode::SETEQ);
- SDValue AdjustedLo =
- DAG.getSelect(dl, VTy, HiIsZero, Lo, DAG.getNOT(dl, Lo, VTy));
- SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VTy, AdjustedLo);
- SDValue Result =
- DAG.getSelect(dl, VTy, CheckLo,
- DAG.getNode(ISD::ADD, dl, VTy, CLZAdjustedLo, Constant31), CLSHi);
- return Result;
+ return DAG.getNode(ISD::CTLS, dl, VTy, Operand);
}
case Intrinsic::eh_sjlj_lsda: {
MachineFunction &MF = DAG.getMachineFunction();
@@ -6309,6 +6282,63 @@ static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
return Res;
}
+static SDValue LowerCTLS(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+ SDValue Operand = N->getOperand(0);
+
+ if (VT == MVT::i32) {
+ // ARM32 scalar CLS: CTLS(x) = CTLZ(OR(SHL(XOR(x, SRA(x, 31)), 1), 1))
+ SDValue SRA =
+ DAG.getNode(ISD::SRA, dl, VT, Operand, DAG.getConstant(31, dl, VT));
+ SDValue XOR = DAG.getNode(ISD::XOR, dl, VT, SRA, Operand);
+ SDValue SHL =
+ DAG.getNode(ISD::SHL, dl, VT, XOR, DAG.getConstant(1, dl, VT));
+ SDValue OR =
+ DAG.getNode(ISD::OR, dl, VT, SHL, DAG.getConstant(1, dl, VT));
+ SDValue Result = DAG.getNode(ISD::CTLZ, dl, VT, OR);
+ return Result;
+ }
+
+ if (VT == MVT::i64) {
+ // For 64-bit on 32-bit ARM, we need to split into two 32-bit operations
+ EVT VT32 = MVT::i32;
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitScalar(Operand, dl, VT32, VT32);
+
+ SDValue Constant0 = DAG.getConstant(0, dl, VT32);
+ SDValue Constant1 = DAG.getConstant(1, dl, VT32);
+ SDValue Constant31 = DAG.getConstant(31, dl, VT32);
+
+ // Compute CTLS of high part
+ SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VT32, Hi, Constant31);
+ SDValue XORHi = DAG.getNode(ISD::XOR, dl, VT32, SRAHi, Hi);
+ SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VT32, XORHi, Constant1);
+ SDValue ORHi = DAG.getNode(ISD::OR, dl, VT32, SHLHi, Constant1);
+ SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VT32, ORHi);
+
+ // Check if CLSHi == 31 (all high bits are sign bits)
+ SDValue IsAllSignBits =
+ DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::SETEQ);
+
+ // If all high bits are sign bits, compute for low part
+ SDValue HiIsZero =
+ DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::SETEQ);
+ SDValue AdjustedLo =
+ DAG.getSelect(dl, VT32, HiIsZero, Lo, DAG.getNOT(dl, Lo, VT32));
+ SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VT32, AdjustedLo);
+ SDValue Result =
+ DAG.getSelect(dl, VT32, IsAllSignBits,
+ DAG.getNode(ISD::ADD, dl, VT32, CLZAdjustedLo, Constant31), CLSHi);
+
+ return Result;
+ }
+
+ // Vector types should be handled elsewhere
+ return SDValue();
+}
+
/// Getvshiftimm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift operation, where all the elements of the
/// build_vector must have the same constant integer value.
@@ -10352,6 +10382,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
+ case ISD::CTLS: return LowerCTLS(Op.getNode(), DAG, Subtarget);
case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget);
case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index bc84654f8bd5a..b6dc9851b1d56 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -608,6 +608,8 @@ class VectorType;
SDValue LowerFP_TO_BF16(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCMP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerABS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerCTLS(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *ST) const;
Register getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const override;
diff --git a/llvm/test/CodeGen/ARM/mve-cls.ll b/llvm/test/CodeGen/ARM/mve-cls.ll
new file mode 100644
index 0000000000000..cbf708f637992
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/mve-cls.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=arm-none-eabi -mcpu=cortex-m85 -mattr=+mve %s -o - | FileCheck %s
+
+define <16 x i8> @test_cls_v16s8(<16 x i8> %a) nounwind {
+; CHECK-LABEL: test_cls_v16s8:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d1, r2, r3
+; CHECK-NEXT: vmov d0, r0, r1
+; CHECK-NEXT: vcls.s8 q0, q0
+; CHECK-NEXT: vmov r0, r1, d0
+; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: bx lr
+ %result = call <16 x i8> @llvm.arm.mve.vcls.v16s8(<16 x i8> %a)
+ ret <16 x i8> %result
+}
+
+define <8 x i16> @test_cls_v8s16(<8 x i16> %a) nounwind {
+; CHECK-LABEL: test_cls_v8s16:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d1, r2, r3
+; CHECK-NEXT: vmov d0, r0, r1
+; CHECK-NEXT: vcls.s16 q0, q0
+; CHECK-NEXT: vmov r0, r1, d0
+; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: bx lr
+ %result = call <8 x i16> @llvm.arm.mve.vcls.v8s16(<8 x i16> %a)
+ ret <8 x i16> %result
+}
+
+define <4 x i32> @test_cls_v4s32(<4 x i32> %a) nounwind {
+; CHECK-LABEL: test_cls_v4s32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d1, r2, r3
+; CHECK-NEXT: vmov d0, r0, r1
+; CHECK-NEXT: vcls.s32 q0, q0
+; CHECK-NEXT: vmov r0, r1, d0
+; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: bx lr
+ %result = call <4 x i32> @llvm.arm.mve.vcls.v4s32(<4 x i32> %a)
+ ret <4 x i32> %result
+}
+
+declare <16 x i8> @llvm.arm.mve.vcls.v16s8(<16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.mve.vcls.v8s16(<8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.mve.vcls.v4s32(<4 x i32>) nounwind readnone
diff --git a/llvm/test/CodeGen/ARM/neon-cls.ll b/llvm/test/CodeGen/ARM/neon-cls.ll
new file mode 100644
index 0000000000000..4667b87a6f7bd
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/neon-cls.ll
@@ -0,0 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
+
+define <8 x i8> @test_cls_v8i8(<8 x i8> %a) nounwind {
+; CHECK-LABEL: test_cls_v8i8:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d16, r0, r1
+; CHECK-NEXT: vcls.s8 d16, d16
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: mov pc, lr
+ %result = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %a)
+ ret <8 x i8> %result
+}
+
+define <16 x i8> @test_cls_v16i8(<16 x i8> %a) nounwind {
+; CHECK-LABEL: test_cls_v16i8:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d17, r2, r3
+; CHECK-NEXT: vmov d16, r0, r1
+; CHECK-NEXT: vcls.s8 q8, q8
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: mov pc, lr
+ %result = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %a)
+ ret <16 x i8> %result
+}
+
+define <4 x i16> @test_cls_v4i16(<4 x i16> %a) nounwind {
+; CHECK-LABEL: test_cls_v4i16:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d16, r0, r1
+; CHECK-NEXT: vcls.s16 d16, d16
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: mov pc, lr
+ %result = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %a)
+ ret <4 x i16> %result
+}
+
+define <8 x i16> @test_cls_v8i16(<8 x i16> %a) nounwind {
+; CHECK-LABEL: test_cls_v8i16:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d17, r2, r3
+; CHECK-NEXT: vmov d16, r0, r1
+; CHECK-NEXT: vcls.s16 q8, q8
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: mov pc, lr
+ %result = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %a)
+ ret <8 x i16> %result
+}
+
+define <2 x i32> @test_cls_v2i32(<2 x i32> %a) nounwind {
+; CHECK-LABEL: test_cls_v2i32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d16, r0, r1
+; CHECK-NEXT: vcls.s32 d16, d16
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: mov pc, lr
+ %result = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %a)
+ ret <2 x i32> %result
+}
+
+define <4 x i32> @test_cls_v4i32(<4 x i32> %a) nounwind {
+; CHECK-LABEL: test_cls_v4i32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d17, r2, r3
+; CHECK-NEXT: vmov d16, r0, r1
+; CHECK-NEXT: vcls.s32 q8, q8
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: mov pc, lr
+ %result = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %a)
+ ret <4 x i32> %result
+}
+
+declare <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32>) nounwind readnone
>From c5b4e46db56cd62536475362b569b59bd8fa92b5 Mon Sep 17 00:00:00 2001
From: Hamza Hassanain <hamzahassanain067 at gmail.com>
Date: Wed, 28 Jan 2026 16:24:46 +0200
Subject: [PATCH 2/5] style: fix format errors
---
llvm/lib/Target/ARM/ARMISelLowering.cpp | 18 ++++++++----------
1 file changed, 8 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index aee3c50b4a476..c8356b386a68d 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -6282,8 +6282,7 @@ static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
return Res;
}
-static SDValue LowerCTLS(SDNode *N, SelectionDAG &DAG,
- const ARMSubtarget *ST) {
+static SDValue LowerCTLS(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
SDValue Operand = N->getOperand(0);
@@ -6295,8 +6294,7 @@ static SDValue LowerCTLS(SDNode *N, SelectionDAG &DAG,
SDValue XOR = DAG.getNode(ISD::XOR, dl, VT, SRA, Operand);
SDValue SHL =
DAG.getNode(ISD::SHL, dl, VT, XOR, DAG.getConstant(1, dl, VT));
- SDValue OR =
- DAG.getNode(ISD::OR, dl, VT, SHL, DAG.getConstant(1, dl, VT));
+ SDValue OR = DAG.getNode(ISD::OR, dl, VT, SHL, DAG.getConstant(1, dl, VT));
SDValue Result = DAG.getNode(ISD::CTLZ, dl, VT, OR);
return Result;
}
@@ -6323,14 +6321,13 @@ static SDValue LowerCTLS(SDNode *N, SelectionDAG &DAG,
DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::SETEQ);
// If all high bits are sign bits, compute for low part
- SDValue HiIsZero =
- DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::SETEQ);
+ SDValue HiIsZero = DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::SETEQ);
SDValue AdjustedLo =
DAG.getSelect(dl, VT32, HiIsZero, Lo, DAG.getNOT(dl, Lo, VT32));
SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VT32, AdjustedLo);
- SDValue Result =
- DAG.getSelect(dl, VT32, IsAllSignBits,
- DAG.getNode(ISD::ADD, dl, VT32, CLZAdjustedLo, Constant31), CLSHi);
+ SDValue Result = DAG.getSelect(
+ dl, VT32, IsAllSignBits,
+ DAG.getNode(ISD::ADD, dl, VT32, CLZAdjustedLo, Constant31), CLSHi);
return Result;
}
@@ -10382,7 +10379,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
- case ISD::CTLS: return LowerCTLS(Op.getNode(), DAG, Subtarget);
+ case ISD::CTLS:
+ return LowerCTLS(Op.getNode(), DAG, Subtarget);
case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget);
case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
>From 3b4d53594195f82814e8aa6b999ef37e9d73912b Mon Sep 17 00:00:00 2001
From: Hamza Hassanain <hamzahassanain067 at gmail.com>
Date: Thu, 29 Jan 2026 14:47:37 +0200
Subject: [PATCH 3/5] Fix: Change LowerCTLS from static to member function
The LowerCTLS function was declared as a member function in the header
but defined as a static file-scope function in the cpp file. This caused
a compilation error (unused function warning treated as error) because
the compiler couldn't resolve the function call properly.
Changed the definition to match the header declaration as a const member
function of ARMTargetLowering class.
---
llvm/lib/Target/ARM/ARMISelLowering.cpp | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index c8356b386a68d..794a8973c25b7 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -6282,7 +6282,8 @@ static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
return Res;
}
-static SDValue LowerCTLS(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) {
+SDValue ARMTargetLowering::LowerCTLS(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *ST) const {
SDLoc dl(N);
EVT VT = N->getValueType(0);
SDValue Operand = N->getOperand(0);
@@ -10379,8 +10380,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
- case ISD::CTLS:
- return LowerCTLS(Op.getNode(), DAG, Subtarget);
+ case ISD::CTLS: return LowerCTLS(Op.getNode(), DAG, Subtarget);
case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget);
case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
>From 061b4afcad82d7ad5d5749401d612ea95a4ca90a Mon Sep 17 00:00:00 2001
From: Hamza Hassanain <hamzahassanain067 at gmail.com>
Date: Thu, 29 Jan 2026 15:07:04 +0200
Subject: [PATCH 4/5] Remove vector CLS tests - vector implementation is in a
separate commit
The vector CLS intrinsics conversion tests should only be included after
the vector support has been fully implemented with proper ISD::CTLS Legal
actions for NEON and MVE types. Those changes are in commit e09ae3fa5fa2.
---
llvm/test/CodeGen/ARM/mve-cls.ll | 45 -----------------
llvm/test/CodeGen/ARM/neon-cls.ll | 81 -------------------------------
2 files changed, 126 deletions(-)
delete mode 100644 llvm/test/CodeGen/ARM/mve-cls.ll
delete mode 100644 llvm/test/CodeGen/ARM/neon-cls.ll
diff --git a/llvm/test/CodeGen/ARM/mve-cls.ll b/llvm/test/CodeGen/ARM/mve-cls.ll
deleted file mode 100644
index cbf708f637992..0000000000000
--- a/llvm/test/CodeGen/ARM/mve-cls.ll
+++ /dev/null
@@ -1,45 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=arm-none-eabi -mcpu=cortex-m85 -mattr=+mve %s -o - | FileCheck %s
-
-define <16 x i8> @test_cls_v16s8(<16 x i8> %a) nounwind {
-; CHECK-LABEL: test_cls_v16s8:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d1, r2, r3
-; CHECK-NEXT: vmov d0, r0, r1
-; CHECK-NEXT: vcls.s8 q0, q0
-; CHECK-NEXT: vmov r0, r1, d0
-; CHECK-NEXT: vmov r2, r3, d1
-; CHECK-NEXT: bx lr
- %result = call <16 x i8> @llvm.arm.mve.vcls.v16s8(<16 x i8> %a)
- ret <16 x i8> %result
-}
-
-define <8 x i16> @test_cls_v8s16(<8 x i16> %a) nounwind {
-; CHECK-LABEL: test_cls_v8s16:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d1, r2, r3
-; CHECK-NEXT: vmov d0, r0, r1
-; CHECK-NEXT: vcls.s16 q0, q0
-; CHECK-NEXT: vmov r0, r1, d0
-; CHECK-NEXT: vmov r2, r3, d1
-; CHECK-NEXT: bx lr
- %result = call <8 x i16> @llvm.arm.mve.vcls.v8s16(<8 x i16> %a)
- ret <8 x i16> %result
-}
-
-define <4 x i32> @test_cls_v4s32(<4 x i32> %a) nounwind {
-; CHECK-LABEL: test_cls_v4s32:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d1, r2, r3
-; CHECK-NEXT: vmov d0, r0, r1
-; CHECK-NEXT: vcls.s32 q0, q0
-; CHECK-NEXT: vmov r0, r1, d0
-; CHECK-NEXT: vmov r2, r3, d1
-; CHECK-NEXT: bx lr
- %result = call <4 x i32> @llvm.arm.mve.vcls.v4s32(<4 x i32> %a)
- ret <4 x i32> %result
-}
-
-declare <16 x i8> @llvm.arm.mve.vcls.v16s8(<16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm.mve.vcls.v8s16(<8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm.mve.vcls.v4s32(<4 x i32>) nounwind readnone
diff --git a/llvm/test/CodeGen/ARM/neon-cls.ll b/llvm/test/CodeGen/ARM/neon-cls.ll
deleted file mode 100644
index 4667b87a6f7bd..0000000000000
--- a/llvm/test/CodeGen/ARM/neon-cls.ll
+++ /dev/null
@@ -1,81 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
-
-define <8 x i8> @test_cls_v8i8(<8 x i8> %a) nounwind {
-; CHECK-LABEL: test_cls_v8i8:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d16, r0, r1
-; CHECK-NEXT: vcls.s8 d16, d16
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: mov pc, lr
- %result = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %a)
- ret <8 x i8> %result
-}
-
-define <16 x i8> @test_cls_v16i8(<16 x i8> %a) nounwind {
-; CHECK-LABEL: test_cls_v16i8:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d17, r2, r3
-; CHECK-NEXT: vmov d16, r0, r1
-; CHECK-NEXT: vcls.s8 q8, q8
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
-; CHECK-NEXT: mov pc, lr
- %result = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %a)
- ret <16 x i8> %result
-}
-
-define <4 x i16> @test_cls_v4i16(<4 x i16> %a) nounwind {
-; CHECK-LABEL: test_cls_v4i16:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d16, r0, r1
-; CHECK-NEXT: vcls.s16 d16, d16
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: mov pc, lr
- %result = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %a)
- ret <4 x i16> %result
-}
-
-define <8 x i16> @test_cls_v8i16(<8 x i16> %a) nounwind {
-; CHECK-LABEL: test_cls_v8i16:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d17, r2, r3
-; CHECK-NEXT: vmov d16, r0, r1
-; CHECK-NEXT: vcls.s16 q8, q8
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
-; CHECK-NEXT: mov pc, lr
- %result = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %a)
- ret <8 x i16> %result
-}
-
-define <2 x i32> @test_cls_v2i32(<2 x i32> %a) nounwind {
-; CHECK-LABEL: test_cls_v2i32:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d16, r0, r1
-; CHECK-NEXT: vcls.s32 d16, d16
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: mov pc, lr
- %result = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %a)
- ret <2 x i32> %result
-}
-
-define <4 x i32> @test_cls_v4i32(<4 x i32> %a) nounwind {
-; CHECK-LABEL: test_cls_v4i32:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d17, r2, r3
-; CHECK-NEXT: vmov d16, r0, r1
-; CHECK-NEXT: vcls.s32 q8, q8
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
-; CHECK-NEXT: mov pc, lr
- %result = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %a)
- ret <4 x i32> %result
-}
-
-declare <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16>) nounwind readnone
-declare <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32>) nounwind readnone
>From ecde030395dd116390bc72bda834671a89f1e400 Mon Sep 17 00:00:00 2001
From: Hamza Hassanain <hamzahassanain067 at gmail.com>
Date: Thu, 29 Jan 2026 19:36:45 +0200
Subject: [PATCH 5/5] [ARM] Convert CLS intrinsics to use ISD::CTLS
This patch converts ARM CLS intrinsics (arm_cls, arm_cls64, arm_neon_vcls,
arm_mve_vcls) to use the generic ISD::CTLS node.
- arm_cls: Expanded directly in LowerINTRINSIC_WO_CHAIN (no native scalar CLS)
- arm_cls64: Uses ISD::CTLS with TRUNCATE, relying on ExpandIntRes_CTLS
- arm_neon_vcls: Lowered to ISD::CTLS, pattern-matched to VCLS instruction
- arm_mve_vcls: Lowered to ISD::CTLS, pattern-matched to MVE VCLS instruction
Also adds generic CTLS expansion support:
- ExpandIntRes_CTLS in LegalizeIntegerTypes for i64->i32 type expansion
- expandCTLS in TargetLowering for targets without native CLS instruction
Part of: https://github.com/llvm/llvm-project/issues/174337
---
llvm/include/llvm/CodeGen/TargetLowering.h | 6 ++
llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 4 +
.../SelectionDAG/LegalizeIntegerTypes.cpp | 29 ++++++
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 1 +
.../CodeGen/SelectionDAG/TargetLowering.cpp | 18 ++++
llvm/lib/Target/ARM/ARMISelLowering.cpp | 89 ++++++-------------
llvm/lib/Target/ARM/ARMISelLowering.h | 2 -
llvm/lib/Target/ARM/ARMInstrMVE.td | 6 +-
llvm/lib/Target/ARM/ARMInstrNEON.td | 2 +-
llvm/test/CodeGen/ARM/cls.ll | 6 +-
llvm/test/CodeGen/ARM/mve-cls.ll | 48 ++++++++++
llvm/test/CodeGen/ARM/neon-cls.ll | 84 +++++++++++++++++
12 files changed, 224 insertions(+), 71 deletions(-)
create mode 100644 llvm/test/CodeGen/ARM/mve-cls.ll
create mode 100644 llvm/test/CodeGen/ARM/neon-cls.ll
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index df0fced112f73..f4010848aa8c4 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5583,6 +5583,12 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase {
/// \returns The expansion result or SDValue() if it fails.
SDValue expandCTLZ(SDNode *N, SelectionDAG &DAG) const;
+ /// Expand CTLS (count leading sign bits) nodes.
+ /// CTLS(x) = CTLZ(OR(SHL(XOR(x, SRA(x, BW-1)), 1), 1))
+ /// \param N Node to expand
+ /// \returns The expansion result or SDValue() if it fails.
+ SDValue expandCTLS(SDNode *N, SelectionDAG &DAG) const;
+
/// Expand VP_CTLZ/VP_CTLZ_ZERO_UNDEF nodes.
/// \param N Node to expand
/// \returns The expansion result or SDValue() if it fails.
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index d9a2409b35e4c..df3ee52b42638 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -3218,6 +3218,10 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
if ((Tmp1 = TLI.expandCTLZ(Node, DAG)))
Results.push_back(Tmp1);
break;
+ case ISD::CTLS:
+ if ((Tmp1 = TLI.expandCTLS(Node, DAG)))
+ Results.push_back(Tmp1);
+ break;
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF:
if ((Tmp1 = TLI.expandCTTZ(Node, DAG)))
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 8ce41df6be69b..586cb6878869d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -3066,6 +3066,9 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::ABDU: ExpandIntRes_ABD(N, Lo, Hi); break;
case ISD::CTLZ_ZERO_UNDEF:
case ISD::CTLZ: ExpandIntRes_CTLZ(N, Lo, Hi); break;
+ case ISD::CTLS:
+ ExpandIntRes_CTLS(N, Lo, Hi);
+ break;
case ISD::CTPOP: ExpandIntRes_CTPOP(N, Lo, Hi); break;
case ISD::CTTZ_ZERO_UNDEF:
case ISD::CTTZ: ExpandIntRes_CTTZ(N, Lo, Hi); break;
@@ -4150,6 +4153,32 @@ void DAGTypeLegalizer::ExpandIntRes_CTLZ(SDNode *N,
Hi = DAG.getConstant(0, dl, NVT);
}
+void DAGTypeLegalizer::ExpandIntRes_CTLS(SDNode *N, SDValue &Lo, SDValue &Hi) {
+ SDLoc dl(N);
+ // ctls(HiLo) -> if (IsAllSignBits = (ctls(Hi) == BW-1)) then
+ // BW-1 + clz(IsNegative = (Hi < 0) ? ~Lo : Lo)
+ // else ctls(Hi)
+ GetExpandedInteger(N->getOperand(0), Lo, Hi);
+ EVT NVT = Lo.getValueType();
+ unsigned NVTBits = NVT.getSizeInBits();
+
+ SDValue Constant0 = DAG.getConstant(0, dl, NVT);
+ SDValue ConstantBWM1 = DAG.getConstant(NVTBits - 1, dl, NVT);
+
+ SDValue HiCTLS = DAG.getNode(ISD::CTLS, dl, NVT, Hi);
+ SDValue IsAllSignBits = DAG.getSetCC(dl, getSetCCResultType(NVT), HiCTLS,
+ ConstantBWM1, ISD::SETEQ);
+ SDValue IsNegative =
+ DAG.getSetCC(dl, getSetCCResultType(NVT), Hi, Constant0, ISD::SETLT);
+ SDValue AdjustedLo =
+ DAG.getSelect(dl, NVT, IsNegative, DAG.getNOT(dl, Lo, NVT), Lo);
+ SDValue LoCLZ = DAG.getNode(ISD::CTLZ, dl, NVT, AdjustedLo);
+ Lo = DAG.getSelect(dl, NVT, IsAllSignBits,
+ DAG.getNode(ISD::ADD, dl, NVT, LoCLZ, ConstantBWM1),
+ HiCTLS);
+ Hi = DAG.getConstant(0, dl, NVT);
+}
+
void DAGTypeLegalizer::ExpandIntRes_ABD(SDNode *N, SDValue &Lo, SDValue &Hi) {
SDValue Result = TLI.expandABD(N, DAG);
SplitInteger(Result, Lo, Hi);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index a39e419e5ad1c..b32b53ef9efe0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -466,6 +466,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
void ExpandIntRes_ABS (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_ABD (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_CTLZ (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_CTLS(SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_CTPOP (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_CTTZ (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_LOAD (LoadSDNode *N, SDValue &Lo, SDValue &Hi);
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index edba0a7169c0a..c954ec2405ce6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -9563,6 +9563,24 @@ SDValue TargetLowering::expandCTLZ(SDNode *Node, SelectionDAG &DAG) const {
return DAG.getNode(ISD::CTPOP, dl, VT, Op);
}
+SDValue TargetLowering::expandCTLS(SDNode *Node, SelectionDAG &DAG) const {
+ SDLoc dl(Node);
+ EVT VT = Node->getValueType(0);
+ EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
+ SDValue Op = Node->getOperand(0);
+ unsigned NumBitsPerElt = VT.getScalarSizeInBits();
+
+ // CTLS(x) = CTLZ(OR(SHL(XOR(x, SRA(x, BW-1)), 1), 1))
+ // This transforms the sign bits into leading zeros that can be counted.
+ SDValue ShiftAmt = DAG.getConstant(NumBitsPerElt - 1, dl, ShVT);
+ SDValue One = DAG.getConstant(1, dl, VT);
+ SDValue SignBit = DAG.getNode(ISD::SRA, dl, VT, Op, ShiftAmt);
+ SDValue Xor = DAG.getNode(ISD::XOR, dl, VT, Op, SignBit);
+ SDValue Shl = DAG.getNode(ISD::SHL, dl, VT, Xor, One);
+ SDValue Or = DAG.getNode(ISD::OR, dl, VT, Shl, One);
+ return DAG.getNode(ISD::CTLZ, dl, VT, Or);
+}
+
SDValue TargetLowering::expandVPCTLZ(SDNode *Node, SelectionDAG &DAG) const {
SDLoc dl(Node);
EVT VT = Node->getValueType(0);
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 794a8973c25b7..7cf02c57ebde8 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -222,7 +222,7 @@ void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
for (auto Opcode : {ISD::ABS, ISD::ABDS, ISD::ABDU, ISD::SMIN, ISD::SMAX,
- ISD::UMIN, ISD::UMAX})
+ ISD::UMIN, ISD::UMAX, ISD::CTLS})
setOperationAction(Opcode, VT, Legal);
if (!VT.isFloatingPoint())
for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
@@ -276,6 +276,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
setOperationAction(ISD::UMIN, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
setOperationAction(ISD::ABS, VT, Legal);
+ setOperationAction(ISD::CTLS, VT, Legal);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::MLOAD, VT, Custom);
setOperationAction(ISD::MSTORE, VT, Legal);
@@ -1004,9 +1005,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
setOperationAction(ISD::ROTR, VT, Expand);
}
setOperationAction(ISD::CTTZ, MVT::i32, Custom);
- // CTLS (Count Leading Sign bits)
- setOperationAction(ISD::CTLS, MVT::i32, Custom);
- setOperationAction(ISD::CTLS, MVT::i64, Custom);
+ // Note: arm_cls and arm_cls64 intrinsics are expanded directly in
+ // LowerINTRINSIC_WO_CHAIN since there's no native scalar CLS instruction.
+ // Vector CTLS is Legal when NEON/MVE is available (set elsewhere).
// TODO: These two should be set to LibCall, but this currently breaks
// the Linux kernel build. See #101786.
setOperationAction(ISD::CTPOP, MVT::i32, Expand);
@@ -3839,11 +3840,30 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
}
case Intrinsic::arm_cls: {
+ // ARM32 scalar CLS: CTLS(x) = CTLZ(OR(SHL(XOR(x, SRA(x, 31)), 1), 1))
+ // We expand directly here instead of using ISD::CTLS since there's no
+ // native scalar CLS instruction on ARM.
const SDValue &Operand = Op.getOperand(1);
const EVT VTy = Op.getValueType();
- return DAG.getNode(ISD::CTLS, dl, VTy, Operand);
+ SDValue SRA =
+ DAG.getNode(ISD::SRA, dl, VTy, Operand, DAG.getConstant(31, dl, VTy));
+ SDValue XOR = DAG.getNode(ISD::XOR, dl, VTy, SRA, Operand);
+ SDValue SHL =
+ DAG.getNode(ISD::SHL, dl, VTy, XOR, DAG.getConstant(1, dl, VTy));
+ SDValue OR =
+ DAG.getNode(ISD::OR, dl, VTy, SHL, DAG.getConstant(1, dl, VTy));
+ return DAG.getNode(ISD::CTLZ, dl, VTy, OR);
}
case Intrinsic::arm_cls64: {
+ // arm_cls64 returns i32 but takes i64 input.
+ // Use ISD::CTLS for i64 and truncate the result.
+ const SDValue &Operand = Op.getOperand(1);
+ SDValue CTLS64 = DAG.getNode(ISD::CTLS, dl, MVT::i64, Operand);
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, CTLS64);
+ }
+ case Intrinsic::arm_neon_vcls:
+ case Intrinsic::arm_mve_vcls: {
+ // Lower vector CLS intrinsics to ISD::CTLS
const SDValue &Operand = Op.getOperand(1);
const EVT VTy = Op.getValueType();
return DAG.getNode(ISD::CTLS, dl, VTy, Operand);
@@ -6282,61 +6302,6 @@ static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
return Res;
}
-SDValue ARMTargetLowering::LowerCTLS(SDNode *N, SelectionDAG &DAG,
- const ARMSubtarget *ST) const {
- SDLoc dl(N);
- EVT VT = N->getValueType(0);
- SDValue Operand = N->getOperand(0);
-
- if (VT == MVT::i32) {
- // ARM32 scalar CLS: CTLS(x) = CTLZ(OR(SHL(XOR(x, SRA(x, 31)), 1), 1))
- SDValue SRA =
- DAG.getNode(ISD::SRA, dl, VT, Operand, DAG.getConstant(31, dl, VT));
- SDValue XOR = DAG.getNode(ISD::XOR, dl, VT, SRA, Operand);
- SDValue SHL =
- DAG.getNode(ISD::SHL, dl, VT, XOR, DAG.getConstant(1, dl, VT));
- SDValue OR = DAG.getNode(ISD::OR, dl, VT, SHL, DAG.getConstant(1, dl, VT));
- SDValue Result = DAG.getNode(ISD::CTLZ, dl, VT, OR);
- return Result;
- }
-
- if (VT == MVT::i64) {
- // For 64-bit on 32-bit ARM, we need to split into two 32-bit operations
- EVT VT32 = MVT::i32;
- SDValue Lo, Hi;
- std::tie(Lo, Hi) = DAG.SplitScalar(Operand, dl, VT32, VT32);
-
- SDValue Constant0 = DAG.getConstant(0, dl, VT32);
- SDValue Constant1 = DAG.getConstant(1, dl, VT32);
- SDValue Constant31 = DAG.getConstant(31, dl, VT32);
-
- // Compute CTLS of high part
- SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VT32, Hi, Constant31);
- SDValue XORHi = DAG.getNode(ISD::XOR, dl, VT32, SRAHi, Hi);
- SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VT32, XORHi, Constant1);
- SDValue ORHi = DAG.getNode(ISD::OR, dl, VT32, SHLHi, Constant1);
- SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VT32, ORHi);
-
- // Check if CLSHi == 31 (all high bits are sign bits)
- SDValue IsAllSignBits =
- DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::SETEQ);
-
- // If all high bits are sign bits, compute for low part
- SDValue HiIsZero = DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::SETEQ);
- SDValue AdjustedLo =
- DAG.getSelect(dl, VT32, HiIsZero, Lo, DAG.getNOT(dl, Lo, VT32));
- SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VT32, AdjustedLo);
- SDValue Result = DAG.getSelect(
- dl, VT32, IsAllSignBits,
- DAG.getNode(ISD::ADD, dl, VT32, CLZAdjustedLo, Constant31), CLSHi);
-
- return Result;
- }
-
- // Vector types should be handled elsewhere
- return SDValue();
-}
-
/// getVShiftImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift operation, where all the elements of the
/// build_vector must have the same constant integer value.
@@ -10379,8 +10344,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SRL_PARTS:
case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
case ISD::CTTZ:
- case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
- case ISD::CTLS: return LowerCTLS(Op.getNode(), DAG, Subtarget);
+ case ISD::CTTZ_ZERO_UNDEF:
+ return LowerCTTZ(Op.getNode(), DAG, Subtarget);
case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget);
case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index b6dc9851b1d56..bc84654f8bd5a 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -608,8 +608,6 @@ class VectorType;
SDValue LowerFP_TO_BF16(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCMP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerABS(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerCTLS(SDNode *N, SelectionDAG &DAG,
- const ARMSubtarget *ST) const;
Register getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const override;
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 1e9c141f13f83..85559c58ad825 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -2494,9 +2494,9 @@ multiclass MVE_VCLSCLZ_p<string opname, bit opcode, MVEVectorVTInfo VTI,
}
}
-defm MVE_VCLSs8 : MVE_VCLSCLZ_p<"cls", 0, MVE_v16s8, int_arm_mve_vcls>;
-defm MVE_VCLSs16 : MVE_VCLSCLZ_p<"cls", 0, MVE_v8s16, int_arm_mve_vcls>;
-defm MVE_VCLSs32 : MVE_VCLSCLZ_p<"cls", 0, MVE_v4s32, int_arm_mve_vcls>;
+defm MVE_VCLSs8 : MVE_VCLSCLZ_p<"cls", 0, MVE_v16s8, ctls>;
+defm MVE_VCLSs16 : MVE_VCLSCLZ_p<"cls", 0, MVE_v8s16, ctls>;
+defm MVE_VCLSs32 : MVE_VCLSCLZ_p<"cls", 0, MVE_v4s32, ctls>;
defm MVE_VCLZs8 : MVE_VCLSCLZ_p<"clz", 1, MVE_v16i8, ctlz>;
defm MVE_VCLZs16 : MVE_VCLSCLZ_p<"clz", 1, MVE_v8i16, ctlz>;
diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td
index dc62a09f942e2..69df2bf0efce2 100644
--- a/llvm/lib/Target/ARM/ARMInstrNEON.td
+++ b/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -6220,7 +6220,7 @@ defm VQNEG : N2VInt_QHS<0b11, 0b11, 0b00, 0b01111, 0,
// VCLS : Vector Count Leading Sign Bits
defm VCLS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01000, 0,
IIC_VCNTiD, IIC_VCNTiQ, "vcls", "s",
- int_arm_neon_vcls>;
+ ctls>;
// VCLZ : Vector Count Leading Zeros
defm VCLZ : N2VInt_QHS<0b11, 0b11, 0b00, 0b01001, 0,
IIC_VCNTiD, IIC_VCNTiQ, "vclz", "i",
diff --git a/llvm/test/CodeGen/ARM/cls.ll b/llvm/test/CodeGen/ARM/cls.ll
index cccb38d0766f3..ab0244a2aeb04 100644
--- a/llvm/test/CodeGen/ARM/cls.ll
+++ b/llvm/test/CodeGen/ARM/cls.ll
@@ -10,11 +10,11 @@ define i32 @cls(i32 %t) {
}
; CHECK: cmp r1, #0
-; CHECK: mvnne [[ADJUSTEDLO:r[0-9]+]], r0
+; CHECK: mvnmi [[ADJUSTEDLO:r[0-9]+]], r0
; CHECK: clz [[CLZLO:r[0-9]+]], [[ADJUSTEDLO]]
; CHECK: eor [[A:r[0-9]+]], r1, r1, asr #31
-; CHECK: mov r1, #1
-; CHECK: orr [[A]], r1, [[A]], lsl #1
+; CHECK: mov [[TMP:r[0-9]+]], #1
+; CHECK: orr [[A]], [[TMP]], [[A]], lsl #1
; CHECK: clz [[CLSHI:r[0-9]+]], [[A]]
; CHECK: cmp [[CLSHI]], #31
; CHECK: addeq r0, [[CLZLO]], #31
diff --git a/llvm/test/CodeGen/ARM/mve-cls.ll b/llvm/test/CodeGen/ARM/mve-cls.ll
new file mode 100644
index 0000000000000..f4cb5bfa9d7f6
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/mve-cls.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=arm-none-eabi -mcpu=cortex-m85 -mattr=+mve %s -o - | FileCheck %s
+
+; Test MVE vector CLS (Count Leading Sign bits) operations
+; The intrinsics are lowered to ISD::CTLS and selected to VCLS instructions
+
+define <16 x i8> @test_cls_v16i8(<16 x i8> %a) nounwind {
+; CHECK-LABEL: test_cls_v16i8:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d1, r2, r3
+; CHECK-NEXT: vmov d0, r0, r1
+; CHECK-NEXT: vcls.s8 q0, q0
+; CHECK-NEXT: vmov r0, r1, d0
+; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: bx lr
+ %result = call <16 x i8> @llvm.arm.mve.vcls.v16i8(<16 x i8> %a)
+ ret <16 x i8> %result
+}
+
+define <8 x i16> @test_cls_v8i16(<8 x i16> %a) nounwind {
+; CHECK-LABEL: test_cls_v8i16:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d1, r2, r3
+; CHECK-NEXT: vmov d0, r0, r1
+; CHECK-NEXT: vcls.s16 q0, q0
+; CHECK-NEXT: vmov r0, r1, d0
+; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: bx lr
+ %result = call <8 x i16> @llvm.arm.mve.vcls.v8i16(<8 x i16> %a)
+ ret <8 x i16> %result
+}
+
+define <4 x i32> @test_cls_v4i32(<4 x i32> %a) nounwind {
+; CHECK-LABEL: test_cls_v4i32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d1, r2, r3
+; CHECK-NEXT: vmov d0, r0, r1
+; CHECK-NEXT: vcls.s32 q0, q0
+; CHECK-NEXT: vmov r0, r1, d0
+; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: bx lr
+ %result = call <4 x i32> @llvm.arm.mve.vcls.v4i32(<4 x i32> %a)
+ ret <4 x i32> %result
+}
+
+declare <16 x i8> @llvm.arm.mve.vcls.v16i8(<16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.mve.vcls.v8i16(<8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.mve.vcls.v4i32(<4 x i32>) nounwind readnone
diff --git a/llvm/test/CodeGen/ARM/neon-cls.ll b/llvm/test/CodeGen/ARM/neon-cls.ll
new file mode 100644
index 0000000000000..8113274440d86
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/neon-cls.ll
@@ -0,0 +1,84 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
+
+; Test ARM NEON vector CLS (Count Leading Sign bits) operations
+; The intrinsics are lowered to ISD::CTLS and selected to VCLS instructions
+
+define <8 x i8> @test_cls_v8i8(<8 x i8> %a) nounwind {
+; CHECK-LABEL: test_cls_v8i8:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d16, r0, r1
+; CHECK-NEXT: vcls.s8 d16, d16
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: mov pc, lr
+ %result = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %a)
+ ret <8 x i8> %result
+}
+
+define <16 x i8> @test_cls_v16i8(<16 x i8> %a) nounwind {
+; CHECK-LABEL: test_cls_v16i8:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d17, r2, r3
+; CHECK-NEXT: vmov d16, r0, r1
+; CHECK-NEXT: vcls.s8 q8, q8
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: mov pc, lr
+ %result = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %a)
+ ret <16 x i8> %result
+}
+
+define <4 x i16> @test_cls_v4i16(<4 x i16> %a) nounwind {
+; CHECK-LABEL: test_cls_v4i16:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d16, r0, r1
+; CHECK-NEXT: vcls.s16 d16, d16
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: mov pc, lr
+ %result = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %a)
+ ret <4 x i16> %result
+}
+
+define <8 x i16> @test_cls_v8i16(<8 x i16> %a) nounwind {
+; CHECK-LABEL: test_cls_v8i16:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d17, r2, r3
+; CHECK-NEXT: vmov d16, r0, r1
+; CHECK-NEXT: vcls.s16 q8, q8
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: mov pc, lr
+ %result = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %a)
+ ret <8 x i16> %result
+}
+
+define <2 x i32> @test_cls_v2i32(<2 x i32> %a) nounwind {
+; CHECK-LABEL: test_cls_v2i32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d16, r0, r1
+; CHECK-NEXT: vcls.s32 d16, d16
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: mov pc, lr
+ %result = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %a)
+ ret <2 x i32> %result
+}
+
+define <4 x i32> @test_cls_v4i32(<4 x i32> %a) nounwind {
+; CHECK-LABEL: test_cls_v4i32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d17, r2, r3
+; CHECK-NEXT: vmov d16, r0, r1
+; CHECK-NEXT: vcls.s32 q8, q8
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: mov pc, lr
+ %result = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %a)
+ ret <4 x i32> %result
+}
+
+declare <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32>) nounwind readnone
More information about the llvm-commits
mailing list