[llvm] [ARM][AArch64] Replace manual CLS expansion with ISD::CTLS (PR #178430)

Hamza Hassanain via llvm-commits llvm-commits at lists.llvm.org
Fri Jan 30 05:14:32 PST 2026


https://github.com/HamzaHassanain updated https://github.com/llvm/llvm-project/pull/178430

>From 1d7668737a5eaffcd3fcf1ee76a25f33887667c6 Mon Sep 17 00:00:00 2001
From: Hamza Hassanain <hamzahassanain067 at gmail.com>
Date: Wed, 28 Jan 2026 15:55:45 +0200
Subject: [PATCH 1/5] [ARM] Replace manual CLS expansion with ISD::CTLS

Converts ARM scalar CLS intrinsics to use the unified ISD::CTLS node
instead of custom manual expansion. This addresses issue #174337.

Changes:
- int_arm_cls and int_arm_cls64 now return ISD::CTLS nodes
- Added LowerCTLS to handle custom lowering for i32 and i64 CTLS operations
- i32 CTLS expansion generates the same CTLZ-based pattern as before
- i64 CTLS on 32-bit ARM splits into two 32-bit operations
- Added proper setOperationAction for CTLS operations

The assembly output remains identical to the previous manual expansion,
ensuring compatibility with existing code. This unifies the CLS handling
under the generic ISD::CTLS infrastructure for better maintainability.
---
 llvm/lib/Target/ARM/ARMISelLowering.cpp | 95 ++++++++++++++++---------
 llvm/lib/Target/ARM/ARMISelLowering.h   |  2 +
 llvm/test/CodeGen/ARM/mve-cls.ll        | 45 ++++++++++++
 llvm/test/CodeGen/ARM/neon-cls.ll       | 81 +++++++++++++++++++++
 4 files changed, 191 insertions(+), 32 deletions(-)
 create mode 100644 llvm/test/CodeGen/ARM/mve-cls.ll
 create mode 100644 llvm/test/CodeGen/ARM/neon-cls.ll

diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 7b240462c66fb..aee3c50b4a476 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1004,6 +1004,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
     setOperationAction(ISD::ROTR, VT, Expand);
   }
   setOperationAction(ISD::CTTZ,  MVT::i32, Custom);
+  // CTLS (Count Leading Sign bits)
+  setOperationAction(ISD::CTLS, MVT::i32, Custom);
+  setOperationAction(ISD::CTLS, MVT::i64, Custom);
   // TODO: These two should be set to LibCall, but this currently breaks
   //   the Linux kernel build. See #101786.
   setOperationAction(ISD::CTPOP, MVT::i32, Expand);
@@ -3838,42 +3841,12 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
   case Intrinsic::arm_cls: {
     const SDValue &Operand = Op.getOperand(1);
     const EVT VTy = Op.getValueType();
-    SDValue SRA =
-        DAG.getNode(ISD::SRA, dl, VTy, Operand, DAG.getConstant(31, dl, VTy));
-    SDValue XOR = DAG.getNode(ISD::XOR, dl, VTy, SRA, Operand);
-    SDValue SHL =
-        DAG.getNode(ISD::SHL, dl, VTy, XOR, DAG.getConstant(1, dl, VTy));
-    SDValue OR =
-        DAG.getNode(ISD::OR, dl, VTy, SHL, DAG.getConstant(1, dl, VTy));
-    SDValue Result = DAG.getNode(ISD::CTLZ, dl, VTy, OR);
-    return Result;
+    return DAG.getNode(ISD::CTLS, dl, VTy, Operand);
   }
   case Intrinsic::arm_cls64: {
-    // cls(x) = if cls(hi(x)) != 31 then cls(hi(x))
-    //          else 31 + clz(if hi(x) == 0 then lo(x) else not(lo(x)))
     const SDValue &Operand = Op.getOperand(1);
     const EVT VTy = Op.getValueType();
-    SDValue Lo, Hi;
-    std::tie(Lo, Hi) = DAG.SplitScalar(Operand, dl, VTy, VTy);
-    SDValue Constant0 = DAG.getConstant(0, dl, VTy);
-    SDValue Constant1 = DAG.getConstant(1, dl, VTy);
-    SDValue Constant31 = DAG.getConstant(31, dl, VTy);
-    SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VTy, Hi, Constant31);
-    SDValue XORHi = DAG.getNode(ISD::XOR, dl, VTy, SRAHi, Hi);
-    SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VTy, XORHi, Constant1);
-    SDValue ORHi = DAG.getNode(ISD::OR, dl, VTy, SHLHi, Constant1);
-    SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VTy, ORHi);
-    SDValue CheckLo =
-        DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::CondCode::SETEQ);
-    SDValue HiIsZero =
-        DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::CondCode::SETEQ);
-    SDValue AdjustedLo =
-        DAG.getSelect(dl, VTy, HiIsZero, Lo, DAG.getNOT(dl, Lo, VTy));
-    SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VTy, AdjustedLo);
-    SDValue Result =
-        DAG.getSelect(dl, VTy, CheckLo,
-                      DAG.getNode(ISD::ADD, dl, VTy, CLZAdjustedLo, Constant31), CLSHi);
-    return Result;
+    return DAG.getNode(ISD::CTLS, dl, VTy, Operand);
   }
   case Intrinsic::eh_sjlj_lsda: {
     MachineFunction &MF = DAG.getMachineFunction();
@@ -6309,6 +6282,63 @@ static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
   return Res;
 }
 
+static SDValue LowerCTLS(SDNode *N, SelectionDAG &DAG,
+                         const ARMSubtarget *ST) {
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+  SDValue Operand = N->getOperand(0);
+
+  if (VT == MVT::i32) {
+    // ARM32 scalar CLS: CTLS(x) = CTLZ(OR(SHL(XOR(x, SRA(x, 31)), 1), 1))
+    SDValue SRA =
+        DAG.getNode(ISD::SRA, dl, VT, Operand, DAG.getConstant(31, dl, VT));
+    SDValue XOR = DAG.getNode(ISD::XOR, dl, VT, SRA, Operand);
+    SDValue SHL =
+        DAG.getNode(ISD::SHL, dl, VT, XOR, DAG.getConstant(1, dl, VT));
+    SDValue OR =
+        DAG.getNode(ISD::OR, dl, VT, SHL, DAG.getConstant(1, dl, VT));
+    SDValue Result = DAG.getNode(ISD::CTLZ, dl, VT, OR);
+    return Result;
+  }
+
+  if (VT == MVT::i64) {
+    // For 64-bit on 32-bit ARM, we need to split into two 32-bit operations
+    EVT VT32 = MVT::i32;
+    SDValue Lo, Hi;
+    std::tie(Lo, Hi) = DAG.SplitScalar(Operand, dl, VT32, VT32);
+
+    SDValue Constant0 = DAG.getConstant(0, dl, VT32);
+    SDValue Constant1 = DAG.getConstant(1, dl, VT32);
+    SDValue Constant31 = DAG.getConstant(31, dl, VT32);
+
+    // Compute CTLS of high part
+    SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VT32, Hi, Constant31);
+    SDValue XORHi = DAG.getNode(ISD::XOR, dl, VT32, SRAHi, Hi);
+    SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VT32, XORHi, Constant1);
+    SDValue ORHi = DAG.getNode(ISD::OR, dl, VT32, SHLHi, Constant1);
+    SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VT32, ORHi);
+
+    // Check if CLSHi == 31 (all high bits are sign bits)
+    SDValue IsAllSignBits =
+        DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::SETEQ);
+
+    // If all high bits are sign bits, compute for low part
+    SDValue HiIsZero =
+        DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::SETEQ);
+    SDValue AdjustedLo =
+        DAG.getSelect(dl, VT32, HiIsZero, Lo, DAG.getNOT(dl, Lo, VT32));
+    SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VT32, AdjustedLo);
+    SDValue Result =
+        DAG.getSelect(dl, VT32, IsAllSignBits,
+                      DAG.getNode(ISD::ADD, dl, VT32, CLZAdjustedLo, Constant31), CLSHi);
+
+    return Result;
+  }
+
+  // Vector types should be handled elsewhere
+  return SDValue();
+}
+
 /// Getvshiftimm - Check if this is a valid build_vector for the immediate
 /// operand of a vector shift operation, where all the elements of the
 /// build_vector must have the same constant integer value.
@@ -10352,6 +10382,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::SRA_PARTS:     return LowerShiftRightParts(Op, DAG);
   case ISD::CTTZ:
   case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
+  case ISD::CTLS:           return LowerCTLS(Op.getNode(), DAG, Subtarget);
   case ISD::CTPOP:         return LowerCTPOP(Op.getNode(), DAG, Subtarget);
   case ISD::SETCC:         return LowerVSETCC(Op, DAG, Subtarget);
   case ISD::SETCCCARRY:    return LowerSETCCCARRY(Op, DAG);
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index bc84654f8bd5a..b6dc9851b1d56 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -608,6 +608,8 @@ class VectorType;
     SDValue LowerFP_TO_BF16(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerCMP(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerABS(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerCTLS(SDNode *N, SelectionDAG &DAG,
+                      const ARMSubtarget *ST) const;
 
     Register getRegisterByName(const char* RegName, LLT VT,
                                const MachineFunction &MF) const override;
diff --git a/llvm/test/CodeGen/ARM/mve-cls.ll b/llvm/test/CodeGen/ARM/mve-cls.ll
new file mode 100644
index 0000000000000..cbf708f637992
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/mve-cls.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=arm-none-eabi -mcpu=cortex-m85 -mattr=+mve %s -o - | FileCheck %s
+
+define <16 x i8> @test_cls_v16s8(<16 x i8> %a) nounwind {
+; CHECK-LABEL: test_cls_v16s8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov    d1, r2, r3
+; CHECK-NEXT:    vmov    d0, r0, r1
+; CHECK-NEXT:    vcls.s8 q0, q0
+; CHECK-NEXT:    vmov    r0, r1, d0
+; CHECK-NEXT:    vmov    r2, r3, d1
+; CHECK-NEXT:    bx      lr
+  %result = call <16 x i8> @llvm.arm.mve.vcls.v16s8(<16 x i8> %a)
+  ret <16 x i8> %result
+}
+
+define <8 x i16> @test_cls_v8s16(<8 x i16> %a) nounwind {
+; CHECK-LABEL: test_cls_v8s16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov    d1, r2, r3
+; CHECK-NEXT:    vmov    d0, r0, r1
+; CHECK-NEXT:    vcls.s16        q0, q0
+; CHECK-NEXT:    vmov    r0, r1, d0
+; CHECK-NEXT:    vmov    r2, r3, d1
+; CHECK-NEXT:    bx      lr
+  %result = call <8 x i16> @llvm.arm.mve.vcls.v8s16(<8 x i16> %a)
+  ret <8 x i16> %result
+}
+
+define <4 x i32> @test_cls_v4s32(<4 x i32> %a) nounwind {
+; CHECK-LABEL: test_cls_v4s32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov    d1, r2, r3
+; CHECK-NEXT:    vmov    d0, r0, r1
+; CHECK-NEXT:    vcls.s32        q0, q0
+; CHECK-NEXT:    vmov    r0, r1, d0
+; CHECK-NEXT:    vmov    r2, r3, d1
+; CHECK-NEXT:    bx      lr
+  %result = call <4 x i32> @llvm.arm.mve.vcls.v4s32(<4 x i32> %a)
+  ret <4 x i32> %result
+}
+
+declare <16 x i8> @llvm.arm.mve.vcls.v16s8(<16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.mve.vcls.v8s16(<8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.mve.vcls.v4s32(<4 x i32>) nounwind readnone
diff --git a/llvm/test/CodeGen/ARM/neon-cls.ll b/llvm/test/CodeGen/ARM/neon-cls.ll
new file mode 100644
index 0000000000000..4667b87a6f7bd
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/neon-cls.ll
@@ -0,0 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
+
+define <8 x i8> @test_cls_v8i8(<8 x i8> %a) nounwind {
+; CHECK-LABEL: test_cls_v8i8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vcls.s8 d16, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
+  %result = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %a)
+  ret <8 x i8> %result
+}
+
+define <16 x i8> @test_cls_v16i8(<16 x i8> %a) nounwind {
+; CHECK-LABEL: test_cls_v16i8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vcls.s8 q8, q8
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
+  %result = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %a)
+  ret <16 x i8> %result
+}
+
+define <4 x i16> @test_cls_v4i16(<4 x i16> %a) nounwind {
+; CHECK-LABEL: test_cls_v4i16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vcls.s16 d16, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
+  %result = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %a)
+  ret <4 x i16> %result
+}
+
+define <8 x i16> @test_cls_v8i16(<8 x i16> %a) nounwind {
+; CHECK-LABEL: test_cls_v8i16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vcls.s16 q8, q8
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
+  %result = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %a)
+  ret <8 x i16> %result
+}
+
+define <2 x i32> @test_cls_v2i32(<2 x i32> %a) nounwind {
+; CHECK-LABEL: test_cls_v2i32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vcls.s32 d16, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
+  %result = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %a)
+  ret <2 x i32> %result
+}
+
+define <4 x i32> @test_cls_v4i32(<4 x i32> %a) nounwind {
+; CHECK-LABEL: test_cls_v4i32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vcls.s32 q8, q8
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
+  %result = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %a)
+  ret <4 x i32> %result
+}
+
+declare <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32>) nounwind readnone

>From c5b4e46db56cd62536475362b569b59bd8fa92b5 Mon Sep 17 00:00:00 2001
From: Hamza Hassanain <hamzahassanain067 at gmail.com>
Date: Wed, 28 Jan 2026 16:24:46 +0200
Subject: [PATCH 2/5] style: fix format errors

---
 llvm/lib/Target/ARM/ARMISelLowering.cpp | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index aee3c50b4a476..c8356b386a68d 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -6282,8 +6282,7 @@ static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
   return Res;
 }
 
-static SDValue LowerCTLS(SDNode *N, SelectionDAG &DAG,
-                         const ARMSubtarget *ST) {
+static SDValue LowerCTLS(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) {
   SDLoc dl(N);
   EVT VT = N->getValueType(0);
   SDValue Operand = N->getOperand(0);
@@ -6295,8 +6294,7 @@ static SDValue LowerCTLS(SDNode *N, SelectionDAG &DAG,
     SDValue XOR = DAG.getNode(ISD::XOR, dl, VT, SRA, Operand);
     SDValue SHL =
         DAG.getNode(ISD::SHL, dl, VT, XOR, DAG.getConstant(1, dl, VT));
-    SDValue OR =
-        DAG.getNode(ISD::OR, dl, VT, SHL, DAG.getConstant(1, dl, VT));
+    SDValue OR = DAG.getNode(ISD::OR, dl, VT, SHL, DAG.getConstant(1, dl, VT));
     SDValue Result = DAG.getNode(ISD::CTLZ, dl, VT, OR);
     return Result;
   }
@@ -6323,14 +6321,13 @@ static SDValue LowerCTLS(SDNode *N, SelectionDAG &DAG,
         DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::SETEQ);
 
     // If all high bits are sign bits, compute for low part
-    SDValue HiIsZero =
-        DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::SETEQ);
+    SDValue HiIsZero = DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::SETEQ);
     SDValue AdjustedLo =
         DAG.getSelect(dl, VT32, HiIsZero, Lo, DAG.getNOT(dl, Lo, VT32));
     SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VT32, AdjustedLo);
-    SDValue Result =
-        DAG.getSelect(dl, VT32, IsAllSignBits,
-                      DAG.getNode(ISD::ADD, dl, VT32, CLZAdjustedLo, Constant31), CLSHi);
+    SDValue Result = DAG.getSelect(
+        dl, VT32, IsAllSignBits,
+        DAG.getNode(ISD::ADD, dl, VT32, CLZAdjustedLo, Constant31), CLSHi);
 
     return Result;
   }
@@ -10382,7 +10379,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::SRA_PARTS:     return LowerShiftRightParts(Op, DAG);
   case ISD::CTTZ:
   case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
-  case ISD::CTLS:           return LowerCTLS(Op.getNode(), DAG, Subtarget);
+  case ISD::CTLS:
+    return LowerCTLS(Op.getNode(), DAG, Subtarget);
   case ISD::CTPOP:         return LowerCTPOP(Op.getNode(), DAG, Subtarget);
   case ISD::SETCC:         return LowerVSETCC(Op, DAG, Subtarget);
   case ISD::SETCCCARRY:    return LowerSETCCCARRY(Op, DAG);

>From 3b4d53594195f82814e8aa6b999ef37e9d73912b Mon Sep 17 00:00:00 2001
From: Hamza Hassanain <hamzahassanain067 at gmail.com>
Date: Thu, 29 Jan 2026 14:47:37 +0200
Subject: [PATCH 3/5] Fix: Change LowerCTLS from static to member function

The LowerCTLS function was declared as a member function in the header
but defined as a static file-scope function in the cpp file. Because the
call site resolved to the member declaration, the static function was left
unreferenced, producing an unused-function warning that is treated as an
error under -Werror.

Changed the definition to match the header declaration as a const member
function of ARMTargetLowering class.
---
 llvm/lib/Target/ARM/ARMISelLowering.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index c8356b386a68d..794a8973c25b7 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -6282,7 +6282,8 @@ static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
   return Res;
 }
 
-static SDValue LowerCTLS(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) {
+SDValue ARMTargetLowering::LowerCTLS(SDNode *N, SelectionDAG &DAG,
+                                       const ARMSubtarget *ST) const {
   SDLoc dl(N);
   EVT VT = N->getValueType(0);
   SDValue Operand = N->getOperand(0);
@@ -10379,8 +10380,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::SRA_PARTS:     return LowerShiftRightParts(Op, DAG);
   case ISD::CTTZ:
   case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
-  case ISD::CTLS:
-    return LowerCTLS(Op.getNode(), DAG, Subtarget);
+  case ISD::CTLS:           return LowerCTLS(Op.getNode(), DAG, Subtarget);
   case ISD::CTPOP:         return LowerCTPOP(Op.getNode(), DAG, Subtarget);
   case ISD::SETCC:         return LowerVSETCC(Op, DAG, Subtarget);
   case ISD::SETCCCARRY:    return LowerSETCCCARRY(Op, DAG);

>From 061b4afcad82d7ad5d5749401d612ea95a4ca90a Mon Sep 17 00:00:00 2001
From: Hamza Hassanain <hamzahassanain067 at gmail.com>
Date: Thu, 29 Jan 2026 15:07:04 +0200
Subject: [PATCH 4/5] Remove vector CLS tests - vector implementation is in a
 separate commit

The vector CLS intrinsics conversion tests should only be included after
the vector support has been fully implemented with proper ISD::CTLS Legal
actions for NEON and MVE types. Those changes are in commit e09ae3fa5fa2.
---
 llvm/test/CodeGen/ARM/mve-cls.ll  | 45 -----------------
 llvm/test/CodeGen/ARM/neon-cls.ll | 81 -------------------------------
 2 files changed, 126 deletions(-)
 delete mode 100644 llvm/test/CodeGen/ARM/mve-cls.ll
 delete mode 100644 llvm/test/CodeGen/ARM/neon-cls.ll

diff --git a/llvm/test/CodeGen/ARM/mve-cls.ll b/llvm/test/CodeGen/ARM/mve-cls.ll
deleted file mode 100644
index cbf708f637992..0000000000000
--- a/llvm/test/CodeGen/ARM/mve-cls.ll
+++ /dev/null
@@ -1,45 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=arm-none-eabi -mcpu=cortex-m85 -mattr=+mve %s -o - | FileCheck %s
-
-define <16 x i8> @test_cls_v16s8(<16 x i8> %a) nounwind {
-; CHECK-LABEL: test_cls_v16s8:
-; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vmov    d1, r2, r3
-; CHECK-NEXT:    vmov    d0, r0, r1
-; CHECK-NEXT:    vcls.s8 q0, q0
-; CHECK-NEXT:    vmov    r0, r1, d0
-; CHECK-NEXT:    vmov    r2, r3, d1
-; CHECK-NEXT:    bx      lr
-  %result = call <16 x i8> @llvm.arm.mve.vcls.v16s8(<16 x i8> %a)
-  ret <16 x i8> %result
-}
-
-define <8 x i16> @test_cls_v8s16(<8 x i16> %a) nounwind {
-; CHECK-LABEL: test_cls_v8s16:
-; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vmov    d1, r2, r3
-; CHECK-NEXT:    vmov    d0, r0, r1
-; CHECK-NEXT:    vcls.s16        q0, q0
-; CHECK-NEXT:    vmov    r0, r1, d0
-; CHECK-NEXT:    vmov    r2, r3, d1
-; CHECK-NEXT:    bx      lr
-  %result = call <8 x i16> @llvm.arm.mve.vcls.v8s16(<8 x i16> %a)
-  ret <8 x i16> %result
-}
-
-define <4 x i32> @test_cls_v4s32(<4 x i32> %a) nounwind {
-; CHECK-LABEL: test_cls_v4s32:
-; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vmov    d1, r2, r3
-; CHECK-NEXT:    vmov    d0, r0, r1
-; CHECK-NEXT:    vcls.s32        q0, q0
-; CHECK-NEXT:    vmov    r0, r1, d0
-; CHECK-NEXT:    vmov    r2, r3, d1
-; CHECK-NEXT:    bx      lr
-  %result = call <4 x i32> @llvm.arm.mve.vcls.v4s32(<4 x i32> %a)
-  ret <4 x i32> %result
-}
-
-declare <16 x i8> @llvm.arm.mve.vcls.v16s8(<16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm.mve.vcls.v8s16(<8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm.mve.vcls.v4s32(<4 x i32>) nounwind readnone
diff --git a/llvm/test/CodeGen/ARM/neon-cls.ll b/llvm/test/CodeGen/ARM/neon-cls.ll
deleted file mode 100644
index 4667b87a6f7bd..0000000000000
--- a/llvm/test/CodeGen/ARM/neon-cls.ll
+++ /dev/null
@@ -1,81 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
-
-define <8 x i8> @test_cls_v8i8(<8 x i8> %a) nounwind {
-; CHECK-LABEL: test_cls_v8i8:
-; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vmov d16, r0, r1
-; CHECK-NEXT:    vcls.s8 d16, d16
-; CHECK-NEXT:    vmov r0, r1, d16
-; CHECK-NEXT:    mov pc, lr
-  %result = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %a)
-  ret <8 x i8> %result
-}
-
-define <16 x i8> @test_cls_v16i8(<16 x i8> %a) nounwind {
-; CHECK-LABEL: test_cls_v16i8:
-; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vmov d17, r2, r3
-; CHECK-NEXT:    vmov d16, r0, r1
-; CHECK-NEXT:    vcls.s8 q8, q8
-; CHECK-NEXT:    vmov r0, r1, d16
-; CHECK-NEXT:    vmov r2, r3, d17
-; CHECK-NEXT:    mov pc, lr
-  %result = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %a)
-  ret <16 x i8> %result
-}
-
-define <4 x i16> @test_cls_v4i16(<4 x i16> %a) nounwind {
-; CHECK-LABEL: test_cls_v4i16:
-; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vmov d16, r0, r1
-; CHECK-NEXT:    vcls.s16 d16, d16
-; CHECK-NEXT:    vmov r0, r1, d16
-; CHECK-NEXT:    mov pc, lr
-  %result = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %a)
-  ret <4 x i16> %result
-}
-
-define <8 x i16> @test_cls_v8i16(<8 x i16> %a) nounwind {
-; CHECK-LABEL: test_cls_v8i16:
-; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vmov d17, r2, r3
-; CHECK-NEXT:    vmov d16, r0, r1
-; CHECK-NEXT:    vcls.s16 q8, q8
-; CHECK-NEXT:    vmov r0, r1, d16
-; CHECK-NEXT:    vmov r2, r3, d17
-; CHECK-NEXT:    mov pc, lr
-  %result = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %a)
-  ret <8 x i16> %result
-}
-
-define <2 x i32> @test_cls_v2i32(<2 x i32> %a) nounwind {
-; CHECK-LABEL: test_cls_v2i32:
-; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vmov d16, r0, r1
-; CHECK-NEXT:    vcls.s32 d16, d16
-; CHECK-NEXT:    vmov r0, r1, d16
-; CHECK-NEXT:    mov pc, lr
-  %result = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %a)
-  ret <2 x i32> %result
-}
-
-define <4 x i32> @test_cls_v4i32(<4 x i32> %a) nounwind {
-; CHECK-LABEL: test_cls_v4i32:
-; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vmov d17, r2, r3
-; CHECK-NEXT:    vmov d16, r0, r1
-; CHECK-NEXT:    vcls.s32 q8, q8
-; CHECK-NEXT:    vmov r0, r1, d16
-; CHECK-NEXT:    vmov r2, r3, d17
-; CHECK-NEXT:    mov pc, lr
-  %result = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %a)
-  ret <4 x i32> %result
-}
-
-declare <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16>) nounwind readnone
-declare <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32>) nounwind readnone

>From ecde030395dd116390bc72bda834671a89f1e400 Mon Sep 17 00:00:00 2001
From: Hamza Hassanain <hamzahassanain067 at gmail.com>
Date: Thu, 29 Jan 2026 19:36:45 +0200
Subject: [PATCH 5/5] [ARM] Convert CLS intrinsics to use ISD::CTLS

This patch converts ARM CLS intrinsics (arm_cls, arm_cls64, arm_neon_vcls,
arm_mve_vcls) to use the generic ISD::CTLS node.

- arm_cls: Expanded directly in LowerINTRINSIC_WO_CHAIN (no native scalar CLS)
- arm_cls64: Uses ISD::CTLS with TRUNCATE, relying on ExpandIntRes_CTLS
- arm_neon_vcls: Lowered to ISD::CTLS, pattern-matched to VCLS instruction
- arm_mve_vcls: Lowered to ISD::CTLS, pattern-matched to MVE VCLS instruction

Also adds generic CTLS expansion support:
- ExpandIntRes_CTLS in LegalizeIntegerTypes for i64->i32 type expansion
- expandCTLS in TargetLowering for targets without native CLS instruction

Part of: https://github.com/llvm/llvm-project/issues/174337
---
 llvm/include/llvm/CodeGen/TargetLowering.h    |  6 ++
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp |  4 +
 .../SelectionDAG/LegalizeIntegerTypes.cpp     | 29 ++++++
 llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h |  1 +
 .../CodeGen/SelectionDAG/TargetLowering.cpp   | 18 ++++
 llvm/lib/Target/ARM/ARMISelLowering.cpp       | 89 ++++++-------------
 llvm/lib/Target/ARM/ARMISelLowering.h         |  2 -
 llvm/lib/Target/ARM/ARMInstrMVE.td            |  6 +-
 llvm/lib/Target/ARM/ARMInstrNEON.td           |  2 +-
 llvm/test/CodeGen/ARM/cls.ll                  |  6 +-
 llvm/test/CodeGen/ARM/mve-cls.ll              | 48 ++++++++++
 llvm/test/CodeGen/ARM/neon-cls.ll             | 84 +++++++++++++++++
 12 files changed, 224 insertions(+), 71 deletions(-)
 create mode 100644 llvm/test/CodeGen/ARM/mve-cls.ll
 create mode 100644 llvm/test/CodeGen/ARM/neon-cls.ll

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index df0fced112f73..f4010848aa8c4 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5583,6 +5583,12 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase {
   /// \returns The expansion result or SDValue() if it fails.
   SDValue expandCTLZ(SDNode *N, SelectionDAG &DAG) const;
 
+  /// Expand CTLS (count leading sign bits) nodes.
+  /// CTLS(x) = CTLZ(OR(SHL(XOR(x, SRA(x, BW-1)), 1), 1))
+  /// \param N Node to expand
+  /// \returns The expansion result or SDValue() if it fails.
+  SDValue expandCTLS(SDNode *N, SelectionDAG &DAG) const;
+
   /// Expand VP_CTLZ/VP_CTLZ_ZERO_UNDEF nodes.
   /// \param N Node to expand
   /// \returns The expansion result or SDValue() if it fails.
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index d9a2409b35e4c..df3ee52b42638 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -3218,6 +3218,10 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     if ((Tmp1 = TLI.expandCTLZ(Node, DAG)))
       Results.push_back(Tmp1);
     break;
+  case ISD::CTLS:
+    if ((Tmp1 = TLI.expandCTLS(Node, DAG)))
+      Results.push_back(Tmp1);
+    break;
   case ISD::CTTZ:
   case ISD::CTTZ_ZERO_UNDEF:
     if ((Tmp1 = TLI.expandCTTZ(Node, DAG)))
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 8ce41df6be69b..586cb6878869d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -3066,6 +3066,9 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
   case ISD::ABDU:        ExpandIntRes_ABD(N, Lo, Hi); break;
   case ISD::CTLZ_ZERO_UNDEF:
   case ISD::CTLZ:        ExpandIntRes_CTLZ(N, Lo, Hi); break;
+  case ISD::CTLS:
+    ExpandIntRes_CTLS(N, Lo, Hi);
+    break;
   case ISD::CTPOP:       ExpandIntRes_CTPOP(N, Lo, Hi); break;
   case ISD::CTTZ_ZERO_UNDEF:
   case ISD::CTTZ:        ExpandIntRes_CTTZ(N, Lo, Hi); break;
@@ -4150,6 +4153,32 @@ void DAGTypeLegalizer::ExpandIntRes_CTLZ(SDNode *N,
   Hi = DAG.getConstant(0, dl, NVT);
 }
 
+void DAGTypeLegalizer::ExpandIntRes_CTLS(SDNode *N, SDValue &Lo, SDValue &Hi) {
+  SDLoc dl(N);
+  // ctls(HiLo) -> if (IsAllSignBits = (ctls(Hi) == BW-1)) then
+  //                 BW-1 + clz(IsNegative = (Hi < 0) ? ~Lo : Lo)
+  //               else ctls(Hi)
+  GetExpandedInteger(N->getOperand(0), Lo, Hi);
+  EVT NVT = Lo.getValueType();
+  unsigned NVTBits = NVT.getSizeInBits();
+
+  SDValue Constant0 = DAG.getConstant(0, dl, NVT);
+  SDValue ConstantBWM1 = DAG.getConstant(NVTBits - 1, dl, NVT);
+
+  SDValue HiCTLS = DAG.getNode(ISD::CTLS, dl, NVT, Hi);
+  SDValue IsAllSignBits = DAG.getSetCC(dl, getSetCCResultType(NVT), HiCTLS,
+                                       ConstantBWM1, ISD::SETEQ);
+  SDValue IsNegative =
+      DAG.getSetCC(dl, getSetCCResultType(NVT), Hi, Constant0, ISD::SETLT);
+  SDValue AdjustedLo =
+      DAG.getSelect(dl, NVT, IsNegative, DAG.getNOT(dl, Lo, NVT), Lo);
+  SDValue LoCLZ = DAG.getNode(ISD::CTLZ, dl, NVT, AdjustedLo);
+  Lo = DAG.getSelect(dl, NVT, IsAllSignBits,
+                     DAG.getNode(ISD::ADD, dl, NVT, LoCLZ, ConstantBWM1),
+                     HiCTLS);
+  Hi = DAG.getConstant(0, dl, NVT);
+}
+
 void DAGTypeLegalizer::ExpandIntRes_ABD(SDNode *N, SDValue &Lo, SDValue &Hi) {
   SDValue Result = TLI.expandABD(N, DAG);
   SplitInteger(Result, Lo, Hi);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index a39e419e5ad1c..b32b53ef9efe0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -466,6 +466,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   void ExpandIntRes_ABS               (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_ABD               (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_CTLZ              (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_CTLS(SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_CTPOP             (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_CTTZ              (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_LOAD          (LoadSDNode *N, SDValue &Lo, SDValue &Hi);
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index edba0a7169c0a..c954ec2405ce6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -9563,6 +9563,24 @@ SDValue TargetLowering::expandCTLZ(SDNode *Node, SelectionDAG &DAG) const {
   return DAG.getNode(ISD::CTPOP, dl, VT, Op);
 }
 
+SDValue TargetLowering::expandCTLS(SDNode *Node, SelectionDAG &DAG) const {
+  SDLoc dl(Node);
+  EVT VT = Node->getValueType(0);
+  EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
+  SDValue Op = Node->getOperand(0);
+  unsigned NumBitsPerElt = VT.getScalarSizeInBits();
+
+  // CTLS(x) = CTLZ(OR(SHL(XOR(x, SRA(x, BW-1)), 1), 1))
+  // This transforms the sign bits into leading zeros that can be counted.
+  SDValue ShiftAmt = DAG.getConstant(NumBitsPerElt - 1, dl, ShVT);
+  SDValue One = DAG.getConstant(1, dl, VT);
+  SDValue SignBit = DAG.getNode(ISD::SRA, dl, VT, Op, ShiftAmt);
+  SDValue Xor = DAG.getNode(ISD::XOR, dl, VT, Op, SignBit);
+  SDValue Shl = DAG.getNode(ISD::SHL, dl, VT, Xor, One);
+  SDValue Or = DAG.getNode(ISD::OR, dl, VT, Shl, One);
+  return DAG.getNode(ISD::CTLZ, dl, VT, Or);
+}
+
 SDValue TargetLowering::expandVPCTLZ(SDNode *Node, SelectionDAG &DAG) const {
   SDLoc dl(Node);
   EVT VT = Node->getValueType(0);
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 794a8973c25b7..7cf02c57ebde8 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -222,7 +222,7 @@ void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
 
   if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
     for (auto Opcode : {ISD::ABS, ISD::ABDS, ISD::ABDU, ISD::SMIN, ISD::SMAX,
-                        ISD::UMIN, ISD::UMAX})
+                        ISD::UMIN, ISD::UMAX, ISD::CTLS})
       setOperationAction(Opcode, VT, Legal);
   if (!VT.isFloatingPoint())
     for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
@@ -276,6 +276,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
     setOperationAction(ISD::UMIN, VT, Legal);
     setOperationAction(ISD::UMAX, VT, Legal);
     setOperationAction(ISD::ABS, VT, Legal);
+    setOperationAction(ISD::CTLS, VT, Legal);
     setOperationAction(ISD::SETCC, VT, Custom);
     setOperationAction(ISD::MLOAD, VT, Custom);
     setOperationAction(ISD::MSTORE, VT, Legal);
@@ -1004,9 +1005,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
     setOperationAction(ISD::ROTR, VT, Expand);
   }
   setOperationAction(ISD::CTTZ,  MVT::i32, Custom);
-  // CTLS (Count Leading Sign bits)
-  setOperationAction(ISD::CTLS, MVT::i32, Custom);
-  setOperationAction(ISD::CTLS, MVT::i64, Custom);
+  // Note: arm_cls is expanded directly in LowerINTRINSIC_WO_CHAIN since
+  // there's no native scalar CLS instruction; arm_cls64 emits ISD::CTLS,
+  // which the generic legalizer expands. Vector CTLS is Legal with NEON/MVE.
   // TODO: These two should be set to LibCall, but this currently breaks
   //   the Linux kernel build. See #101786.
   setOperationAction(ISD::CTPOP, MVT::i32, Expand);
@@ -3839,11 +3840,30 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
     return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
   }
   case Intrinsic::arm_cls: {
+    // ARM32 scalar CLS: CTLS(x) = CTLZ(OR(SHL(XOR(x, SRA(x, 31)), 1), 1))
+    // The i32 case is expanded directly here since there is no native scalar
+    // CLS instruction on ARM; the i64 case (arm_cls64) uses ISD::CTLS below.
     const SDValue &Operand = Op.getOperand(1);
     const EVT VTy = Op.getValueType();
-    return DAG.getNode(ISD::CTLS, dl, VTy, Operand);
+    SDValue SRA =
+        DAG.getNode(ISD::SRA, dl, VTy, Operand, DAG.getConstant(31, dl, VTy));
+    SDValue XOR = DAG.getNode(ISD::XOR, dl, VTy, SRA, Operand);
+    SDValue SHL =
+        DAG.getNode(ISD::SHL, dl, VTy, XOR, DAG.getConstant(1, dl, VTy));
+    SDValue OR =
+        DAG.getNode(ISD::OR, dl, VTy, SHL, DAG.getConstant(1, dl, VTy));
+    return DAG.getNode(ISD::CTLZ, dl, VTy, OR);
   }
   case Intrinsic::arm_cls64: {
+    // arm_cls64 returns i32 but takes i64 input.
+    // Use ISD::CTLS for i64 and truncate the result.
+    const SDValue &Operand = Op.getOperand(1);
+    SDValue CTLS64 = DAG.getNode(ISD::CTLS, dl, MVT::i64, Operand);
+    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, CTLS64);
+  }
+  case Intrinsic::arm_neon_vcls:
+  case Intrinsic::arm_mve_vcls: {
+    // Lower vector CLS intrinsics to ISD::CTLS
     const SDValue &Operand = Op.getOperand(1);
     const EVT VTy = Op.getValueType();
     return DAG.getNode(ISD::CTLS, dl, VTy, Operand);
@@ -6282,61 +6302,6 @@ static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
   return Res;
 }
 
-SDValue ARMTargetLowering::LowerCTLS(SDNode *N, SelectionDAG &DAG,
-                                       const ARMSubtarget *ST) const {
-  SDLoc dl(N);
-  EVT VT = N->getValueType(0);
-  SDValue Operand = N->getOperand(0);
-
-  if (VT == MVT::i32) {
-    // ARM32 scalar CLS: CTLS(x) = CTLZ(OR(SHL(XOR(x, SRA(x, 31)), 1), 1))
-    SDValue SRA =
-        DAG.getNode(ISD::SRA, dl, VT, Operand, DAG.getConstant(31, dl, VT));
-    SDValue XOR = DAG.getNode(ISD::XOR, dl, VT, SRA, Operand);
-    SDValue SHL =
-        DAG.getNode(ISD::SHL, dl, VT, XOR, DAG.getConstant(1, dl, VT));
-    SDValue OR = DAG.getNode(ISD::OR, dl, VT, SHL, DAG.getConstant(1, dl, VT));
-    SDValue Result = DAG.getNode(ISD::CTLZ, dl, VT, OR);
-    return Result;
-  }
-
-  if (VT == MVT::i64) {
-    // For 64-bit on 32-bit ARM, we need to split into two 32-bit operations
-    EVT VT32 = MVT::i32;
-    SDValue Lo, Hi;
-    std::tie(Lo, Hi) = DAG.SplitScalar(Operand, dl, VT32, VT32);
-
-    SDValue Constant0 = DAG.getConstant(0, dl, VT32);
-    SDValue Constant1 = DAG.getConstant(1, dl, VT32);
-    SDValue Constant31 = DAG.getConstant(31, dl, VT32);
-
-    // Compute CTLS of high part
-    SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VT32, Hi, Constant31);
-    SDValue XORHi = DAG.getNode(ISD::XOR, dl, VT32, SRAHi, Hi);
-    SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VT32, XORHi, Constant1);
-    SDValue ORHi = DAG.getNode(ISD::OR, dl, VT32, SHLHi, Constant1);
-    SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VT32, ORHi);
-
-    // Check if CLSHi == 31 (all high bits are sign bits)
-    SDValue IsAllSignBits =
-        DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::SETEQ);
-
-    // If all high bits are sign bits, compute for low part
-    SDValue HiIsZero = DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::SETEQ);
-    SDValue AdjustedLo =
-        DAG.getSelect(dl, VT32, HiIsZero, Lo, DAG.getNOT(dl, Lo, VT32));
-    SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VT32, AdjustedLo);
-    SDValue Result = DAG.getSelect(
-        dl, VT32, IsAllSignBits,
-        DAG.getNode(ISD::ADD, dl, VT32, CLZAdjustedLo, Constant31), CLSHi);
-
-    return Result;
-  }
-
-  // Vector types should be handled elsewhere
-  return SDValue();
-}
-
 /// Getvshiftimm - Check if this is a valid build_vector for the immediate
 /// operand of a vector shift operation, where all the elements of the
 /// build_vector must have the same constant integer value.
@@ -10379,8 +10344,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::SRL_PARTS:
   case ISD::SRA_PARTS:     return LowerShiftRightParts(Op, DAG);
   case ISD::CTTZ:
-  case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
-  case ISD::CTLS:           return LowerCTLS(Op.getNode(), DAG, Subtarget);
+  case ISD::CTTZ_ZERO_UNDEF:
+    return LowerCTTZ(Op.getNode(), DAG, Subtarget);
   case ISD::CTPOP:         return LowerCTPOP(Op.getNode(), DAG, Subtarget);
   case ISD::SETCC:         return LowerVSETCC(Op, DAG, Subtarget);
   case ISD::SETCCCARRY:    return LowerSETCCCARRY(Op, DAG);
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index b6dc9851b1d56..bc84654f8bd5a 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -608,8 +608,6 @@ class VectorType;
     SDValue LowerFP_TO_BF16(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerCMP(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerABS(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerCTLS(SDNode *N, SelectionDAG &DAG,
-                      const ARMSubtarget *ST) const;
 
     Register getRegisterByName(const char* RegName, LLT VT,
                                const MachineFunction &MF) const override;
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 1e9c141f13f83..85559c58ad825 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -2494,9 +2494,9 @@ multiclass MVE_VCLSCLZ_p<string opname, bit opcode, MVEVectorVTInfo VTI,
   }
 }
 
-defm MVE_VCLSs8  : MVE_VCLSCLZ_p<"cls", 0, MVE_v16s8, int_arm_mve_vcls>;
-defm MVE_VCLSs16 : MVE_VCLSCLZ_p<"cls", 0, MVE_v8s16, int_arm_mve_vcls>;
-defm MVE_VCLSs32 : MVE_VCLSCLZ_p<"cls", 0, MVE_v4s32, int_arm_mve_vcls>;
+defm MVE_VCLSs8  : MVE_VCLSCLZ_p<"cls", 0, MVE_v16s8, ctls>;
+defm MVE_VCLSs16 : MVE_VCLSCLZ_p<"cls", 0, MVE_v8s16, ctls>;
+defm MVE_VCLSs32 : MVE_VCLSCLZ_p<"cls", 0, MVE_v4s32, ctls>;
 
 defm MVE_VCLZs8  : MVE_VCLSCLZ_p<"clz", 1, MVE_v16i8, ctlz>;
 defm MVE_VCLZs16 : MVE_VCLSCLZ_p<"clz", 1, MVE_v8i16, ctlz>;
diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td
index dc62a09f942e2..69df2bf0efce2 100644
--- a/llvm/lib/Target/ARM/ARMInstrNEON.td
+++ b/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -6220,7 +6220,7 @@ defm VQNEG    : N2VInt_QHS<0b11, 0b11, 0b00, 0b01111, 0,
 //   VCLS     : Vector Count Leading Sign Bits
 defm VCLS     : N2VInt_QHS<0b11, 0b11, 0b00, 0b01000, 0,
                            IIC_VCNTiD, IIC_VCNTiQ, "vcls", "s",
-                           int_arm_neon_vcls>;
+                           ctls>;
 //   VCLZ     : Vector Count Leading Zeros
 defm VCLZ     : N2VInt_QHS<0b11, 0b11, 0b00, 0b01001, 0,
                            IIC_VCNTiD, IIC_VCNTiQ, "vclz", "i",
diff --git a/llvm/test/CodeGen/ARM/cls.ll b/llvm/test/CodeGen/ARM/cls.ll
index cccb38d0766f3..ab0244a2aeb04 100644
--- a/llvm/test/CodeGen/ARM/cls.ll
+++ b/llvm/test/CodeGen/ARM/cls.ll
@@ -10,11 +10,11 @@ define i32 @cls(i32 %t) {
 }
 
 ; CHECK: cmp r1, #0
-; CHECK: mvnne [[ADJUSTEDLO:r[0-9]+]], r0
+; CHECK: mvnmi [[ADJUSTEDLO:r[0-9]+]], r0
 ; CHECK: clz [[CLZLO:r[0-9]+]], [[ADJUSTEDLO]]
 ; CHECK: eor [[A:r[0-9]+]], r1, r1, asr #31
-; CHECK: mov r1, #1
-; CHECK: orr [[A]], r1, [[A]], lsl #1
+; CHECK: mov [[TMP:r[0-9]+]], #1
+; CHECK: orr [[A]], [[TMP]], [[A]], lsl #1
 ; CHECK: clz [[CLSHI:r[0-9]+]], [[A]]
 ; CHECK: cmp [[CLSHI]], #31
 ; CHECK: addeq r0, [[CLZLO]], #31
diff --git a/llvm/test/CodeGen/ARM/mve-cls.ll b/llvm/test/CodeGen/ARM/mve-cls.ll
new file mode 100644
index 0000000000000..f4cb5bfa9d7f6
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/mve-cls.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=arm-none-eabi -mcpu=cortex-m85 -mattr=+mve %s -o - | FileCheck %s
+
+; Test MVE vector CLS (Count Leading Sign bits) operations
+; The intrinsics are lowered to ISD::CTLS and selected to VCLS instructions
+
+define <16 x i8> @test_cls_v16i8(<16 x i8> %a) nounwind {
+; CHECK-LABEL: test_cls_v16i8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d1, r2, r3
+; CHECK-NEXT:    vmov d0, r0, r1
+; CHECK-NEXT:    vcls.s8 q0, q0
+; CHECK-NEXT:    vmov r0, r1, d0
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    bx lr
+  %result = call <16 x i8> @llvm.arm.mve.vcls.v16i8(<16 x i8> %a)
+  ret <16 x i8> %result
+}
+
+define <8 x i16> @test_cls_v8i16(<8 x i16> %a) nounwind {
+; CHECK-LABEL: test_cls_v8i16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d1, r2, r3
+; CHECK-NEXT:    vmov d0, r0, r1
+; CHECK-NEXT:    vcls.s16 q0, q0
+; CHECK-NEXT:    vmov r0, r1, d0
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    bx lr
+  %result = call <8 x i16> @llvm.arm.mve.vcls.v8i16(<8 x i16> %a)
+  ret <8 x i16> %result
+}
+
+define <4 x i32> @test_cls_v4i32(<4 x i32> %a) nounwind {
+; CHECK-LABEL: test_cls_v4i32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d1, r2, r3
+; CHECK-NEXT:    vmov d0, r0, r1
+; CHECK-NEXT:    vcls.s32 q0, q0
+; CHECK-NEXT:    vmov r0, r1, d0
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    bx lr
+  %result = call <4 x i32> @llvm.arm.mve.vcls.v4i32(<4 x i32> %a)
+  ret <4 x i32> %result
+}
+
+declare <16 x i8> @llvm.arm.mve.vcls.v16i8(<16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.mve.vcls.v8i16(<8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.mve.vcls.v4i32(<4 x i32>) nounwind readnone
diff --git a/llvm/test/CodeGen/ARM/neon-cls.ll b/llvm/test/CodeGen/ARM/neon-cls.ll
new file mode 100644
index 0000000000000..8113274440d86
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/neon-cls.ll
@@ -0,0 +1,84 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
+
+; Test ARM NEON vector CLS (Count Leading Sign bits) operations
+; The intrinsics are lowered to ISD::CTLS and selected to VCLS instructions
+
+define <8 x i8> @test_cls_v8i8(<8 x i8> %a) nounwind {
+; CHECK-LABEL: test_cls_v8i8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vcls.s8 d16, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
+  %result = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %a)
+  ret <8 x i8> %result
+}
+
+define <16 x i8> @test_cls_v16i8(<16 x i8> %a) nounwind {
+; CHECK-LABEL: test_cls_v16i8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vcls.s8 q8, q8
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
+  %result = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %a)
+  ret <16 x i8> %result
+}
+
+define <4 x i16> @test_cls_v4i16(<4 x i16> %a) nounwind {
+; CHECK-LABEL: test_cls_v4i16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vcls.s16 d16, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
+  %result = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %a)
+  ret <4 x i16> %result
+}
+
+define <8 x i16> @test_cls_v8i16(<8 x i16> %a) nounwind {
+; CHECK-LABEL: test_cls_v8i16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vcls.s16 q8, q8
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
+  %result = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %a)
+  ret <8 x i16> %result
+}
+
+define <2 x i32> @test_cls_v2i32(<2 x i32> %a) nounwind {
+; CHECK-LABEL: test_cls_v2i32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vcls.s32 d16, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
+  %result = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %a)
+  ret <2 x i32> %result
+}
+
+define <4 x i32> @test_cls_v4i32(<4 x i32> %a) nounwind {
+; CHECK-LABEL: test_cls_v4i32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vcls.s32 q8, q8
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
+  %result = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %a)
+  ret <4 x i32> %result
+}
+
+declare <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32>) nounwind readnone



More information about the llvm-commits mailing list