[llvm] [X86] Use an FP-based expansion for v4i32 ctlz on SSE2-only targets (PR #167034)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 7 14:47:15 PST 2025
https://github.com/NishiB137 created https://github.com/llvm/llvm-project/pull/167034
Fixes #161746
This pull request implements a new optimization for the `ISD::CTLZ` operation, as suggested in the issue. It uses a floating-point-based algorithm for `v4i32` vectors on x86 targets that have `SSE2` but do not have `SSSE3`.
1. A new generic function, `TargetLowering::expandCTLZWithFP`, was added. This function implements the `clz(x) = 31 - ((bitcast<int>((float)x) >> 23) - 127)` algorithm, complete with a `VSELECT` to handle the `clz(0) == 32` edge case.
2. The `X86TargetLowering::LowerVectorCTLZ` function was updated. It now checks for the `+sse2,-ssse3` feature combination and calls `expandCTLZWithFP` for `v4i32` vectors.
3. The new implementation was benchmarked against the existing `v4i32` fallback on `SSE2`-only targets, i.e. the generic integer `Expand` logic.
#### Benchmark Results (`llvm-mca` on `core2`)
The `llvm-mca` analysis from the testcases (`ctlz-v4i32-fp-1.ll`, `ctlz-v4i32-fp-2.ll`) confirms the benefit of this change.
| Implementation | Instructions <br/> (Per Iteration) | Total Cycles <br/> (Per Iteration) | IPC <br/> (Average) | Block RThroughput <br/> (Average) |
| :--- | :---: | :---: | :---: | :---: |
| Old `SSE2` (Integer Fallback) | **39** <br/> (3900 / 100) | **45.01** <br/> (4501 / 100) | 0.87 | 7.3 |
| **New `SSE2` (FP Fallback)** | **18** <br/> (1800 / 100) | **21.04** <br/> (2104 / 100) | 0.86 | 4.0 |
The llvm-mca result files:
[ctlz-v4i32-fp-1-after-mca.txt](https://github.com/user-attachments/files/23426769/ctlz-v4i32-fp-1-after-mca.txt)
[ctlz-v4i32-fp-1-before-mca.txt](https://github.com/user-attachments/files/23426770/ctlz-v4i32-fp-1-before-mca.txt)
[ctlz-v4i32-fp-2-after-mca.txt](https://github.com/user-attachments/files/23426771/ctlz-v4i32-fp-2-after-mca.txt)
[ctlz-v4i32-fp-2-before-mca.txt](https://github.com/user-attachments/files/23426772/ctlz-v4i32-fp-2-before-mca.txt)
From 8b1bb2f67299480fc8553cd9997008b2188d3793 Mon Sep 17 00:00:00 2001
From: NishiB137 <cs23btech11041 at iith.ac.in>
Date: Sat, 8 Nov 2025 02:55:30 +0530
Subject: [PATCH 1/2] [CodeGen] Add expandCTLZWithFP helper to TargetLowering
supporting vXi32 types
---
llvm/include/llvm/CodeGen/TargetLowering.h | 5 ++
.../CodeGen/SelectionDAG/TargetLowering.cpp | 47 +++++++++++++++++++
2 files changed, 52 insertions(+)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 98565f423df3e..bb58f48cfdb5c 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5543,6 +5543,11 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase {
/// \returns The expansion result or SDValue() if it fails.
SDValue expandVPCTLZ(SDNode *N, SelectionDAG &DAG) const;
+ /// Expands a vector CTLZ node using a float conversion and its exponent.
+ /// \param N Node to expand
+ /// \returns The expansion result or SDValue() if the type is unsupported.
+ SDValue expandCTLZWithFP(SDNode *N, SelectionDAG &DAG) const;
+
/// Expand CTTZ via Table Lookup.
/// \param N Node to expand
/// \returns The expansion result or SDValue() if it fails.
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index b51d6649af2ec..d6ab6b2fe77e2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -9480,6 +9480,53 @@ SDValue TargetLowering::expandVPCTLZ(SDNode *Node, SelectionDAG &DAG) const {
return DAG.getNode(ISD::VP_CTPOP, dl, VT, Op, Mask, VL);
}
+
+// Expands a vXi32 CTLZ/CTLZ_ZERO_UNDEF by reading the exponent of a float:
+//   if (x == 0) return 32;
+//   x &= ~(x >> 1); // keeps the MSB, clears the bit right below it
+//   return 31 - ((bitcast<int>((float)x) >> 23) - 127);
+// Returns SDValue() when the element type is not i32.
+SDValue TargetLowering::expandCTLZWithFP(SDNode *Node,
+                                         SelectionDAG &DAG) const {
+  SDLoc dl(Node);
+  SDValue Op = Node->getOperand(0);
+  EVT VT = Op.getValueType();
+
+  assert(VT.isVector() && "This expansion is intended for vectors");
+
+  // Only i32 elements (converted through f32) are handled.
+  if (VT.getVectorElementType() != MVT::i32)
+    return SDValue();
+  EVT FloatVT = VT.changeVectorElementType(MVT::f32);
+  const unsigned BitWidth = 32, MantissaBits = 23, ExponentBias = 127;
+
+  // Zero lanes get BitWidth; the exponent trick below cannot produce it.
+  EVT CmpVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+  SDValue Zero = DAG.getConstant(0, dl, VT);
+  SDValue IsZero = DAG.getSetCC(dl, CmpVT, Op, Zero, ISD::SETEQ);
+  SDValue ZeroRes = DAG.getConstant(BitWidth, dl, VT);
+
+  // The int-to-float conversion rounds, so e.g. 0x7FFFFFFF would become 2^31
+  // and report an exponent one too high. Clearing the bit below every set
+  // bit keeps the MSB in place and makes that carry impossible.
+  SDValue Half = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(1, dl, VT));
+  SDValue Safe = DAG.getNode(ISD::AND, dl, VT, Op, DAG.getNOT(dl, Half, VT));
+
+  // Biased exponent of (float)Safe == MSB index + ExponentBias.
+  SDValue Float = DAG.getNode(ISD::UINT_TO_FP, dl, FloatVT, Safe);
+  SDValue FloatBits = DAG.getNode(ISD::BITCAST, dl, VT, Float);
+  SDValue Exp = DAG.getNode(ISD::SRL, dl, VT, FloatBits,
+                            DAG.getConstant(MantissaBits, dl, VT));
+  SDValue MSBIndex = DAG.getNode(ISD::SUB, dl, VT, Exp,
+                                 DAG.getConstant(ExponentBias, dl, VT));
+  SDValue NonZeroRes =
+      DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(BitWidth - 1, dl, VT),
+                  MSBIndex);
+
+  // Select the constant for zero lanes and the computed count otherwise.
+  return DAG.getNode(ISD::VSELECT, dl, VT, IsZero, ZeroRes, NonZeroRes);
+}
+
SDValue TargetLowering::CTTZTableLookup(SDNode *Node, SelectionDAG &DAG,
const SDLoc &DL, EVT VT, SDValue Op,
unsigned BitWidth) const {
From e712022e8cc82f61b6d49bfade442f38dbdf0d42 Mon Sep 17 00:00:00 2001
From: VindhyaP312 <cs23btech11044 at iith.ac.in>
Date: Sat, 8 Nov 2025 03:21:35 +0530
Subject: [PATCH 2/2] [X86] Add SSE2 FP-based v4i32 CTLZ lowering and tests
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 13 +++++++++++
llvm/test/CodeGen/X86/ctlz-v4i32-fp-1.ll | 26 ++++++++++++++++++++++
llvm/test/CodeGen/X86/ctlz-v4i32-fp-2.ll | 28 ++++++++++++++++++++++++
llvm/test/CodeGen/X86/ctlz-v4i32-fp-3.ll | 23 +++++++++++++++++++
4 files changed, 90 insertions(+)
create mode 100644 llvm/test/CodeGen/X86/ctlz-v4i32-fp-1.ll
create mode 100644 llvm/test/CodeGen/X86/ctlz-v4i32-fp-2.ll
create mode 100644 llvm/test/CodeGen/X86/ctlz-v4i32-fp-3.ll
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 05a854a0bf3fa..bdea6c4734908 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1348,6 +1348,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SUB, MVT::i32, Custom);
}
+ if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2() &&
+ !Subtarget.hasSSSE3()) {
+ setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Custom);
+ }
+
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
@@ -29039,6 +29045,13 @@ static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
if (VT.is512BitVector() && !Subtarget.hasBWI())
return splitVectorIntUnary(Op, DAG, DL);
+ if (VT == MVT::v4i32 && Subtarget.hasSSE2() && !Subtarget.hasSSSE3()) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue New = TLI.expandCTLZWithFP(Op.getNode(), DAG);
+ if (New.getNode())
+ return New;
+ }
+
assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
}
diff --git a/llvm/test/CodeGen/X86/ctlz-v4i32-fp-1.ll b/llvm/test/CodeGen/X86/ctlz-v4i32-fp-1.ll
new file mode 100644
index 0000000000000..20467b3799875
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ctlz-v4i32-fp-1.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-ssse3 -o - | FileCheck %s
+
+define <4 x i32> @test_v4i32_sse2(<4 x i32> %a) #0 {
+; CHECK-LABEL: test_v4i32_sse2:
+; CHECK: # %bb.0:
+
+; Zero test (strict CTLZ needs select)
+; CHECK-DAG: pcmpeqd %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
+
+; Exponent extraction + bias arithmetic (order-free)
+; CHECK-DAG: psrld {{\$}}23, %xmm{{[0-9]+}}
+; CHECK-DAG: psubd %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
+
+; Select/merge (could be por/pandn etc.)
+; CHECK: por %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
+
+; Must NOT use SSSE3 LUT path
+; CHECK-NOT: pshufb
+
+; CHECK: retq
+ %res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
+ ret <4 x i32> %res
+}
+
+declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1)
+attributes #0 = { "optnone" }
diff --git a/llvm/test/CodeGen/X86/ctlz-v4i32-fp-2.ll b/llvm/test/CodeGen/X86/ctlz-v4i32-fp-2.ll
new file mode 100644
index 0000000000000..6949fe4110e58
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ctlz-v4i32-fp-2.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-ssse3 -o - | FileCheck %s
+
+define <4 x i32> @test_v4i32_sse2_zero_undef(<4 x i32> %a) #0 {
+; CHECK-LABEL: test_v4i32_sse2_zero_undef:
+
+; zero check
+; CHECK-DAG: pcmpeqd
+
+; FP-based mantissa/exponent steps (order may vary)
+; CHECK-DAG: psrld $16
+; CHECK-DAG: subps
+; CHECK-DAG: psrld $23
+; CHECK-DAG: psubd
+
+; merge/select
+; CHECK: pandn
+; CHECK: por
+
+; CHECK-NOT: pshufb
+
+; CHECK: retq
+
+ %res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 true)
+ ret <4 x i32> %res
+}
+
+declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1)
+attributes #0 = { "optnone" }
diff --git a/llvm/test/CodeGen/X86/ctlz-v4i32-fp-3.ll b/llvm/test/CodeGen/X86/ctlz-v4i32-fp-3.ll
new file mode 100644
index 0000000000000..8d10c17223a21
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ctlz-v4i32-fp-3.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+ssse3 -o - | FileCheck %s
+
+; This verifies that **with SSSE3 enabled**, we use the LUT-based `pshufb`
+; implementation and *not* the floating-point exponent trick.
+
+define <4 x i32> @test_v4i32_ssse3(<4 x i32> %a) {
+; CHECK-LABEL: test_v4i32_ssse3:
+; CHECK: # %bb.0:
+
+; Must use SSSE3 table LUT:
+; CHECK: pshufb
+
+; Must NOT use FP exponent trick:
+; CHECK-NOT: cvtdq2ps
+; CHECK-NOT: psrld $23
+; CHECK-NOT: psubd
+
+; CHECK: retq
+ %res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
+ ret <4 x i32> %res
+}
+
+declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1)
More information about the llvm-commits
mailing list