[llvm] [X86] combinePTESTCC - always prefer TESTPS/D to PTEST on AVX (PR #174097)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 1 04:50:15 PST 2026
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/174097
>From 2e81de3188b49a7c6a9aacb03abe2dda9ff613ed Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 31 Dec 2025 14:42:50 +0000
Subject: [PATCH] [X86] combinePTESTCC - always prefer TESTPS/D to PTEST on AVX
If the elements are sign-bit splats, AVX targets can always use TESTPS/D directly, potentially allowing further simplification.
Many Intel targets have slightly lower throughput/uop requirements for TESTPS/D vs PTEST - AMD is neutral.
Fixes the AVX1 `testz(ashr(X,bw-1),-1)` codegen for the `okD` testcase from #156233
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 18 ++++++++-------
llvm/test/CodeGen/X86/combine-ptest-256.ll | 26 ++++++----------------
2 files changed, 17 insertions(+), 27 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 20136ade7c317..f45bb3094037c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -49013,24 +49013,26 @@ static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
// to more efficiently extract the sign bits and compare that.
// TODO: Handle TESTC with comparison inversion.
// TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
- // TESTP/MOVMSK combines to make sure its never worse than PTEST?
+ // MOVMSK combines to make sure its never worse than PTEST?
if (BCVT.isVector() && TLI.isTypeLegal(BCVT)) {
unsigned EltBits = BCVT.getScalarSizeInBits();
if (DAG.ComputeNumSignBits(BC) == EltBits) {
assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
+ if ((EltBits == 32 || EltBits == 64) &&
+ EFLAGS.getOpcode() != X86ISD::TESTP && Subtarget.hasAVX()) {
+ MVT FloatSVT = MVT::getFloatingPointVT(EltBits);
+ MVT FloatVT =
+ MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits);
+ BC = DAG.getBitcast(FloatVT, DAG.getFreeze(BC));
+ return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, BC, BC);
+ }
APInt SignMask = APInt::getSignMask(EltBits);
if (SDValue Res =
TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
// For vXi16 cases we need to use pmovmksb and extract every other
// sign bit.
SDLoc DL(EFLAGS);
- if ((EltBits == 32 || EltBits == 64) && Subtarget.hasAVX()) {
- MVT FloatSVT = MVT::getFloatingPointVT(EltBits);
- MVT FloatVT =
- MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits);
- Res = DAG.getBitcast(FloatVT, DAG.getFreeze(Res));
- return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Res, Res);
- } else if (EltBits == 16) {
+ if (EltBits == 16) {
MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
Res = DAG.getBitcast(MovmskVT, Res);
Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
diff --git a/llvm/test/CodeGen/X86/combine-ptest-256.ll b/llvm/test/CodeGen/X86/combine-ptest-256.ll
index 2612fad16db63..a071da0ecc4e9 100644
--- a/llvm/test/CodeGen/X86/combine-ptest-256.ll
+++ b/llvm/test/CodeGen/X86/combine-ptest-256.ll
@@ -203,25 +203,13 @@ define i32 @ptestz_256_allones1(<4 x i64> %c, i32 %a, i32 %b) {
;
define i32 @ptestz_v8i32_signbits(<8 x i32> %c, i32 %a, i32 %b) {
-; AVX1-LABEL: ptestz_v8i32_signbits:
-; AVX1: # %bb.0:
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vptest %ymm0, %ymm0
-; AVX1-NEXT: cmovnel %esi, %eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: ptestz_v8i32_signbits:
-; AVX2: # %bb.0:
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: vtestps %ymm0, %ymm0
-; AVX2-NEXT: cmovnel %esi, %eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; CHECK-LABEL: ptestz_v8i32_signbits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: vtestps %ymm0, %ymm0
+; CHECK-NEXT: cmovnel %esi, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%t1 = ashr <8 x i32> %c, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
%t2 = bitcast <8 x i32> %t1 to <4 x i64>
%t3 = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %t2, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>)
More information about the llvm-commits
mailing list