[llvm] [X86] Combine `PTEST` to `TESTP` (PR #157249)

Abhishek Kaushik via llvm-commits llvm-commits at lists.llvm.org
Sat Sep 6 04:02:16 PDT 2025


https://github.com/abhishek-kaushik22 created https://github.com/llvm/llvm-project/pull/157249

Combine `PTEST` into `TESTP` when only the sign bits are tested.

Discovered in #156233
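
For reference, a minimal before/after sketch taken from the updated `mask_v8i32` test in `vector-reduce-or-cmp.ll`. The IR or-reduces the vector and then tests only the sign bit of the result:

    %1 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a0)
    %2 = and i32 %1, 2147483648
    %3 = icmp eq i32 %2, 0

On AVX2 this previously materialized a sign-bit splat and used `vptest` (per the old CHECK lines):

    vpbroadcastq {{.*#+}} ymm1 = [9223372039002259456, ...]
    vptest %ymm1, %ymm0
    sete %al

With this patch it lowers to a single `vtestps`, which tests the element sign bits directly:

    vtestps %ymm0, %ymm0
    sete %al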

From 3d2fff76ff7d14e1dd0aff0385aa69fd9c786d5f Mon Sep 17 00:00:00 2001
From: Abhishek Kaushik <abhishek.kaushik@intel.com>
Date: Sat, 6 Sep 2025 16:30:20 +0530
Subject: [PATCH] [X86] Combine `PTEST` to `TESTP`

Combine `PTEST` into `TESTP` when only the sign bits are tested.

Discovered in #156233
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  43 +++
 llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll |  56 +---
 llvm/test/CodeGen/combine-ptest-to-testp.ll   | 281 ++++++++++++++++++
 3 files changed, 336 insertions(+), 44 deletions(-)
 create mode 100644 llvm/test/CodeGen/combine-ptest-to-testp.ll

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ab21cf534b304..c95fd00828bcf 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -48624,6 +48624,45 @@ static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
   return SDValue();
 }
 
+static SDValue canFoldToTESTP(SDValue Val, const SDLoc &DL, const EVT PTestVT,
+                              SelectionDAG &DAG,
+                              const X86Subtarget &Subtarget) {
+  if (!Subtarget.hasAVX())
+    return SDValue();
+
+  EVT VT = Val.getValueType();
+  unsigned EltBits = VT.getScalarSizeInBits();
+
+  if (EltBits != 32 && EltBits != 64)
+    return SDValue();
+
+  SDValue Op0 = Val.getOperand(0);
+  SDValue Op1 = Val.getOperand(1);
+
+  MVT FloatSVT = MVT::getFloatingPointVT(EltBits);
+  MVT FloatVT = MVT::getVectorVT(FloatSVT, VT.getVectorNumElements());
+
+  // (ptest (and Op0, splat(minSignedVal)), (and Op0, splat(minSignedVal))) ->
+  // (testp Op0, Op0)
+  APInt Splat;
+  if (ISD::isConstantSplatVector(Op1.getNode(), Splat) &&
+      Splat.getBitWidth() == EltBits && Splat.isMinSignedValue()) {
+    SDValue FpOp0 = DAG.getBitcast(FloatVT, Op0);
+    return DAG.getNode(X86ISD::TESTP, DL, PTestVT, FpOp0, FpOp0);
+  }
+
+  // (ptest (and (and Op0, splat(minSignedVal)), Op1), ...) -> (testp Op0, Op1)
+  if (Op0.getOpcode() == ISD::AND &&
+      ISD::isConstantSplatVector(Op0.getOperand(1).getNode(), Splat) &&
+      Splat.getBitWidth() == EltBits && Splat.isMinSignedValue()) {
+    SDValue FpOp0 = DAG.getBitcast(FloatVT, Op0.getOperand(0));
+    SDValue FpOp1 = DAG.getBitcast(FloatVT, Op1);
+    return DAG.getNode(X86ISD::TESTP, DL, PTestVT, FpOp0, FpOp1);
+  }
+
+  return SDValue();
+}
+
 /// If we are inverting an PTEST/TESTP operand, attempt to adjust the CC
 /// to avoid the inversion.
 static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
@@ -48718,6 +48757,10 @@ static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
       SDValue BC = peekThroughBitcasts(Op0);
       EVT BCVT = BC.getValueType();
 
+      if (EFLAGS.getOpcode() == X86ISD::PTEST && BC.getOpcode() == ISD::AND)
+        if (SDValue V = canFoldToTESTP(BC, SDLoc(EFLAGS), VT, DAG, Subtarget))
+          return V;
+
       // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
       if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
         return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
index 227e000c6be7f..2c7399a1a1fad 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
@@ -875,28 +875,12 @@ define i1 @mask_v8i32(<8 x i32> %a0) {
 ; SSE41-NEXT:    sete %al
 ; SSE41-NEXT:    retq
 ;
-; AVX1-LABEL: mask_v8i32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0
-; AVX1-NEXT:    sete %al
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: mask_v8i32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9223372039002259456,9223372039002259456,9223372039002259456,9223372039002259456]
-; AVX2-NEXT:    vptest %ymm1, %ymm0
-; AVX2-NEXT:    sete %al
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: mask_v8i32:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9223372039002259456,9223372039002259456,9223372039002259456,9223372039002259456]
-; AVX512-NEXT:    vptest %ymm1, %ymm0
-; AVX512-NEXT:    sete %al
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX-LABEL: mask_v8i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vtestps %ymm0, %ymm0
+; AVX-NEXT:    sete %al
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
   %1 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a0)
   %2 = and i32 %1, 2147483648
   %3 = icmp eq i32 %2, 0
@@ -965,28 +949,12 @@ define i1 @signtest_v8i32(<8 x i32> %a0) {
 ; SSE41-NEXT:    sete %al
 ; SSE41-NEXT:    retq
 ;
-; AVX1-LABEL: signtest_v8i32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0
-; AVX1-NEXT:    sete %al
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: signtest_v8i32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9223372039002259456,9223372039002259456,9223372039002259456,9223372039002259456]
-; AVX2-NEXT:    vptest %ymm1, %ymm0
-; AVX2-NEXT:    sete %al
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: signtest_v8i32:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9223372039002259456,9223372039002259456,9223372039002259456,9223372039002259456]
-; AVX512-NEXT:    vptest %ymm1, %ymm0
-; AVX512-NEXT:    sete %al
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX-LABEL: signtest_v8i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vtestps %ymm0, %ymm0
+; AVX-NEXT:    sete %al
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
   %1 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a0)
   %2 = icmp sgt i32 %1, -1
   ret i1 %2
diff --git a/llvm/test/CodeGen/combine-ptest-to-testp.ll b/llvm/test/CodeGen/combine-ptest-to-testp.ll
new file mode 100644
index 0000000000000..7c8595f2dd756
--- /dev/null
+++ b/llvm/test/CodeGen/combine-ptest-to-testp.ll
@@ -0,0 +1,281 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2  | FileCheck %s
+
+define void @combine_ptest_to_vtestps_1(<4 x i32> noundef %a) {
+; CHECK-LABEL: combine_ptest_to_vtestps_1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vtestps %xmm0, %xmm0
+; CHECK-NEXT:    jne foo@PLT # TAILCALL
+; CHECK-NEXT:  # %bb.1: # %if.end
+; CHECK-NEXT:    retq
+entry:
+  %and = and <4 x i32> %a, splat (i32 -2147483648)
+  %rdx.or = tail call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %and)
+  %cmp.not = icmp eq i32 %rdx.or, 0
+  br i1 %cmp.not, label %if.end, label %if.then
+
+if.then:
+  tail call void @foo()
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @combine_ptest_to_vtestps_2(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: combine_ptest_to_vtestps_2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vtestps %xmm1, %xmm0
+; CHECK-NEXT:    jne foo@PLT # TAILCALL
+; CHECK-NEXT:  # %bb.1: # %if.end
+; CHECK-NEXT:    retq
+entry:
+  %and = and <4 x i32> %a, splat (i32 -2147483648)
+  %and1 = and <4 x i32> %and, %b
+  %rdx.or = tail call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %and1)
+  %cmp.not = icmp eq i32 %rdx.or, 0
+  br i1 %cmp.not, label %if.end, label %if.then
+
+if.then:
+  tail call void @foo()
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @combine_ptest_to_vtestps_3(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: combine_ptest_to_vtestps_3:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vtestps %xmm1, %xmm0
+; CHECK-NEXT:    jae foo@PLT # TAILCALL
+; CHECK-NEXT:  # %bb.1: # %if.end
+; CHECK-NEXT:    retq
+entry:
+  %not = and <4 x i32> %a, splat (i32 -2147483648)
+  %and = xor <4 x i32> %not, splat (i32 -2147483648)
+  %and1 = and <4 x i32> %and, %b
+  %rdx.or = tail call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %and1)
+  %cmp.not = icmp eq i32 %rdx.or, 0
+  br i1 %cmp.not, label %if.end, label %if.then
+
+if.then:
+  tail call void @foo()
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @combine_ptest_to_vtestps_4(<8 x i32> noundef %a) {
+; CHECK-LABEL: combine_ptest_to_vtestps_4:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vtestps %ymm0, %ymm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    jne foo@PLT # TAILCALL
+; CHECK-NEXT:  # %bb.1: # %if.end
+; CHECK-NEXT:    retq
+entry:
+  %and = and <8 x i32> %a, splat (i32 -2147483648)
+  %rdx.or = tail call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %and)
+  %cmp.not = icmp eq i32 %rdx.or, 0
+  br i1 %cmp.not, label %if.end, label %if.then
+
+if.then:
+  tail call void @foo()
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @combine_ptest_to_vtestps_5(<8 x i32> noundef %a, <8 x i32> noundef %b) {
+; CHECK-LABEL: combine_ptest_to_vtestps_5:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vtestps %ymm1, %ymm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    jne foo@PLT # TAILCALL
+; CHECK-NEXT:  # %bb.1: # %if.end
+; CHECK-NEXT:    retq
+entry:
+  %and = and <8 x i32> %a, splat (i32 -2147483648)
+  %and1 = and <8 x i32> %and, %b
+  %rdx.or = tail call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %and1)
+  %cmp.not = icmp eq i32 %rdx.or, 0
+  br i1 %cmp.not, label %if.end, label %if.then
+
+if.then:
+  tail call void @foo()
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @combine_ptest_to_vtestps_6(<8 x i32> noundef %a, <8 x i32> noundef %b) {
+; CHECK-LABEL: combine_ptest_to_vtestps_6:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vtestps %ymm1, %ymm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    jae foo@PLT # TAILCALL
+; CHECK-NEXT:  # %bb.1: # %if.end
+; CHECK-NEXT:    retq
+entry:
+  %not = and <8 x i32> %a, splat (i32 -2147483648)
+  %and = xor <8 x i32> %not, splat (i32 -2147483648)
+  %and1 = and <8 x i32> %and, %b
+  %rdx.or = tail call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %and1)
+  %cmp.not = icmp eq i32 %rdx.or, 0
+  br i1 %cmp.not, label %if.end, label %if.then
+
+if.then:
+  tail call void @foo()
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @combine_ptest_to_vtestpd_1(<2 x i64> noundef %a) {
+; CHECK-LABEL: combine_ptest_to_vtestpd_1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vtestpd %xmm0, %xmm0
+; CHECK-NEXT:    jne foo@PLT # TAILCALL
+; CHECK-NEXT:  # %bb.1: # %if.end
+; CHECK-NEXT:    retq
+entry:
+  %and = and <2 x i64> %a, splat (i64 -9223372036854775808)
+  %rdx.or = tail call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %and)
+  %cmp.not = icmp eq i64 %rdx.or, 0
+  br i1 %cmp.not, label %if.end, label %if.then
+
+if.then:
+  tail call void @foo()
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @combine_ptest_to_vtestpd_2(<2 x i64> noundef %a, <2 x i64> noundef %b) {
+; CHECK-LABEL: combine_ptest_to_vtestpd_2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vtestpd %xmm1, %xmm0
+; CHECK-NEXT:    jne foo@PLT # TAILCALL
+; CHECK-NEXT:  # %bb.1: # %if.end
+; CHECK-NEXT:    retq
+entry:
+  %and = and <2 x i64> %a, splat (i64 -9223372036854775808)
+  %and1 = and <2 x i64> %and, %b
+  %rdx.or = tail call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %and1)
+  %cmp.not = icmp eq i64 %rdx.or, 0
+  br i1 %cmp.not, label %if.end, label %if.then
+
+if.then:
+  tail call void @foo()
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @combine_ptest_to_vtestpd_3(<2 x i64> noundef %a, <2 x i64> noundef %b) {
+; CHECK-LABEL: combine_ptest_to_vtestpd_3:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT:    vptest %xmm1, %xmm0
+; CHECK-NEXT:    jne foo@PLT # TAILCALL
+; CHECK-NEXT:  # %bb.1: # %if.end
+; CHECK-NEXT:    retq
+entry:
+  %not = and <2 x i64> %a, splat (i64 -9223372036854775808)
+  %and = xor <2 x i64> %not, splat (i64 -9223372036854775808)
+  %and1 = and <2 x i64> %and, %b
+  %rdx.or = tail call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %and1)
+  %cmp.not = icmp eq i64 %rdx.or, 0
+  br i1 %cmp.not, label %if.end, label %if.then
+
+if.then:
+  tail call void @foo()
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @combine_ptest_to_vtestpd_4(<4 x i64> noundef %a) {
+; CHECK-LABEL: combine_ptest_to_vtestpd_4:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vtestpd %ymm0, %ymm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    jne foo@PLT # TAILCALL
+; CHECK-NEXT:  # %bb.1: # %if.end
+; CHECK-NEXT:    retq
+entry:
+  %and = and <4 x i64> %a, splat (i64 -9223372036854775808)
+  %rdx.or = tail call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %and)
+  %cmp.not = icmp eq i64 %rdx.or, 0
+  br i1 %cmp.not, label %if.end, label %if.then
+
+if.then:
+  tail call void @foo()
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @combine_ptest_to_vtestpd_5(<4 x i64> noundef %a, <4 x i64> noundef %b) {
+; CHECK-LABEL: combine_ptest_to_vtestpd_5:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vtestpd %ymm1, %ymm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    jne foo@PLT # TAILCALL
+; CHECK-NEXT:  # %bb.1: # %if.end
+; CHECK-NEXT:    retq
+entry:
+  %and = and <4 x i64> %a, splat (i64 -9223372036854775808)
+  %and1 = and <4 x i64> %and, %b
+  %rdx.or = tail call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %and1)
+  %cmp.not = icmp eq i64 %rdx.or, 0
+  br i1 %cmp.not, label %if.end, label %if.then
+
+if.then:
+  tail call void @foo()
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @combine_ptest_to_vtestpd_6(<4 x i64> noundef %a, <4 x i64> noundef %b) {
+; CHECK-LABEL: combine_ptest_to_vtestpd_6:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; CHECK-NEXT:    vpandn %ymm2, %ymm0, %ymm0
+; CHECK-NEXT:    vptest %ymm1, %ymm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    jne foo@PLT # TAILCALL
+; CHECK-NEXT:  # %bb.1: # %if.end
+; CHECK-NEXT:    retq
+entry:
+  %not = and <4 x i64> %a, splat (i64 -9223372036854775808)
+  %and = xor <4 x i64> %not, splat (i64 -9223372036854775808)
+  %and1 = and <4 x i64> %and, %b
+  %rdx.or = tail call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %and1)
+  %cmp.not = icmp eq i64 %rdx.or, 0
+  br i1 %cmp.not, label %if.end, label %if.then
+
+if.then:
+  tail call void @foo()
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+declare void @foo()
+declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.or.v8i32(<8 x i32>)
+declare i64 @llvm.vector.reduce.or.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.or.v4i64(<4 x i64>)



More information about the llvm-commits mailing list