[llvm] [X86] ptest is commutable as long as only the Z flag is used. (PR #88969)

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Thu Apr 18 08:23:15 PDT 2024


https://github.com/topperc updated https://github.com/llvm/llvm-project/pull/88969

>From 3eeb6ca9634725f9450d615d03c775cc8a35cfe0 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Tue, 16 Apr 2024 12:22:54 -0700
Subject: [PATCH 1/4] [X86] ptest is commutable as long as only the Z flag is
 used.

Fixes #88958.
---
 llvm/lib/Target/X86/X86InstrSSE.td            | 18 +++++++++++
 llvm/test/CodeGen/X86/combine-ptest.ll        |  6 ++--
 llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll | 32 +++++++++++--------
 3 files changed, 39 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 5d799fc00df92b..385d65d0bcaaf6 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -5688,6 +5688,13 @@ let Predicates = [UseSSE41, OptForSize] in {
 // SSE4.1 - Packed Bit Test
 //===----------------------------------------------------------------------===//
 
+// ptest is commutable if only the Z flag is used. If the C flag is used,
+// commuting would change which operand is inverted.
+def X86ptest_commutable : PatFrag<(ops node:$src1, node:$src2),
+                                  (X86ptest node:$src1, node:$src2), [{
+  return onlyUsesZeroFlag(SDValue(Node, 0));
+}]>;
+
 // ptest instruction we'll lower to this in X86ISelLowering primarily from
 // the intel intrinsic that corresponds to this.
 let Defs = [EFLAGS], Predicates = [HasAVX] in {
@@ -5723,6 +5730,17 @@ def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
               Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>;
 }
 
+let Predicates = [HasAVX] in {
+  def : Pat<(X86ptest_commutable (loadv2i64 addr:$src2), VR128:$src1),
+            (VPTESTrm VR128:$src1, addr:$src2)>;
+  def : Pat<(X86ptest_commutable (loadv4i64 addr:$src2), VR256:$src1),
+            (VPTESTYrm VR256:$src1, addr:$src2)>;
+}
+let Predicates = [UseSSE41] in {
+  def : Pat<(X86ptest_commutable (memopv2i64 addr:$src2), VR128:$src1),
+            (PTESTrm VR128:$src1, addr:$src2)>;
+}
+
 // The bit test instructions below are AVX only
 multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
                        X86MemOperand x86memop, PatFrag mem_frag, ValueType vt,
diff --git a/llvm/test/CodeGen/X86/combine-ptest.ll b/llvm/test/CodeGen/X86/combine-ptest.ll
index 3a695bfc6234db..40f7899c2ac95f 100644
--- a/llvm/test/CodeGen/X86/combine-ptest.ll
+++ b/llvm/test/CodeGen/X86/combine-ptest.ll
@@ -400,17 +400,15 @@ define i1 @PR38788(<4 x i32> %0, <4 x i32> %1) {
 define i32 @PR88958_1(ptr %0, <2 x i64> %1) {
 ; SSE-LABEL: PR88958_1:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm1
 ; SSE-NEXT:    xorl %eax, %eax
-; SSE-NEXT:    ptest %xmm0, %xmm1
+; SSE-NEXT:    ptest (%rdi), %xmm0
 ; SSE-NEXT:    sete %al
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: PR88958_1:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    vptest %xmm0, %xmm1
+; AVX-NEXT:    vptest (%rdi), %xmm0
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
   %3 = load <2 x i64>, ptr %0
diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
index f8ba00b0332994..9cd0f4d12e15ab 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
@@ -1018,32 +1018,38 @@ define zeroext i1 @PR44781(ptr %0) {
 ; SSE41-NEXT:    sete %al
 ; SSE41-NEXT:    retq
 ;
-; AVX1OR2-LABEL: PR44781:
-; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX1OR2-NEXT:    vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; AVX1OR2-NEXT:    sete %al
-; AVX1OR2-NEXT:    retq
+; AVX1-LABEL: PR44781:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm0 = [15,15,15,15]
+; AVX1-NEXT:    vptest (%rdi), %xmm0
+; AVX1-NEXT:    sete %al
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: PR44781:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [15,15,15,15]
+; AVX2-NEXT:    vptest (%rdi), %xmm0
+; AVX2-NEXT:    sete %al
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: PR44781:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX512F-NEXT:    vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [15,15,15,15]
+; AVX512F-NEXT:    vptest (%rdi), %xmm0
 ; AVX512F-NEXT:    sete %al
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: PR44781:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX512BW-NEXT:    vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [15,15,15,15]
+; AVX512BW-NEXT:    vptest (%rdi), %xmm0
 ; AVX512BW-NEXT:    sete %al
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: PR44781:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX512BWVL-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [64424509455,64424509455]
-; AVX512BWVL-NEXT:    vptest %xmm1, %xmm0
+; AVX512BWVL-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [64424509455,64424509455]
+; AVX512BWVL-NEXT:    vptest (%rdi), %xmm0
 ; AVX512BWVL-NEXT:    sete %al
 ; AVX512BWVL-NEXT:    retq
   %2 = load <4 x i32>, ptr %0, align 4

>From 87e4444aab3c380e9b41fe2e111eb2ca79aacb1c Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Wed, 17 Apr 2024 17:07:13 -0700
Subject: [PATCH 2/4] fixup! add TESTPS/PD. Need tests.

---
 llvm/lib/Target/X86/X86InstrSSE.td | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 385d65d0bcaaf6..7974ed6cc4ea1b 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -5755,6 +5755,13 @@ multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
             Sched<[sched.Folded, sched.ReadAfterFold]>, VEX;
 }
 
+// ptest is commutable if only the Z flag is used. If the C flag is used,
+// commuting would change which operand is inverted.
+def X86testp_commutable : PatFrag<(ops node:$src1, node:$src2),
+                                  (X86testp node:$src1, node:$src2), [{
+  return onlyUsesZeroFlag(SDValue(Node, 0));
+}]>;
+
 let Defs = [EFLAGS], Predicates = [HasAVX] in {
 let ExeDomain = SSEPackedSingle in {
 defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32,
@@ -5770,6 +5777,18 @@ defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64,
 }
 }
 
+let Predicates = [HasAVX] in {
+  def : Pat<(X86testp_commutable (loadv4f32 addr:$src2), VR128:$src),
+            (VTESTPSrm VR128:$src, addr:$src2)>;
+  def : Pat<(X86testp_commutable (loadv8f32 addr:$src2), VR256:$src),
+            (VTESTPSYrm VR256:$src, addr:$src2)>;
+
+  def : Pat<(X86testp_commutable (loadv2f64 addr:$src2), VR128:$src),
+            (VTESTPDrm VR128:$src, addr:$src2)>;
+  def : Pat<(X86testp_commutable (loadv4f64 addr:$src2), VR256:$src),
+            (VTESTPDYrm VR256:$src, addr:$src2)>;
+}
+
 //===----------------------------------------------------------------------===//
 // SSE4.1 - Misc Instructions
 //===----------------------------------------------------------------------===//

>From 6061670ba7489c9b292243dc03ee1baac17f9166 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Wed, 17 Apr 2024 17:29:37 -0700
Subject: [PATCH 3/4] fixup! tests.

---
 llvm/test/CodeGen/X86/combine-testpd.ll | 82 +++++++++++++++++++++++++
 llvm/test/CodeGen/X86/combine-testps.ll | 82 +++++++++++++++++++++++++
 2 files changed, 164 insertions(+)

diff --git a/llvm/test/CodeGen/X86/combine-testpd.ll b/llvm/test/CodeGen/X86/combine-testpd.ll
index c3e34f963ca986..f634e9443ba71c 100644
--- a/llvm/test/CodeGen/X86/combine-testpd.ll
+++ b/llvm/test/CodeGen/X86/combine-testpd.ll
@@ -255,6 +255,88 @@ end: ; preds = %entry
   ret void
 }
 
+define i32 @PR88958_1(ptr %0, <2 x double> %1) {
+; SSE-LABEL: PR88958_1:
+; SSE:       # %bb.0:
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    ptest (%rdi), %xmm0
+; SSE-NEXT:    sete %al
+; SSE-NEXT:    retq
+;
+; CHECK-LABEL: PR88958_1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    vtestpd (%rdi), %xmm0
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    retq
+  %3 = load <2 x double>, ptr %0
+  %4 = tail call i32 @llvm.x86.avx.vtestz.pd(<2 x double> %3, <2 x double> %1)
+  ret i32 %4
+}
+
+define i32 @PR88958_2(ptr %0, <2 x double> %1) {
+; SSE-LABEL: PR88958_2:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa (%rdi), %xmm1
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    ptest %xmm0, %xmm1
+; SSE-NEXT:    setb %al
+; SSE-NEXT:    retq
+;
+; CHECK-LABEL: PR88958_2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovapd (%rdi), %xmm1
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    vtestpd %xmm0, %xmm1
+; CHECK-NEXT:    setb %al
+; CHECK-NEXT:    retq
+  %3 = load <2 x double>, ptr %0
+  %4 = tail call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %3, <2 x double> %1)
+  ret i32 %4
+}
+
+define i32 @PR88958_3(ptr %0, <4 x double> %1) {
+; SSE-LABEL: PR88958_3:
+; SSE:       # %bb.0:
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    ptest (%rdi), %xmm0
+; SSE-NEXT:    sete %al
+; SSE-NEXT:    retq
+;
+; CHECK-LABEL: PR88958_3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    vtestpd (%rdi), %ymm0
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %3 = load <4 x double>, ptr %0
+  %4 = tail call i32 @llvm.x86.avx.vtestz.pd.256(<4 x double> %3, <4 x double> %1)
+  ret i32 %4
+}
+
+define i32 @PR88958_4(ptr %0, <4 x double> %1) {
+; SSE-LABEL: PR88958_4:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa (%rdi), %xmm1
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    ptest %xmm0, %xmm1
+; SSE-NEXT:    setb %al
+; SSE-NEXT:    retq
+;
+; CHECK-LABEL: PR88958_4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovapd (%rdi), %ymm1
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    vtestpd %ymm0, %ymm1
+; CHECK-NEXT:    setb %al
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %3 = load <4 x double>, ptr %0
+  %4 = tail call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %3, <4 x double> %1)
+  ret i32 %4
+}
+
 declare i32 @llvm.x86.avx.vtestz.pd(<2 x double>, <2 x double>) nounwind readnone
 declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnone
 declare i32 @llvm.x86.avx.vtestnzc.pd(<2 x double>, <2 x double>) nounwind readnone
diff --git a/llvm/test/CodeGen/X86/combine-testps.ll b/llvm/test/CodeGen/X86/combine-testps.ll
index 43dddbecf51a7d..e2927435f424d1 100644
--- a/llvm/test/CodeGen/X86/combine-testps.ll
+++ b/llvm/test/CodeGen/X86/combine-testps.ll
@@ -254,6 +254,88 @@ end: ; preds = %entry
   ret void
 }
 
+define i32 @PR88958_1(ptr %0, <4 x float> %1) {
+; SSE-LABEL: PR88958_1:
+; SSE:       # %bb.0:
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    ptest (%rdi), %xmm0
+; SSE-NEXT:    sete %al
+; SSE-NEXT:    retq
+;
+; CHECK-LABEL: PR88958_1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    vtestps (%rdi), %xmm0
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    retq
+  %3 = load <4 x float>, ptr %0
+  %4 = tail call i32 @llvm.x86.avx.vtestz.ps(<4 x float> %3, <4 x float> %1)
+  ret i32 %4
+}
+
+define i32 @PR88958_2(ptr %0, <4 x float> %1) {
+; SSE-LABEL: PR88958_2:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa (%rdi), %xmm1
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    ptest %xmm0, %xmm1
+; SSE-NEXT:    setb %al
+; SSE-NEXT:    retq
+;
+; CHECK-LABEL: PR88958_2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps (%rdi), %xmm1
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    vtestps %xmm0, %xmm1
+; CHECK-NEXT:    setb %al
+; CHECK-NEXT:    retq
+  %3 = load <4 x float>, ptr %0
+  %4 = tail call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %3, <4 x float> %1)
+  ret i32 %4
+}
+
+define i32 @PR88958_3(ptr %0, <8 x float> %1) {
+; SSE-LABEL: PR88958_3:
+; SSE:       # %bb.0:
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    ptest (%rdi), %xmm0
+; SSE-NEXT:    sete %al
+; SSE-NEXT:    retq
+;
+; CHECK-LABEL: PR88958_3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    vtestps (%rdi), %ymm0
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %3 = load <8 x float>, ptr %0
+  %4 = tail call i32 @llvm.x86.avx.vtestz.ps.256(<8 x float> %3, <8 x float> %1)
+  ret i32 %4
+}
+
+define i32 @PR88958_4(ptr %0, <8 x float> %1) {
+; SSE-LABEL: PR88958_4:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa (%rdi), %xmm1
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    ptest %xmm0, %xmm1
+; SSE-NEXT:    setb %al
+; SSE-NEXT:    retq
+;
+; CHECK-LABEL: PR88958_4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps (%rdi), %ymm1
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    vtestps %ymm0, %ymm1
+; CHECK-NEXT:    setb %al
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %3 = load <8 x float>, ptr %0
+  %4 = tail call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %3, <8 x float> %1)
+  ret i32 %4
+}
+
 declare i32 @llvm.x86.avx.vtestz.ps(<4 x float>, <4 x float>) nounwind readnone
 declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone
 declare i32 @llvm.x86.avx.vtestnzc.ps(<4 x float>, <4 x float>) nounwind readnone

>From ac4729416ceb0c0e540db07243e4bd167a4591ea Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Thu, 18 Apr 2024 08:19:28 -0700
Subject: [PATCH 4/4] fixup! Update comment

---
 llvm/lib/Target/X86/X86InstrSSE.td | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 7974ed6cc4ea1b..adbf0a2cbb2471 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -5755,8 +5755,8 @@ multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
             Sched<[sched.Folded, sched.ReadAfterFold]>, VEX;
 }
 
-// ptest is commutable if only the Z flag is used. If the C flag is used,
-// commuting would change which operand is inverted.
+// testps/testpd are commutable if only the Z flag is used. If the C flag is
+// used, commuting would change which operand is inverted.
 def X86testp_commutable : PatFrag<(ops node:$src1, node:$src2),
                                   (X86testp node:$src1, node:$src2), [{
   return onlyUsesZeroFlag(SDValue(Node, 0));



More information about the llvm-commits mailing list