[llvm] [SDAG] Simplify is-power-of-2 codegen (PR #72275)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 14 07:55:16 PST 2023
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86
@llvm/pr-subscribers-llvm-selectiondag
Author: Tavian Barnes (tavianator)
<details>
<summary>Changes</summary>
When x is not known to be nonzero, ctpop(x) == 1 is expanded to
x != 0 && (x & (x - 1)) == 0
resulting in codegen like
leal -1(%rdi), %eax
testl %eax, %edi
sete %cl
testl %edi, %edi
setne %al
andb %cl, %al
But another expression that works, using an unsigned comparison, is
(x ^ (x - 1)) >u x - 1
which has nicer codegen:
leal -1(%rdi), %eax
xorl %eax, %edi
cmpl %eax, %edi
seta %al
---
Patch is 104.71 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/72275.diff
6 Files Affected:
- (modified) llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp (+14-11)
- (modified) llvm/test/CodeGen/X86/ctpop-combine.ll (+30-37)
- (modified) llvm/test/CodeGen/X86/ispow2.ll (+36-53)
- (modified) llvm/test/CodeGen/X86/vector-popcnt-128.ll (+367-417)
- (modified) llvm/test/CodeGen/X86/vector-popcnt-256.ll (+256-354)
- (modified) llvm/test/CodeGen/X86/vector-popcnt-512.ll (+112-146)
``````````diff
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index ed352c86eca06e5..c47e2ad418a0e8e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -4237,9 +4237,7 @@ static SDValue simplifySetCCWithCTPOP(const TargetLowering &TLI, EVT VT,
return DAG.getSetCC(dl, VT, Result, DAG.getConstant(0, dl, CTVT), CC);
}
- // Expand a power-of-2 comparison based on ctpop:
- // (ctpop x) == 1 --> (x != 0) && ((x & x-1) == 0)
- // (ctpop x) != 1 --> (x == 0) || ((x & x-1) != 0)
+ // Expand a power-of-2 comparison based on ctpop
if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && C1 == 1) {
// Keep the CTPOP if it is cheap.
if (TLI.isCtpopFast(CTVT))
@@ -4248,17 +4246,22 @@ static SDValue simplifySetCCWithCTPOP(const TargetLowering &TLI, EVT VT,
SDValue Zero = DAG.getConstant(0, dl, CTVT);
SDValue NegOne = DAG.getAllOnesConstant(dl, CTVT);
assert(CTVT.isInteger());
- ISD::CondCode InvCond = ISD::getSetCCInverse(Cond, CTVT);
SDValue Add = DAG.getNode(ISD::ADD, dl, CTVT, CTOp, NegOne);
- SDValue And = DAG.getNode(ISD::AND, dl, CTVT, CTOp, Add);
- SDValue RHS = DAG.getSetCC(dl, VT, And, Zero, Cond);
// Its not uncommon for known-never-zero X to exist in (ctpop X) eq/ne 1, so
- // check before the emit a potentially unnecessary op.
- if (DAG.isKnownNeverZero(CTOp))
+ // check before emitting a potentially unnecessary op.
+ if (DAG.isKnownNeverZero(CTOp)) {
+ // (ctpop x) == 1 --> (x & x-1) == 0
+ // (ctpop x) != 1 --> (x & x-1) != 0
+ SDValue And = DAG.getNode(ISD::AND, dl, CTVT, CTOp, Add);
+ SDValue RHS = DAG.getSetCC(dl, VT, And, Zero, Cond);
return RHS;
- SDValue LHS = DAG.getSetCC(dl, VT, CTOp, Zero, InvCond);
- unsigned LogicOpcode = Cond == ISD::SETEQ ? ISD::AND : ISD::OR;
- return DAG.getNode(LogicOpcode, dl, VT, LHS, RHS);
+ } else {
+ // (ctpop x) == 1 --> (x ^ x-1) > x-1
+ // (ctpop x) != 1 --> (x ^ x-1) <= x-1
+ SDValue Xor = DAG.getNode(ISD::XOR, dl, CTVT, CTOp, Add);
+ ISD::CondCode CmpCond = Cond == ISD::SETEQ ? ISD::SETUGT : ISD::SETULE;
+ return DAG.getSetCC(dl, VT, Xor, Add, CmpCond);
+ }
}
return SDValue();
diff --git a/llvm/test/CodeGen/X86/ctpop-combine.ll b/llvm/test/CodeGen/X86/ctpop-combine.ll
index a33319e66d5f111..fba44218e05726b 100644
--- a/llvm/test/CodeGen/X86/ctpop-combine.ll
+++ b/llvm/test/CodeGen/X86/ctpop-combine.ll
@@ -120,13 +120,11 @@ define i32 @ctpop_eq_one(i64 %x) nounwind readnone {
;
; NO-POPCOUNT-LABEL: ctpop_eq_one:
; NO-POPCOUNT: # %bb.0:
-; NO-POPCOUNT-NEXT: leaq -1(%rdi), %rax
-; NO-POPCOUNT-NEXT: testq %rax, %rdi
-; NO-POPCOUNT-NEXT: sete %al
-; NO-POPCOUNT-NEXT: testq %rdi, %rdi
-; NO-POPCOUNT-NEXT: setne %cl
-; NO-POPCOUNT-NEXT: andb %al, %cl
-; NO-POPCOUNT-NEXT: movzbl %cl, %eax
+; NO-POPCOUNT-NEXT: leaq -1(%rdi), %rcx
+; NO-POPCOUNT-NEXT: xorq %rcx, %rdi
+; NO-POPCOUNT-NEXT: xorl %eax, %eax
+; NO-POPCOUNT-NEXT: cmpq %rcx, %rdi
+; NO-POPCOUNT-NEXT: seta %al
; NO-POPCOUNT-NEXT: retq
%count = tail call i64 @llvm.ctpop.i64(i64 %x)
%cmp = icmp eq i64 %count, 1
@@ -145,13 +143,11 @@ define i32 @ctpop_ne_one(i64 %x) nounwind readnone {
;
; NO-POPCOUNT-LABEL: ctpop_ne_one:
; NO-POPCOUNT: # %bb.0:
-; NO-POPCOUNT-NEXT: leaq -1(%rdi), %rax
-; NO-POPCOUNT-NEXT: testq %rax, %rdi
-; NO-POPCOUNT-NEXT: setne %al
-; NO-POPCOUNT-NEXT: testq %rdi, %rdi
-; NO-POPCOUNT-NEXT: sete %cl
-; NO-POPCOUNT-NEXT: orb %al, %cl
-; NO-POPCOUNT-NEXT: movzbl %cl, %eax
+; NO-POPCOUNT-NEXT: leaq -1(%rdi), %rcx
+; NO-POPCOUNT-NEXT: xorq %rcx, %rdi
+; NO-POPCOUNT-NEXT: xorl %eax, %eax
+; NO-POPCOUNT-NEXT: cmpq %rcx, %rdi
+; NO-POPCOUNT-NEXT: setbe %al
; NO-POPCOUNT-NEXT: retq
%count = tail call i64 @llvm.ctpop.i64(i64 %x)
%cmp = icmp ne i64 %count, 1
@@ -162,29 +158,26 @@ define i32 @ctpop_ne_one(i64 %x) nounwind readnone {
define i1 @ctpop_trunc_non_power2(i255 %x) nounwind {
; CHECK-LABEL: ctpop_trunc_non_power2:
; CHECK: # %bb.0:
-; CHECK-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
-; CHECK-NEXT: movq %rcx, %r8
-; CHECK-NEXT: andq %rax, %r8
-; CHECK-NEXT: movq %rdi, %r9
-; CHECK-NEXT: addq $-1, %r9
-; CHECK-NEXT: movq %rsi, %r10
-; CHECK-NEXT: adcq $-1, %r10
-; CHECK-NEXT: movq %rdx, %r11
-; CHECK-NEXT: adcq $-1, %r11
-; CHECK-NEXT: adcq %rax, %rcx
-; CHECK-NEXT: andq %rdi, %r9
-; CHECK-NEXT: andq %rdx, %r11
-; CHECK-NEXT: orq %r9, %r11
-; CHECK-NEXT: andq %r8, %rcx
-; CHECK-NEXT: andq %rsi, %r10
-; CHECK-NEXT: orq %rcx, %r10
-; CHECK-NEXT: orq %r11, %r10
-; CHECK-NEXT: sete %cl
-; CHECK-NEXT: orq %rdx, %rdi
-; CHECK-NEXT: orq %rsi, %r8
-; CHECK-NEXT: orq %rdi, %r8
-; CHECK-NEXT: setne %al
-; CHECK-NEXT: andb %cl, %al
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: addq $-1, %rax
+; CHECK-NEXT: movq %rsi, %r8
+; CHECK-NEXT: adcq $-1, %r8
+; CHECK-NEXT: movq %rdx, %r9
+; CHECK-NEXT: adcq $-1, %r9
+; CHECK-NEXT: movabsq $9223372036854775807, %r10 # imm = 0x7FFFFFFFFFFFFFFF
+; CHECK-NEXT: movq %rcx, %r11
+; CHECK-NEXT: adcq %r10, %r11
+; CHECK-NEXT: xorq %r11, %rcx
+; CHECK-NEXT: andq %r10, %r11
+; CHECK-NEXT: andq %r10, %rcx
+; CHECK-NEXT: xorq %r9, %rdx
+; CHECK-NEXT: xorq %r8, %rsi
+; CHECK-NEXT: xorq %rax, %rdi
+; CHECK-NEXT: cmpq %rdi, %rax
+; CHECK-NEXT: sbbq %rsi, %r8
+; CHECK-NEXT: sbbq %rdx, %r9
+; CHECK-NEXT: sbbq %rcx, %r11
+; CHECK-NEXT: setb %al
; CHECK-NEXT: retq
%a = call i255 @llvm.ctpop.i255(i255 %x)
%b = trunc i255 %a to i8 ; largest value from ctpop is 255, fits in 8 bits.
diff --git a/llvm/test/CodeGen/X86/ispow2.ll b/llvm/test/CodeGen/X86/ispow2.ll
index 4051e4d7f5b5dc0..8723432de8b6b00 100644
--- a/llvm/test/CodeGen/X86/ispow2.ll
+++ b/llvm/test/CodeGen/X86/ispow2.ll
@@ -28,25 +28,14 @@ define i1 @is_pow2_non_zero(i32 %xin) {
}
define i1 @is_pow2_non_zero_x_maybe_z(i32 %x) {
-; CHECK-NOBMI-LABEL: is_pow2_non_zero_x_maybe_z:
-; CHECK-NOBMI: # %bb.0:
-; CHECK-NOBMI-NEXT: # kill: def $edi killed $edi def $rdi
-; CHECK-NOBMI-NEXT: leal -1(%rdi), %eax
-; CHECK-NOBMI-NEXT: testl %eax, %edi
-; CHECK-NOBMI-NEXT: sete %cl
-; CHECK-NOBMI-NEXT: testl %edi, %edi
-; CHECK-NOBMI-NEXT: setne %al
-; CHECK-NOBMI-NEXT: andb %cl, %al
-; CHECK-NOBMI-NEXT: retq
-;
-; CHECK-BMI2-LABEL: is_pow2_non_zero_x_maybe_z:
-; CHECK-BMI2: # %bb.0:
-; CHECK-BMI2-NEXT: testl %edi, %edi
-; CHECK-BMI2-NEXT: setne %cl
-; CHECK-BMI2-NEXT: blsrl %edi, %eax
-; CHECK-BMI2-NEXT: sete %al
-; CHECK-BMI2-NEXT: andb %cl, %al
-; CHECK-BMI2-NEXT: retq
+; CHECK-LABEL: is_pow2_non_zero_x_maybe_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT: leal -1(%rdi), %eax
+; CHECK-NEXT: xorl %eax, %edi
+; CHECK-NEXT: cmpl %eax, %edi
+; CHECK-NEXT: seta %al
+; CHECK-NEXT: retq
%cnt = call i32 @llvm.ctpop.i32(i32 %x)
%r = icmp eq i32 %cnt, 1
ret i1 %r
@@ -180,44 +169,40 @@ define <4 x i1> @neither_pow2_non_zero_4xv64(<4 x i64> %xin) {
define <4 x i1> @neither_pow2_non_zero_4xv64_x_maybe_z(<4 x i64> %x) {
; CHECK-NOBMI-LABEL: neither_pow2_non_zero_4xv64_x_maybe_z:
; CHECK-NOBMI: # %bb.0:
-; CHECK-NOBMI-NEXT: pxor %xmm2, %xmm2
-; CHECK-NOBMI-NEXT: pcmpeqd %xmm3, %xmm3
-; CHECK-NOBMI-NEXT: movdqa %xmm1, %xmm4
-; CHECK-NOBMI-NEXT: paddq %xmm3, %xmm4
-; CHECK-NOBMI-NEXT: pand %xmm1, %xmm4
-; CHECK-NOBMI-NEXT: pcmpeqd %xmm2, %xmm1
-; CHECK-NOBMI-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,0,3,2]
-; CHECK-NOBMI-NEXT: pand %xmm1, %xmm5
-; CHECK-NOBMI-NEXT: pcmpeqd %xmm2, %xmm4
-; CHECK-NOBMI-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,0,3,2]
-; CHECK-NOBMI-NEXT: pand %xmm4, %xmm1
+; CHECK-NOBMI-NEXT: pcmpeqd %xmm2, %xmm2
+; CHECK-NOBMI-NEXT: movdqa %xmm1, %xmm3
+; CHECK-NOBMI-NEXT: paddq %xmm2, %xmm3
+; CHECK-NOBMI-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
+; CHECK-NOBMI-NEXT: pxor %xmm4, %xmm3
; CHECK-NOBMI-NEXT: pxor %xmm3, %xmm1
-; CHECK-NOBMI-NEXT: por %xmm5, %xmm1
-; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm4
-; CHECK-NOBMI-NEXT: pcmpeqd %xmm2, %xmm4
-; CHECK-NOBMI-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,0,3,2]
-; CHECK-NOBMI-NEXT: pand %xmm4, %xmm5
+; CHECK-NOBMI-NEXT: movdqa %xmm1, %xmm5
+; CHECK-NOBMI-NEXT: pcmpgtd %xmm3, %xmm5
+; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm6
+; CHECK-NOBMI-NEXT: paddq %xmm2, %xmm6
+; CHECK-NOBMI-NEXT: pxor %xmm4, %xmm6
+; CHECK-NOBMI-NEXT: pxor %xmm6, %xmm0
; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm4
-; CHECK-NOBMI-NEXT: paddq %xmm3, %xmm4
-; CHECK-NOBMI-NEXT: pand %xmm4, %xmm0
-; CHECK-NOBMI-NEXT: pcmpeqd %xmm2, %xmm0
-; CHECK-NOBMI-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,0,3,2]
-; CHECK-NOBMI-NEXT: pand %xmm2, %xmm0
-; CHECK-NOBMI-NEXT: pxor %xmm3, %xmm0
-; CHECK-NOBMI-NEXT: por %xmm5, %xmm0
-; CHECK-NOBMI-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; CHECK-NOBMI-NEXT: pcmpgtd %xmm6, %xmm4
+; CHECK-NOBMI-NEXT: movdqa %xmm4, %xmm7
+; CHECK-NOBMI-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm5[0,2]
+; CHECK-NOBMI-NEXT: pcmpeqd %xmm3, %xmm1
+; CHECK-NOBMI-NEXT: pcmpeqd %xmm6, %xmm0
+; CHECK-NOBMI-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; CHECK-NOBMI-NEXT: andps %xmm7, %xmm0
+; CHECK-NOBMI-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm5[1,3]
+; CHECK-NOBMI-NEXT: orps %xmm4, %xmm0
+; CHECK-NOBMI-NEXT: xorps %xmm2, %xmm0
; CHECK-NOBMI-NEXT: retq
;
; CHECK-AVX2-LABEL: neither_pow2_non_zero_4xv64_x_maybe_z:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm2
-; CHECK-AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
-; CHECK-AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm4
-; CHECK-AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
-; CHECK-AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
-; CHECK-AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm0
-; CHECK-AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
+; CHECK-AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; CHECK-AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm2
+; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; CHECK-AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2
+; CHECK-AVX2-NEXT: vpxor %ymm0, %ymm2, %ymm0
+; CHECK-AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm0
+; CHECK-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; CHECK-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vzeroupper
@@ -235,5 +220,3 @@ define <4 x i1> @neither_pow2_non_zero_4xv64_x_maybe_z(<4 x i64> %x) {
%r = icmp ne <4 x i64> %cnt, <i64 1, i64 1, i64 1, i64 1>
ret <4 x i1> %r
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/X86/vector-popcnt-128.ll b/llvm/test/CodeGen/X86/vector-popcnt-128.ll
index 61f0885c55be438..58cacfb0485ec6a 100644
--- a/llvm/test/CodeGen/X86/vector-popcnt-128.ll
+++ b/llvm/test/CodeGen/X86/vector-popcnt-128.ll
@@ -720,85 +720,37 @@ define <16 x i8> @foldv16i8() nounwind {
}
define <2 x i64> @eq_1_v2i64(<2 x i64> %0) {
-; SSE2-LABEL: eq_1_v2i64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT: paddq %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2]
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE3-LABEL: eq_1_v2i64:
-; SSE3: # %bb.0:
-; SSE3-NEXT: pxor %xmm1, %xmm1
-; SSE3-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE3-NEXT: paddq %xmm0, %xmm2
-; SSE3-NEXT: pand %xmm0, %xmm2
-; SSE3-NEXT: movdqa %xmm0, %xmm3
-; SSE3-NEXT: pcmpeqd %xmm1, %xmm3
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2]
-; SSE3-NEXT: pand %xmm3, %xmm0
-; SSE3-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
-; SSE3-NEXT: pand %xmm2, %xmm1
-; SSE3-NEXT: pandn %xmm1, %xmm0
-; SSE3-NEXT: retq
-;
-; SSSE3-LABEL: eq_1_v2i64:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2
-; SSSE3-NEXT: paddq %xmm0, %xmm2
-; SSSE3-NEXT: pand %xmm0, %xmm2
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2]
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
-; SSSE3-NEXT: pand %xmm2, %xmm1
-; SSSE3-NEXT: pandn %xmm1, %xmm0
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: eq_1_v2i64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT: paddq %xmm0, %xmm2
-; SSE41-NEXT: pand %xmm0, %xmm2
-; SSE41-NEXT: pcmpeqq %xmm1, %xmm0
-; SSE41-NEXT: pcmpeqq %xmm1, %xmm2
-; SSE41-NEXT: pandn %xmm2, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: eq_1_v2i64:
+; SSE: # %bb.0:
+; SSE-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE-NEXT: paddq %xmm0, %xmm1
+; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE-NEXT: pxor %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE-NEXT: pand %xmm3, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: retq
;
; AVX1OR2-LABEL: eq_1_v2i64:
; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1OR2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm2
-; AVX1OR2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1OR2-NEXT: vpaddq %xmm3, %xmm0, %xmm3
-; AVX1OR2-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX1OR2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT: vpandn %xmm0, %xmm2, %xmm0
+; AVX1OR2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1OR2-NEXT: vpaddq %xmm1, %xmm0, %xmm1
+; AVX1OR2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1OR2-NEXT: vpxor %xmm0, %xmm1, %xmm0
+; AVX1OR2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1OR2-NEXT: retq
;
; XOP-LABEL: eq_1_v2i64:
; XOP: # %bb.0:
-; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; XOP-NEXT: vpcomneqq %xmm1, %xmm0, %xmm2
-; XOP-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; XOP-NEXT: vpaddq %xmm3, %xmm0, %xmm3
-; XOP-NEXT: vpand %xmm3, %xmm0, %xmm0
-; XOP-NEXT: vpcomeqq %xmm1, %xmm0, %xmm0
-; XOP-NEXT: vpand %xmm0, %xmm2, %xmm0
+; XOP-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm1
+; XOP-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; XOP-NEXT: vpcomgtuq %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: eq_1_v2i64:
@@ -818,24 +770,24 @@ define <2 x i64> @eq_1_v2i64(<2 x i64> %0) {
;
; BITALG_NOVLX-LABEL: eq_1_v2i64:
; BITALG_NOVLX: # %bb.0:
-; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm2
-; BITALG_NOVLX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; BITALG_NOVLX-NEXT: vpaddq %xmm3, %xmm0, %xmm3
-; BITALG_NOVLX-NEXT: vpand %xmm3, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; BITALG_NOVLX-NEXT: vpaddq %xmm1, %xmm0, %xmm1
+; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpminuq %zmm1, %zmm0, %zmm1
; BITALG_NOVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpandn %xmm0, %xmm2, %xmm0
+; BITALG_NOVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
+; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: eq_1_v2i64:
; BITALG: # %bb.0:
-; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm2
-; BITALG-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; BITALG-NEXT: vpaddq %xmm3, %xmm0, %xmm3
-; BITALG-NEXT: vpand %xmm3, %xmm0, %xmm0
+; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; BITALG-NEXT: vpaddq %xmm1, %xmm0, %xmm1
+; BITALG-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; BITALG-NEXT: vpminuq %xmm1, %xmm0, %xmm1
; BITALG-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpandn %xmm0, %xmm2, %xmm0
+; BITALG-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
; BITALG-NEXT: retq
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
%3 = icmp eq <2 x i64> %2, <i64 1, i64 1>
@@ -844,95 +796,40 @@ define <2 x i64> @eq_1_v2i64(<2 x i64> %0) {
}
define <2 x i64> @ne_1_v2i64(<2 x i64> %0) {
-; SSE2-LABEL: ne_1_v2i64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: paddq %xmm2, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE3-LABEL: ne_1_v2i64:
-; SSE3: # %bb.0:
-; SSE3-NEXT: pxor %xmm1, %xmm1
-; SSE3-NEXT: movdqa %xmm0, %xmm2
-; SSE3-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
-; SSE3-NEXT: pand %xmm2, %xmm3
-; SSE3-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE3-NEXT: movdqa %xmm0, %xmm4
-; SSE3-NEXT: paddq %xmm2, %xmm4
-; SSE3-NEXT: pand %xmm4, %xmm0
-; SSE3-NEXT: pcmpeqd %xmm1, %xmm0
-; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
-; SSE3-NEXT: pand %xmm1, %xmm0
-; SSE3-NEXT: pxor %xmm2, %xmm0
-; SSE3-NEXT: por %xmm3, %xmm0
-; SSE3-NEXT: retq
-;
-; SSSE3-LABEL: ne_1_v2i64:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
-; SSSE3-NEXT: pand %xmm2, %xmm3
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2
-; SSSE3-NEXT: movdqa %xmm0, %xmm4
-; SSSE3-NEXT: paddq %xmm2, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm0
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
-; SSSE3-NEXT: pand %xmm1, %xmm0
-; SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSSE3-NEXT: por %xmm3, %xmm0
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: ne_1_v2i64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pcmpeqq %xmm1, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: paddq %xmm3, %xmm4
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: pcmpeqq %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: ne_1_v2i64:
+; SSE: # %bb.0:
+; SSE-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: paddq %xmm1, %xmm2
+; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE-NEXT: pxor %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: pcmpgtd %xmm2, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm0
+; SSE-NEXT: retq
;
; AVX1OR2-LABEL: ne_1_v2i64:
; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1OR2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm2
-; AVX1OR2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1OR2-NEXT: vpaddq %xmm3, %xmm0, %xmm4
-; AVX1OR2-NEXT: vpand %xmm4, %xmm0, %xmm0
-; AVX1OR2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT: vpxor %xmm3, %xmm0, %xmm0
-; AVX1OR2-NEXT: vpor %xmm0, %xmm2, %xmm0
+; A...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/72275
More information about the llvm-commits mailing list