[llvm] [SDAG] Simplify is-power-of-2 codegen (PR #72275)

via llvm-commits llvm-commits at lists.llvm.org
Tue Nov 14 07:55:16 PST 2023


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86

@llvm/pr-subscribers-llvm-selectiondag

Author: Tavian Barnes (tavianator)

<details>
<summary>Changes</summary>

When x is not known to be nonzero, ctpop(x) == 1 is expanded to

    x != 0 && (x & (x - 1)) == 0

resulting in codegen like

    leal    -1(%rdi), %eax
    testl   %eax, %edi
    sete    %cl
    testl   %edi, %edi
    setne   %al
    andb    %cl, %al

But another expression that works is

    (x ^ (x - 1)) > x - 1

which has nicer codegen:

    leal    -1(%rdi), %eax
    xorl    %eax, %edi
    cmpl    %eax, %edi
    seta    %al


---

Patch is 104.71 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/72275.diff


6 Files Affected:

- (modified) llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp (+14-11) 
- (modified) llvm/test/CodeGen/X86/ctpop-combine.ll (+30-37) 
- (modified) llvm/test/CodeGen/X86/ispow2.ll (+36-53) 
- (modified) llvm/test/CodeGen/X86/vector-popcnt-128.ll (+367-417) 
- (modified) llvm/test/CodeGen/X86/vector-popcnt-256.ll (+256-354) 
- (modified) llvm/test/CodeGen/X86/vector-popcnt-512.ll (+112-146) 


``````````diff
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index ed352c86eca06e5..c47e2ad418a0e8e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -4237,9 +4237,7 @@ static SDValue simplifySetCCWithCTPOP(const TargetLowering &TLI, EVT VT,
     return DAG.getSetCC(dl, VT, Result, DAG.getConstant(0, dl, CTVT), CC);
   }
 
-  // Expand a power-of-2 comparison based on ctpop:
-  // (ctpop x) == 1 --> (x != 0) && ((x & x-1) == 0)
-  // (ctpop x) != 1 --> (x == 0) || ((x & x-1) != 0)
+  // Expand a power-of-2 comparison based on ctpop
   if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && C1 == 1) {
     // Keep the CTPOP if it is cheap.
     if (TLI.isCtpopFast(CTVT))
@@ -4248,17 +4246,22 @@ static SDValue simplifySetCCWithCTPOP(const TargetLowering &TLI, EVT VT,
     SDValue Zero = DAG.getConstant(0, dl, CTVT);
     SDValue NegOne = DAG.getAllOnesConstant(dl, CTVT);
     assert(CTVT.isInteger());
-    ISD::CondCode InvCond = ISD::getSetCCInverse(Cond, CTVT);
     SDValue Add = DAG.getNode(ISD::ADD, dl, CTVT, CTOp, NegOne);
-    SDValue And = DAG.getNode(ISD::AND, dl, CTVT, CTOp, Add);
-    SDValue RHS = DAG.getSetCC(dl, VT, And, Zero, Cond);
     // Its not uncommon for known-never-zero X to exist in (ctpop X) eq/ne 1, so
-    // check before the emit a potentially unnecessary op.
-    if (DAG.isKnownNeverZero(CTOp))
+    // check before emitting a potentially unnecessary op.
+    if (DAG.isKnownNeverZero(CTOp)) {
+      // (ctpop x) == 1 --> (x & x-1) == 0
+      // (ctpop x) != 1 --> (x & x-1) != 0
+      SDValue And = DAG.getNode(ISD::AND, dl, CTVT, CTOp, Add);
+      SDValue RHS = DAG.getSetCC(dl, VT, And, Zero, Cond);
       return RHS;
-    SDValue LHS = DAG.getSetCC(dl, VT, CTOp, Zero, InvCond);
-    unsigned LogicOpcode = Cond == ISD::SETEQ ? ISD::AND : ISD::OR;
-    return DAG.getNode(LogicOpcode, dl, VT, LHS, RHS);
+    } else {
+      // (ctpop x) == 1 --> (x ^ x-1) >  x-1
+      // (ctpop x) != 1 --> (x ^ x-1) <= x-1
+      SDValue Xor = DAG.getNode(ISD::XOR, dl, CTVT, CTOp, Add);
+      ISD::CondCode CmpCond = Cond == ISD::SETEQ ? ISD::SETUGT : ISD::SETULE;
+      return DAG.getSetCC(dl, VT, Xor, Add, CmpCond);
+    }
   }
 
   return SDValue();
diff --git a/llvm/test/CodeGen/X86/ctpop-combine.ll b/llvm/test/CodeGen/X86/ctpop-combine.ll
index a33319e66d5f111..fba44218e05726b 100644
--- a/llvm/test/CodeGen/X86/ctpop-combine.ll
+++ b/llvm/test/CodeGen/X86/ctpop-combine.ll
@@ -120,13 +120,11 @@ define i32 @ctpop_eq_one(i64 %x) nounwind readnone {
 ;
 ; NO-POPCOUNT-LABEL: ctpop_eq_one:
 ; NO-POPCOUNT:       # %bb.0:
-; NO-POPCOUNT-NEXT:    leaq -1(%rdi), %rax
-; NO-POPCOUNT-NEXT:    testq %rax, %rdi
-; NO-POPCOUNT-NEXT:    sete %al
-; NO-POPCOUNT-NEXT:    testq %rdi, %rdi
-; NO-POPCOUNT-NEXT:    setne %cl
-; NO-POPCOUNT-NEXT:    andb %al, %cl
-; NO-POPCOUNT-NEXT:    movzbl %cl, %eax
+; NO-POPCOUNT-NEXT:    leaq -1(%rdi), %rcx
+; NO-POPCOUNT-NEXT:    xorq %rcx, %rdi
+; NO-POPCOUNT-NEXT:    xorl %eax, %eax
+; NO-POPCOUNT-NEXT:    cmpq %rcx, %rdi
+; NO-POPCOUNT-NEXT:    seta %al
 ; NO-POPCOUNT-NEXT:    retq
   %count = tail call i64 @llvm.ctpop.i64(i64 %x)
   %cmp = icmp eq i64 %count, 1
@@ -145,13 +143,11 @@ define i32 @ctpop_ne_one(i64 %x) nounwind readnone {
 ;
 ; NO-POPCOUNT-LABEL: ctpop_ne_one:
 ; NO-POPCOUNT:       # %bb.0:
-; NO-POPCOUNT-NEXT:    leaq -1(%rdi), %rax
-; NO-POPCOUNT-NEXT:    testq %rax, %rdi
-; NO-POPCOUNT-NEXT:    setne %al
-; NO-POPCOUNT-NEXT:    testq %rdi, %rdi
-; NO-POPCOUNT-NEXT:    sete %cl
-; NO-POPCOUNT-NEXT:    orb %al, %cl
-; NO-POPCOUNT-NEXT:    movzbl %cl, %eax
+; NO-POPCOUNT-NEXT:    leaq -1(%rdi), %rcx
+; NO-POPCOUNT-NEXT:    xorq %rcx, %rdi
+; NO-POPCOUNT-NEXT:    xorl %eax, %eax
+; NO-POPCOUNT-NEXT:    cmpq %rcx, %rdi
+; NO-POPCOUNT-NEXT:    setbe %al
 ; NO-POPCOUNT-NEXT:    retq
   %count = tail call i64 @llvm.ctpop.i64(i64 %x)
   %cmp = icmp ne i64 %count, 1
@@ -162,29 +158,26 @@ define i32 @ctpop_ne_one(i64 %x) nounwind readnone {
 define i1 @ctpop_trunc_non_power2(i255 %x) nounwind {
 ; CHECK-LABEL: ctpop_trunc_non_power2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
-; CHECK-NEXT:    movq %rcx, %r8
-; CHECK-NEXT:    andq %rax, %r8
-; CHECK-NEXT:    movq %rdi, %r9
-; CHECK-NEXT:    addq $-1, %r9
-; CHECK-NEXT:    movq %rsi, %r10
-; CHECK-NEXT:    adcq $-1, %r10
-; CHECK-NEXT:    movq %rdx, %r11
-; CHECK-NEXT:    adcq $-1, %r11
-; CHECK-NEXT:    adcq %rax, %rcx
-; CHECK-NEXT:    andq %rdi, %r9
-; CHECK-NEXT:    andq %rdx, %r11
-; CHECK-NEXT:    orq %r9, %r11
-; CHECK-NEXT:    andq %r8, %rcx
-; CHECK-NEXT:    andq %rsi, %r10
-; CHECK-NEXT:    orq %rcx, %r10
-; CHECK-NEXT:    orq %r11, %r10
-; CHECK-NEXT:    sete %cl
-; CHECK-NEXT:    orq %rdx, %rdi
-; CHECK-NEXT:    orq %rsi, %r8
-; CHECK-NEXT:    orq %rdi, %r8
-; CHECK-NEXT:    setne %al
-; CHECK-NEXT:    andb %cl, %al
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    addq $-1, %rax
+; CHECK-NEXT:    movq %rsi, %r8
+; CHECK-NEXT:    adcq $-1, %r8
+; CHECK-NEXT:    movq %rdx, %r9
+; CHECK-NEXT:    adcq $-1, %r9
+; CHECK-NEXT:    movabsq $9223372036854775807, %r10 # imm = 0x7FFFFFFFFFFFFFFF
+; CHECK-NEXT:    movq %rcx, %r11
+; CHECK-NEXT:    adcq %r10, %r11
+; CHECK-NEXT:    xorq %r11, %rcx
+; CHECK-NEXT:    andq %r10, %r11
+; CHECK-NEXT:    andq %r10, %rcx
+; CHECK-NEXT:    xorq %r9, %rdx
+; CHECK-NEXT:    xorq %r8, %rsi
+; CHECK-NEXT:    xorq %rax, %rdi
+; CHECK-NEXT:    cmpq %rdi, %rax
+; CHECK-NEXT:    sbbq %rsi, %r8
+; CHECK-NEXT:    sbbq %rdx, %r9
+; CHECK-NEXT:    sbbq %rcx, %r11
+; CHECK-NEXT:    setb %al
 ; CHECK-NEXT:    retq
   %a = call i255 @llvm.ctpop.i255(i255 %x)
   %b = trunc i255 %a to i8 ; largest value from ctpop is 255, fits in 8 bits.
diff --git a/llvm/test/CodeGen/X86/ispow2.ll b/llvm/test/CodeGen/X86/ispow2.ll
index 4051e4d7f5b5dc0..8723432de8b6b00 100644
--- a/llvm/test/CodeGen/X86/ispow2.ll
+++ b/llvm/test/CodeGen/X86/ispow2.ll
@@ -28,25 +28,14 @@ define i1 @is_pow2_non_zero(i32 %xin) {
 }
 
 define i1 @is_pow2_non_zero_x_maybe_z(i32 %x) {
-; CHECK-NOBMI-LABEL: is_pow2_non_zero_x_maybe_z:
-; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    # kill: def $edi killed $edi def $rdi
-; CHECK-NOBMI-NEXT:    leal -1(%rdi), %eax
-; CHECK-NOBMI-NEXT:    testl %eax, %edi
-; CHECK-NOBMI-NEXT:    sete %cl
-; CHECK-NOBMI-NEXT:    testl %edi, %edi
-; CHECK-NOBMI-NEXT:    setne %al
-; CHECK-NOBMI-NEXT:    andb %cl, %al
-; CHECK-NOBMI-NEXT:    retq
-;
-; CHECK-BMI2-LABEL: is_pow2_non_zero_x_maybe_z:
-; CHECK-BMI2:       # %bb.0:
-; CHECK-BMI2-NEXT:    testl %edi, %edi
-; CHECK-BMI2-NEXT:    setne %cl
-; CHECK-BMI2-NEXT:    blsrl %edi, %eax
-; CHECK-BMI2-NEXT:    sete %al
-; CHECK-BMI2-NEXT:    andb %cl, %al
-; CHECK-BMI2-NEXT:    retq
+; CHECK-LABEL: is_pow2_non_zero_x_maybe_z:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT:    leal -1(%rdi), %eax
+; CHECK-NEXT:    xorl %eax, %edi
+; CHECK-NEXT:    cmpl %eax, %edi
+; CHECK-NEXT:    seta %al
+; CHECK-NEXT:    retq
   %cnt = call i32 @llvm.ctpop.i32(i32 %x)
   %r = icmp eq i32 %cnt, 1
   ret i1 %r
@@ -180,44 +169,40 @@ define <4 x i1> @neither_pow2_non_zero_4xv64(<4 x i64> %xin) {
 define <4 x i1> @neither_pow2_non_zero_4xv64_x_maybe_z(<4 x i64> %x) {
 ; CHECK-NOBMI-LABEL: neither_pow2_non_zero_4xv64_x_maybe_z:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    pxor %xmm2, %xmm2
-; CHECK-NOBMI-NEXT:    pcmpeqd %xmm3, %xmm3
-; CHECK-NOBMI-NEXT:    movdqa %xmm1, %xmm4
-; CHECK-NOBMI-NEXT:    paddq %xmm3, %xmm4
-; CHECK-NOBMI-NEXT:    pand %xmm1, %xmm4
-; CHECK-NOBMI-NEXT:    pcmpeqd %xmm2, %xmm1
-; CHECK-NOBMI-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,0,3,2]
-; CHECK-NOBMI-NEXT:    pand %xmm1, %xmm5
-; CHECK-NOBMI-NEXT:    pcmpeqd %xmm2, %xmm4
-; CHECK-NOBMI-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,0,3,2]
-; CHECK-NOBMI-NEXT:    pand %xmm4, %xmm1
+; CHECK-NOBMI-NEXT:    pcmpeqd %xmm2, %xmm2
+; CHECK-NOBMI-NEXT:    movdqa %xmm1, %xmm3
+; CHECK-NOBMI-NEXT:    paddq %xmm2, %xmm3
+; CHECK-NOBMI-NEXT:    movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
+; CHECK-NOBMI-NEXT:    pxor %xmm4, %xmm3
 ; CHECK-NOBMI-NEXT:    pxor %xmm3, %xmm1
-; CHECK-NOBMI-NEXT:    por %xmm5, %xmm1
-; CHECK-NOBMI-NEXT:    movdqa %xmm0, %xmm4
-; CHECK-NOBMI-NEXT:    pcmpeqd %xmm2, %xmm4
-; CHECK-NOBMI-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[1,0,3,2]
-; CHECK-NOBMI-NEXT:    pand %xmm4, %xmm5
+; CHECK-NOBMI-NEXT:    movdqa %xmm1, %xmm5
+; CHECK-NOBMI-NEXT:    pcmpgtd %xmm3, %xmm5
+; CHECK-NOBMI-NEXT:    movdqa %xmm0, %xmm6
+; CHECK-NOBMI-NEXT:    paddq %xmm2, %xmm6
+; CHECK-NOBMI-NEXT:    pxor %xmm4, %xmm6
+; CHECK-NOBMI-NEXT:    pxor %xmm6, %xmm0
 ; CHECK-NOBMI-NEXT:    movdqa %xmm0, %xmm4
-; CHECK-NOBMI-NEXT:    paddq %xmm3, %xmm4
-; CHECK-NOBMI-NEXT:    pand %xmm4, %xmm0
-; CHECK-NOBMI-NEXT:    pcmpeqd %xmm2, %xmm0
-; CHECK-NOBMI-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,0,3,2]
-; CHECK-NOBMI-NEXT:    pand %xmm2, %xmm0
-; CHECK-NOBMI-NEXT:    pxor %xmm3, %xmm0
-; CHECK-NOBMI-NEXT:    por %xmm5, %xmm0
-; CHECK-NOBMI-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; CHECK-NOBMI-NEXT:    pcmpgtd %xmm6, %xmm4
+; CHECK-NOBMI-NEXT:    movdqa %xmm4, %xmm7
+; CHECK-NOBMI-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,2],xmm5[0,2]
+; CHECK-NOBMI-NEXT:    pcmpeqd %xmm3, %xmm1
+; CHECK-NOBMI-NEXT:    pcmpeqd %xmm6, %xmm0
+; CHECK-NOBMI-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; CHECK-NOBMI-NEXT:    andps %xmm7, %xmm0
+; CHECK-NOBMI-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,3],xmm5[1,3]
+; CHECK-NOBMI-NEXT:    orps %xmm4, %xmm0
+; CHECK-NOBMI-NEXT:    xorps %xmm2, %xmm0
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: neither_pow2_non_zero_4xv64_x_maybe_z:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm2
-; CHECK-AVX2-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
-; CHECK-AVX2-NEXT:    vpaddq %ymm3, %ymm0, %ymm4
-; CHECK-AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
-; CHECK-AVX2-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
-; CHECK-AVX2-NEXT:    vpxor %ymm3, %ymm0, %ymm0
-; CHECK-AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
+; CHECK-AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; CHECK-AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm2
+; CHECK-AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; CHECK-AVX2-NEXT:    vpxor %ymm3, %ymm2, %ymm2
+; CHECK-AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
+; CHECK-AVX2-NEXT:    vpcmpgtq %ymm2, %ymm0, %ymm0
+; CHECK-AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; CHECK-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; CHECK-AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vzeroupper
@@ -235,5 +220,3 @@ define <4 x i1> @neither_pow2_non_zero_4xv64_x_maybe_z(<4 x i64> %x) {
   %r = icmp ne <4 x i64> %cnt, <i64 1, i64 1, i64 1, i64 1>
   ret <4 x i1> %r
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/X86/vector-popcnt-128.ll b/llvm/test/CodeGen/X86/vector-popcnt-128.ll
index 61f0885c55be438..58cacfb0485ec6a 100644
--- a/llvm/test/CodeGen/X86/vector-popcnt-128.ll
+++ b/llvm/test/CodeGen/X86/vector-popcnt-128.ll
@@ -720,85 +720,37 @@ define <16 x i8> @foldv16i8() nounwind {
 }
 
 define <2 x i64> @eq_1_v2i64(<2 x i64> %0) {
-; SSE2-LABEL: eq_1_v2i64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT:    paddq %xmm0, %xmm2
-; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    pcmpeqd %xmm1, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2]
-; SSE2-NEXT:    pand %xmm3, %xmm0
-; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pandn %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE3-LABEL: eq_1_v2i64:
-; SSE3:       # %bb.0:
-; SSE3-NEXT:    pxor %xmm1, %xmm1
-; SSE3-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE3-NEXT:    paddq %xmm0, %xmm2
-; SSE3-NEXT:    pand %xmm0, %xmm2
-; SSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSE3-NEXT:    pcmpeqd %xmm1, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2]
-; SSE3-NEXT:    pand %xmm3, %xmm0
-; SSE3-NEXT:    pcmpeqd %xmm1, %xmm2
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
-; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    pandn %xmm1, %xmm0
-; SSE3-NEXT:    retq
-;
-; SSSE3-LABEL: eq_1_v2i64:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pxor %xmm1, %xmm1
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSSE3-NEXT:    paddq %xmm0, %xmm2
-; SSSE3-NEXT:    pand %xmm0, %xmm2
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2]
-; SSSE3-NEXT:    pand %xmm3, %xmm0
-; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
-; SSSE3-NEXT:    pand %xmm2, %xmm1
-; SSSE3-NEXT:    pandn %xmm1, %xmm0
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: eq_1_v2i64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT:    paddq %xmm0, %xmm2
-; SSE41-NEXT:    pand %xmm0, %xmm2
-; SSE41-NEXT:    pcmpeqq %xmm1, %xmm0
-; SSE41-NEXT:    pcmpeqq %xmm1, %xmm2
-; SSE41-NEXT:    pandn %xmm2, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: eq_1_v2i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE-NEXT:    paddq %xmm0, %xmm1
+; SSE-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE-NEXT:    pcmpeqd %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE-NEXT:    pand %xmm3, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX1OR2-LABEL: eq_1_v2i64:
 ; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1OR2-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm2
-; AVX1OR2-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1OR2-NEXT:    vpaddq %xmm3, %xmm0, %xmm3
-; AVX1OR2-NEXT:    vpand %xmm3, %xmm0, %xmm0
-; AVX1OR2-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT:    vpandn %xmm0, %xmm2, %xmm0
+; AVX1OR2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1OR2-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; AVX1OR2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1OR2-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX1OR2-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
 ; AVX1OR2-NEXT:    retq
 ;
 ; XOP-LABEL: eq_1_v2i64:
 ; XOP:       # %bb.0:
-; XOP-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; XOP-NEXT:    vpcomneqq %xmm1, %xmm0, %xmm2
-; XOP-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
-; XOP-NEXT:    vpaddq %xmm3, %xmm0, %xmm3
-; XOP-NEXT:    vpand %xmm3, %xmm0, %xmm0
-; XOP-NEXT:    vpcomeqq %xmm1, %xmm0, %xmm0
-; XOP-NEXT:    vpand %xmm0, %xmm2, %xmm0
+; XOP-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; XOP-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; XOP-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    vpcomgtuq %xmm1, %xmm0, %xmm0
 ; XOP-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: eq_1_v2i64:
@@ -818,24 +770,24 @@ define <2 x i64> @eq_1_v2i64(<2 x i64> %0) {
 ;
 ; BITALG_NOVLX-LABEL: eq_1_v2i64:
 ; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm2
-; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
-; BITALG_NOVLX-NEXT:    vpaddq %xmm3, %xmm0, %xmm3
-; BITALG_NOVLX-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; BITALG_NOVLX-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT:    vpminuq %zmm1, %zmm0, %zmm1
 ; BITALG_NOVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vpandn %xmm0, %xmm2, %xmm0
+; BITALG_NOVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
+; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: eq_1_v2i64:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm2
-; BITALG-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
-; BITALG-NEXT:    vpaddq %xmm3, %xmm0, %xmm3
-; BITALG-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; BITALG-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; BITALG-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; BITALG-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; BITALG-NEXT:    vpminuq %xmm1, %xmm0, %xmm1
 ; BITALG-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
-; BITALG-NEXT:    vpandn %xmm0, %xmm2, %xmm0
+; BITALG-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
 ; BITALG-NEXT:    retq
   %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
   %3 = icmp eq <2 x i64> %2, <i64 1, i64 1>
@@ -844,95 +796,40 @@ define <2 x i64> @eq_1_v2i64(<2 x i64> %0) {
 }
 
 define <2 x i64> @ne_1_v2i64(<2 x i64> %0) {
-; SSE2-LABEL: ne_1_v2i64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NEXT:    paddq %xmm2, %xmm4
-; SSE2-NEXT:    pand %xmm4, %xmm0
-; SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
-; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    por %xmm3, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE3-LABEL: ne_1_v2i64:
-; SSE3:       # %bb.0:
-; SSE3-NEXT:    pxor %xmm1, %xmm1
-; SSE3-NEXT:    movdqa %xmm0, %xmm2
-; SSE3-NEXT:    pcmpeqd %xmm1, %xmm2
-; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
-; SSE3-NEXT:    pand %xmm2, %xmm3
-; SSE3-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE3-NEXT:    movdqa %xmm0, %xmm4
-; SSE3-NEXT:    paddq %xmm2, %xmm4
-; SSE3-NEXT:    pand %xmm4, %xmm0
-; SSE3-NEXT:    pcmpeqd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
-; SSE3-NEXT:    pand %xmm1, %xmm0
-; SSE3-NEXT:    pxor %xmm2, %xmm0
-; SSE3-NEXT:    por %xmm3, %xmm0
-; SSE3-NEXT:    retq
-;
-; SSSE3-LABEL: ne_1_v2i64:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pxor %xmm1, %xmm1
-; SSSE3-NEXT:    movdqa %xmm0, %xmm2
-; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
-; SSSE3-NEXT:    pand %xmm2, %xmm3
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSSE3-NEXT:    movdqa %xmm0, %xmm4
-; SSSE3-NEXT:    paddq %xmm2, %xmm4
-; SSSE3-NEXT:    pand %xmm4, %xmm0
-; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
-; SSSE3-NEXT:    pand %xmm1, %xmm0
-; SSSE3-NEXT:    pxor %xmm2, %xmm0
-; SSSE3-NEXT:    por %xmm3, %xmm0
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: ne_1_v2i64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    movdqa %xmm0, %xmm2
-; SSE41-NEXT:    pcmpeqq %xmm1, %xmm2
-; SSE41-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSE41-NEXT:    movdqa %xmm0, %xmm4
-; SSE41-NEXT:    paddq %xmm3, %xmm4
-; SSE41-NEXT:    pand %xmm4, %xmm0
-; SSE41-NEXT:    pcmpeqq %xmm1, %xmm0
-; SSE41-NEXT:    pxor %xmm3, %xmm0
-; SSE41-NEXT:    por %xmm2, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: ne_1_v2i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    paddq %xmm1, %xmm2
+; SSE-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE-NEXT:    pxor %xmm2, %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm3
+; SSE-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE-NEXT:    pand %xmm4, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; SSE-NEXT:    por %xmm2, %xmm0
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX1OR2-LABEL: ne_1_v2i64:
 ; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1OR2-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm2
-; AVX1OR2-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1OR2-NEXT:    vpaddq %xmm3, %xmm0, %xmm4
-; AVX1OR2-NEXT:    vpand %xmm4, %xmm0, %xmm0
-; AVX1OR2-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT:    vpxor %xmm3, %xmm0, %xmm0
-; AVX1OR2-NEXT:    vpor %xmm0, %xmm2, %xmm0
+; A...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/72275


More information about the llvm-commits mailing list