[llvm] r342989 - [X86] Add AVX512 support to combineVectorSizedSetCCEquality.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Tue Sep 25 09:27:12 PDT 2018


Author: ctopper
Date: Tue Sep 25 09:27:12 2018
New Revision: 342989

URL: http://llvm.org/viewvc/llvm-project?rev=342989&view=rev
Log:
[X86] Add AVX512 support to combineVectorSizedSetCCEquality.

Reviewers: spatel, RKSimon

Reviewed By: spatel

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D52424

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/setcc-wide-types.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=342989&r1=342988&r2=342989&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Tue Sep 25 09:27:12 2018
@@ -38653,12 +38653,15 @@ static SDValue combineVectorSizedSetCCEq
     return SDValue();
 
   // TODO: Use PXOR + PTEST for SSE4.1 or later?
-  // TODO: Add support for AVX-512.
   EVT VT = SetCC->getValueType(0);
   SDLoc DL(SetCC);
   if ((OpSize == 128 && Subtarget.hasSSE2()) ||
-      (OpSize == 256 && Subtarget.hasAVX2())) {
-    EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
+      (OpSize == 256 && Subtarget.hasAVX2()) ||
+      (OpSize == 512 && Subtarget.useAVX512Regs())) {
+    EVT VecVT = OpSize == 512 ? MVT::v16i32 :
+                OpSize == 256 ? MVT::v32i8 :
+                                MVT::v16i8;
+    EVT CmpVT = OpSize == 512 ? MVT::v16i1 : VecVT;
     SDValue Cmp;
     if (IsOrXorXorCCZero) {
       // This is a bitwise-combined equality comparison of 2 pairs of vectors:
@@ -38669,14 +38672,18 @@ static SDValue combineVectorSizedSetCCEq
       SDValue B = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(1));
       SDValue C = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(0));
       SDValue D = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(1));
-      SDValue Cmp1 = DAG.getSetCC(DL, VecVT, A, B, ISD::SETEQ);
-      SDValue Cmp2 = DAG.getSetCC(DL, VecVT, C, D, ISD::SETEQ);
-      Cmp = DAG.getNode(ISD::AND, DL, VecVT, Cmp1, Cmp2);
+      SDValue Cmp1 = DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
+      SDValue Cmp2 = DAG.getSetCC(DL, CmpVT, C, D, ISD::SETEQ);
+      Cmp = DAG.getNode(ISD::AND, DL, CmpVT, Cmp1, Cmp2);
     } else {
       SDValue VecX = DAG.getBitcast(VecVT, X);
       SDValue VecY = DAG.getBitcast(VecVT, Y);
-      Cmp = DAG.getSetCC(DL, VecVT, VecX, VecY, ISD::SETEQ);
+      Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
     }
+    // For 512-bits we want to emit a setcc that will lower to kortest.
+    if (OpSize == 512)
+      return DAG.getSetCC(DL, VT, DAG.getBitcast(MVT::i16, Cmp),
+                          DAG.getConstant(0xFFFF, DL, MVT::i16), CC);
     // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
     // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
     // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne

Modified: llvm/trunk/test/CodeGen/X86/setcc-wide-types.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/setcc-wide-types.ll?rev=342989&r1=342988&r2=342989&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/setcc-wide-types.ll (original)
+++ llvm/trunk/test/CodeGen/X86/setcc-wide-types.ll Tue Sep 25 09:27:12 2018
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2     | FileCheck %s --check-prefix=ANY --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx      | FileCheck %s --check-prefix=ANY --check-prefix=AVXANY --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2     | FileCheck %s --check-prefix=ANY --check-prefix=AVXANY --check-prefix=AVX256 --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f  | FileCheck %s --check-prefix=ANY --check-prefix=AVXANY --check-prefix=AVX256 --check-prefix=AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefix=ANY --check-prefix=AVXANY --check-prefix=AVX256 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2     | FileCheck %s --check-prefix=ANY --check-prefix=NO512 --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx      | FileCheck %s --check-prefix=ANY --check-prefix=NO512 --check-prefix=AVXANY --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2     | FileCheck %s --check-prefix=ANY --check-prefix=NO512 --check-prefix=AVXANY --check-prefix=AVX256 --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f  | FileCheck %s --check-prefix=ANY --check-prefix=AVXANY --check-prefix=AVX256 --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefix=ANY --check-prefix=AVXANY --check-prefix=AVX256 --check-prefix=AVX512 --check-prefix=AVX512BW
 
 ; Equality checks of 128/256-bit values can use PMOVMSK or PTEST to avoid scalarization.
 
@@ -319,93 +319,14 @@ define i32 @ne_i512(<8 x i64> %x, <8 x i
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512F-LABEL: ne_i512:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512F-NEXT:    vmovq %xmm2, %rdx
-; AVX512F-NEXT:    vextracti32x4 $3, %zmm0, %xmm3
-; AVX512F-NEXT:    vmovq %xmm3, %rsi
-; AVX512F-NEXT:    vmovq %xmm0, %rdi
-; AVX512F-NEXT:    vextracti32x4 $2, %zmm0, %xmm4
-; AVX512F-NEXT:    vmovq %xmm4, %rax
-; AVX512F-NEXT:    vpextrq $1, %xmm2, %r11
-; AVX512F-NEXT:    vpextrq $1, %xmm3, %r10
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %r9
-; AVX512F-NEXT:    vpextrq $1, %xmm4, %r8
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm0
-; AVX512F-NEXT:    vmovq %xmm0, %rcx
-; AVX512F-NEXT:    xorq %rdx, %rcx
-; AVX512F-NEXT:    vextracti32x4 $3, %zmm1, %xmm2
-; AVX512F-NEXT:    vmovq %xmm2, %rdx
-; AVX512F-NEXT:    xorq %rsi, %rdx
-; AVX512F-NEXT:    orq %rcx, %rdx
-; AVX512F-NEXT:    vmovq %xmm1, %rcx
-; AVX512F-NEXT:    xorq %rdi, %rcx
-; AVX512F-NEXT:    vextracti32x4 $2, %zmm1, %xmm3
-; AVX512F-NEXT:    vmovq %xmm3, %rsi
-; AVX512F-NEXT:    xorq %rax, %rsi
-; AVX512F-NEXT:    orq %rdx, %rsi
-; AVX512F-NEXT:    orq %rcx, %rsi
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    xorq %r11, %rax
-; AVX512F-NEXT:    vpextrq $1, %xmm2, %rcx
-; AVX512F-NEXT:    xorq %r10, %rcx
-; AVX512F-NEXT:    orq %rax, %rcx
-; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512F-NEXT:    xorq %r9, %rax
-; AVX512F-NEXT:    vpextrq $1, %xmm3, %rdx
-; AVX512F-NEXT:    xorq %r8, %rdx
-; AVX512F-NEXT:    orq %rcx, %rdx
-; AVX512F-NEXT:    orq %rax, %rdx
-; AVX512F-NEXT:    xorl %eax, %eax
-; AVX512F-NEXT:    orq %rsi, %rdx
-; AVX512F-NEXT:    setne %al
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512BW-LABEL: ne_i512:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512BW-NEXT:    vmovq %xmm2, %rdx
-; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm3
-; AVX512BW-NEXT:    vmovq %xmm3, %rsi
-; AVX512BW-NEXT:    vmovq %xmm0, %rdi
-; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, %xmm4
-; AVX512BW-NEXT:    vmovq %xmm4, %rax
-; AVX512BW-NEXT:    vpextrq $1, %xmm2, %r11
-; AVX512BW-NEXT:    vpextrq $1, %xmm3, %r10
-; AVX512BW-NEXT:    vpextrq $1, %xmm0, %r9
-; AVX512BW-NEXT:    vpextrq $1, %xmm4, %r8
-; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm0
-; AVX512BW-NEXT:    vmovq %xmm0, %rcx
-; AVX512BW-NEXT:    xorq %rdx, %rcx
-; AVX512BW-NEXT:    vextracti32x4 $3, %zmm1, %xmm2
-; AVX512BW-NEXT:    vmovq %xmm2, %rdx
-; AVX512BW-NEXT:    xorq %rsi, %rdx
-; AVX512BW-NEXT:    orq %rcx, %rdx
-; AVX512BW-NEXT:    vmovq %xmm1, %rcx
-; AVX512BW-NEXT:    xorq %rdi, %rcx
-; AVX512BW-NEXT:    vextracti32x4 $2, %zmm1, %xmm3
-; AVX512BW-NEXT:    vmovq %xmm3, %rsi
-; AVX512BW-NEXT:    xorq %rax, %rsi
-; AVX512BW-NEXT:    orq %rdx, %rsi
-; AVX512BW-NEXT:    orq %rcx, %rsi
-; AVX512BW-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512BW-NEXT:    xorq %r11, %rax
-; AVX512BW-NEXT:    vpextrq $1, %xmm2, %rcx
-; AVX512BW-NEXT:    xorq %r10, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512BW-NEXT:    xorq %r9, %rax
-; AVX512BW-NEXT:    vpextrq $1, %xmm3, %rdx
-; AVX512BW-NEXT:    xorq %r8, %rdx
-; AVX512BW-NEXT:    orq %rcx, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    xorl %eax, %eax
-; AVX512BW-NEXT:    orq %rsi, %rdx
-; AVX512BW-NEXT:    setne %al
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
+; AVX512-LABEL: ne_i512:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; AVX512-NEXT:    xorl %eax, %eax
+; AVX512-NEXT:    kortestw %k0, %k0
+; AVX512-NEXT:    setae %al
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %bcx = bitcast <8 x i64> %x to i512
   %bcy = bitcast <8 x i64> %y to i512
   %cmp = icmp ne i512 %bcx, %bcy
@@ -543,93 +464,14 @@ define i32 @eq_i512(<8 x i64> %x, <8 x i
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512F-LABEL: eq_i512:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512F-NEXT:    vmovq %xmm2, %rdx
-; AVX512F-NEXT:    vextracti32x4 $3, %zmm0, %xmm3
-; AVX512F-NEXT:    vmovq %xmm3, %rsi
-; AVX512F-NEXT:    vmovq %xmm0, %rdi
-; AVX512F-NEXT:    vextracti32x4 $2, %zmm0, %xmm4
-; AVX512F-NEXT:    vmovq %xmm4, %rax
-; AVX512F-NEXT:    vpextrq $1, %xmm2, %r11
-; AVX512F-NEXT:    vpextrq $1, %xmm3, %r10
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %r9
-; AVX512F-NEXT:    vpextrq $1, %xmm4, %r8
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm0
-; AVX512F-NEXT:    vmovq %xmm0, %rcx
-; AVX512F-NEXT:    xorq %rdx, %rcx
-; AVX512F-NEXT:    vextracti32x4 $3, %zmm1, %xmm2
-; AVX512F-NEXT:    vmovq %xmm2, %rdx
-; AVX512F-NEXT:    xorq %rsi, %rdx
-; AVX512F-NEXT:    orq %rcx, %rdx
-; AVX512F-NEXT:    vmovq %xmm1, %rcx
-; AVX512F-NEXT:    xorq %rdi, %rcx
-; AVX512F-NEXT:    vextracti32x4 $2, %zmm1, %xmm3
-; AVX512F-NEXT:    vmovq %xmm3, %rsi
-; AVX512F-NEXT:    xorq %rax, %rsi
-; AVX512F-NEXT:    orq %rdx, %rsi
-; AVX512F-NEXT:    orq %rcx, %rsi
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    xorq %r11, %rax
-; AVX512F-NEXT:    vpextrq $1, %xmm2, %rcx
-; AVX512F-NEXT:    xorq %r10, %rcx
-; AVX512F-NEXT:    orq %rax, %rcx
-; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512F-NEXT:    xorq %r9, %rax
-; AVX512F-NEXT:    vpextrq $1, %xmm3, %rdx
-; AVX512F-NEXT:    xorq %r8, %rdx
-; AVX512F-NEXT:    orq %rcx, %rdx
-; AVX512F-NEXT:    orq %rax, %rdx
-; AVX512F-NEXT:    xorl %eax, %eax
-; AVX512F-NEXT:    orq %rsi, %rdx
-; AVX512F-NEXT:    sete %al
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512BW-LABEL: eq_i512:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512BW-NEXT:    vmovq %xmm2, %rdx
-; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm3
-; AVX512BW-NEXT:    vmovq %xmm3, %rsi
-; AVX512BW-NEXT:    vmovq %xmm0, %rdi
-; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, %xmm4
-; AVX512BW-NEXT:    vmovq %xmm4, %rax
-; AVX512BW-NEXT:    vpextrq $1, %xmm2, %r11
-; AVX512BW-NEXT:    vpextrq $1, %xmm3, %r10
-; AVX512BW-NEXT:    vpextrq $1, %xmm0, %r9
-; AVX512BW-NEXT:    vpextrq $1, %xmm4, %r8
-; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm0
-; AVX512BW-NEXT:    vmovq %xmm0, %rcx
-; AVX512BW-NEXT:    xorq %rdx, %rcx
-; AVX512BW-NEXT:    vextracti32x4 $3, %zmm1, %xmm2
-; AVX512BW-NEXT:    vmovq %xmm2, %rdx
-; AVX512BW-NEXT:    xorq %rsi, %rdx
-; AVX512BW-NEXT:    orq %rcx, %rdx
-; AVX512BW-NEXT:    vmovq %xmm1, %rcx
-; AVX512BW-NEXT:    xorq %rdi, %rcx
-; AVX512BW-NEXT:    vextracti32x4 $2, %zmm1, %xmm3
-; AVX512BW-NEXT:    vmovq %xmm3, %rsi
-; AVX512BW-NEXT:    xorq %rax, %rsi
-; AVX512BW-NEXT:    orq %rdx, %rsi
-; AVX512BW-NEXT:    orq %rcx, %rsi
-; AVX512BW-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512BW-NEXT:    xorq %r11, %rax
-; AVX512BW-NEXT:    vpextrq $1, %xmm2, %rcx
-; AVX512BW-NEXT:    xorq %r10, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512BW-NEXT:    xorq %r9, %rax
-; AVX512BW-NEXT:    vpextrq $1, %xmm3, %rdx
-; AVX512BW-NEXT:    xorq %r8, %rdx
-; AVX512BW-NEXT:    orq %rcx, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    xorl %eax, %eax
-; AVX512BW-NEXT:    orq %rsi, %rdx
-; AVX512BW-NEXT:    sete %al
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
+; AVX512-LABEL: eq_i512:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; AVX512-NEXT:    xorl %eax, %eax
+; AVX512-NEXT:    kortestw %k0, %k0
+; AVX512-NEXT:    setb %al
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %bcx = bitcast <8 x i64> %x to i512
   %bcy = bitcast <8 x i64> %y to i512
   %cmp = icmp eq i512 %bcx, %bcy
@@ -909,58 +751,70 @@ define i32 @eq_i256_pair(i256* %a, i256*
 ; if we allowed 2 pairs of 64-byte loads per block.
 
 define i32 @ne_i512_pair(i512* %a, i512* %b) {
-; ANY-LABEL: ne_i512_pair:
-; ANY:       # %bb.0:
-; ANY-NEXT:    movq 32(%rdi), %r8
-; ANY-NEXT:    movq 48(%rdi), %r9
-; ANY-NEXT:    movq 40(%rdi), %rdx
-; ANY-NEXT:    movq 56(%rdi), %rcx
-; ANY-NEXT:    xorq 56(%rsi), %rcx
-; ANY-NEXT:    movq 120(%rdi), %rax
-; ANY-NEXT:    xorq 120(%rsi), %rax
-; ANY-NEXT:    orq %rcx, %rax
-; ANY-NEXT:    movq 88(%rdi), %rcx
-; ANY-NEXT:    xorq 88(%rsi), %rcx
-; ANY-NEXT:    orq %rcx, %rax
-; ANY-NEXT:    movq 24(%rdi), %rcx
-; ANY-NEXT:    xorq 24(%rsi), %rcx
-; ANY-NEXT:    xorq 40(%rsi), %rdx
-; ANY-NEXT:    orq %rcx, %rax
-; ANY-NEXT:    movq 104(%rdi), %rcx
-; ANY-NEXT:    xorq 104(%rsi), %rcx
-; ANY-NEXT:    orq %rdx, %rcx
-; ANY-NEXT:    movq 72(%rdi), %rdx
-; ANY-NEXT:    xorq 72(%rsi), %rdx
-; ANY-NEXT:    orq %rdx, %rcx
-; ANY-NEXT:    movq 16(%rdi), %r10
-; ANY-NEXT:    orq %rax, %rcx
-; ANY-NEXT:    movq 8(%rdi), %rax
-; ANY-NEXT:    xorq 8(%rsi), %rax
-; ANY-NEXT:    xorq 48(%rsi), %r9
-; ANY-NEXT:    orq %rax, %rcx
-; ANY-NEXT:    movq 112(%rdi), %rax
-; ANY-NEXT:    xorq 112(%rsi), %rax
-; ANY-NEXT:    orq %r9, %rax
-; ANY-NEXT:    movq 80(%rdi), %rdx
-; ANY-NEXT:    xorq 80(%rsi), %rdx
-; ANY-NEXT:    orq %rdx, %rax
-; ANY-NEXT:    movq (%rdi), %r9
-; ANY-NEXT:    xorq 16(%rsi), %r10
-; ANY-NEXT:    xorq (%rsi), %r9
-; ANY-NEXT:    xorq 32(%rsi), %r8
-; ANY-NEXT:    orq %r10, %rax
-; ANY-NEXT:    movq 96(%rdi), %rdx
-; ANY-NEXT:    movq 64(%rdi), %rdi
-; ANY-NEXT:    xorq 64(%rsi), %rdi
-; ANY-NEXT:    xorq 96(%rsi), %rdx
-; ANY-NEXT:    orq %r8, %rdx
-; ANY-NEXT:    orq %rdi, %rdx
-; ANY-NEXT:    orq %rax, %rdx
-; ANY-NEXT:    orq %r9, %rdx
-; ANY-NEXT:    xorl %eax, %eax
-; ANY-NEXT:    orq %rcx, %rdx
-; ANY-NEXT:    setne %al
-; ANY-NEXT:    retq
+; NO512-LABEL: ne_i512_pair:
+; NO512:       # %bb.0:
+; NO512-NEXT:    movq 32(%rdi), %r8
+; NO512-NEXT:    movq 48(%rdi), %r9
+; NO512-NEXT:    movq 40(%rdi), %rdx
+; NO512-NEXT:    movq 56(%rdi), %rcx
+; NO512-NEXT:    xorq 56(%rsi), %rcx
+; NO512-NEXT:    movq 120(%rdi), %rax
+; NO512-NEXT:    xorq 120(%rsi), %rax
+; NO512-NEXT:    orq %rcx, %rax
+; NO512-NEXT:    movq 88(%rdi), %rcx
+; NO512-NEXT:    xorq 88(%rsi), %rcx
+; NO512-NEXT:    orq %rcx, %rax
+; NO512-NEXT:    movq 24(%rdi), %rcx
+; NO512-NEXT:    xorq 24(%rsi), %rcx
+; NO512-NEXT:    xorq 40(%rsi), %rdx
+; NO512-NEXT:    orq %rcx, %rax
+; NO512-NEXT:    movq 104(%rdi), %rcx
+; NO512-NEXT:    xorq 104(%rsi), %rcx
+; NO512-NEXT:    orq %rdx, %rcx
+; NO512-NEXT:    movq 72(%rdi), %rdx
+; NO512-NEXT:    xorq 72(%rsi), %rdx
+; NO512-NEXT:    orq %rdx, %rcx
+; NO512-NEXT:    movq 16(%rdi), %r10
+; NO512-NEXT:    orq %rax, %rcx
+; NO512-NEXT:    movq 8(%rdi), %rax
+; NO512-NEXT:    xorq 8(%rsi), %rax
+; NO512-NEXT:    xorq 48(%rsi), %r9
+; NO512-NEXT:    orq %rax, %rcx
+; NO512-NEXT:    movq 112(%rdi), %rax
+; NO512-NEXT:    xorq 112(%rsi), %rax
+; NO512-NEXT:    orq %r9, %rax
+; NO512-NEXT:    movq 80(%rdi), %rdx
+; NO512-NEXT:    xorq 80(%rsi), %rdx
+; NO512-NEXT:    orq %rdx, %rax
+; NO512-NEXT:    movq (%rdi), %r9
+; NO512-NEXT:    xorq 16(%rsi), %r10
+; NO512-NEXT:    xorq (%rsi), %r9
+; NO512-NEXT:    xorq 32(%rsi), %r8
+; NO512-NEXT:    orq %r10, %rax
+; NO512-NEXT:    movq 96(%rdi), %rdx
+; NO512-NEXT:    movq 64(%rdi), %rdi
+; NO512-NEXT:    xorq 64(%rsi), %rdi
+; NO512-NEXT:    xorq 96(%rsi), %rdx
+; NO512-NEXT:    orq %r8, %rdx
+; NO512-NEXT:    orq %rdi, %rdx
+; NO512-NEXT:    orq %rax, %rdx
+; NO512-NEXT:    orq %r9, %rdx
+; NO512-NEXT:    xorl %eax, %eax
+; NO512-NEXT:    orq %rcx, %rdx
+; NO512-NEXT:    setne %al
+; NO512-NEXT:    retq
+;
+; AVX512-LABEL: ne_i512_pair:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0
+; AVX512-NEXT:    vmovdqu64 64(%rdi), %zmm1
+; AVX512-NEXT:    vpcmpeqd (%rsi), %zmm0, %k1
+; AVX512-NEXT:    vpcmpeqd 64(%rsi), %zmm1, %k0 {%k1}
+; AVX512-NEXT:    xorl %eax, %eax
+; AVX512-NEXT:    kortestw %k0, %k0
+; AVX512-NEXT:    setae %al
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %a0 = load i512, i512* %a
   %b0 = load i512, i512* %b
   %xor1 = xor i512 %a0, %b0
@@ -979,58 +833,70 @@ define i32 @ne_i512_pair(i512* %a, i512*
 ; if we allowed 2 pairs of 64-byte loads per block.
 
 define i32 @eq_i512_pair(i512* %a, i512* %b) {
-; ANY-LABEL: eq_i512_pair:
-; ANY:       # %bb.0:
-; ANY-NEXT:    movq 32(%rdi), %r8
-; ANY-NEXT:    movq 48(%rdi), %r9
-; ANY-NEXT:    movq 40(%rdi), %rdx
-; ANY-NEXT:    movq 56(%rdi), %rcx
-; ANY-NEXT:    xorq 56(%rsi), %rcx
-; ANY-NEXT:    movq 120(%rdi), %rax
-; ANY-NEXT:    xorq 120(%rsi), %rax
-; ANY-NEXT:    orq %rcx, %rax
-; ANY-NEXT:    movq 88(%rdi), %rcx
-; ANY-NEXT:    xorq 88(%rsi), %rcx
-; ANY-NEXT:    orq %rcx, %rax
-; ANY-NEXT:    movq 24(%rdi), %rcx
-; ANY-NEXT:    xorq 24(%rsi), %rcx
-; ANY-NEXT:    xorq 40(%rsi), %rdx
-; ANY-NEXT:    orq %rcx, %rax
-; ANY-NEXT:    movq 104(%rdi), %rcx
-; ANY-NEXT:    xorq 104(%rsi), %rcx
-; ANY-NEXT:    orq %rdx, %rcx
-; ANY-NEXT:    movq 72(%rdi), %rdx
-; ANY-NEXT:    xorq 72(%rsi), %rdx
-; ANY-NEXT:    orq %rdx, %rcx
-; ANY-NEXT:    movq 16(%rdi), %r10
-; ANY-NEXT:    orq %rax, %rcx
-; ANY-NEXT:    movq 8(%rdi), %rax
-; ANY-NEXT:    xorq 8(%rsi), %rax
-; ANY-NEXT:    xorq 48(%rsi), %r9
-; ANY-NEXT:    orq %rax, %rcx
-; ANY-NEXT:    movq 112(%rdi), %rax
-; ANY-NEXT:    xorq 112(%rsi), %rax
-; ANY-NEXT:    orq %r9, %rax
-; ANY-NEXT:    movq 80(%rdi), %rdx
-; ANY-NEXT:    xorq 80(%rsi), %rdx
-; ANY-NEXT:    orq %rdx, %rax
-; ANY-NEXT:    movq (%rdi), %r9
-; ANY-NEXT:    xorq 16(%rsi), %r10
-; ANY-NEXT:    xorq (%rsi), %r9
-; ANY-NEXT:    xorq 32(%rsi), %r8
-; ANY-NEXT:    orq %r10, %rax
-; ANY-NEXT:    movq 96(%rdi), %rdx
-; ANY-NEXT:    movq 64(%rdi), %rdi
-; ANY-NEXT:    xorq 64(%rsi), %rdi
-; ANY-NEXT:    xorq 96(%rsi), %rdx
-; ANY-NEXT:    orq %r8, %rdx
-; ANY-NEXT:    orq %rdi, %rdx
-; ANY-NEXT:    orq %rax, %rdx
-; ANY-NEXT:    orq %r9, %rdx
-; ANY-NEXT:    xorl %eax, %eax
-; ANY-NEXT:    orq %rcx, %rdx
-; ANY-NEXT:    sete %al
-; ANY-NEXT:    retq
+; NO512-LABEL: eq_i512_pair:
+; NO512:       # %bb.0:
+; NO512-NEXT:    movq 32(%rdi), %r8
+; NO512-NEXT:    movq 48(%rdi), %r9
+; NO512-NEXT:    movq 40(%rdi), %rdx
+; NO512-NEXT:    movq 56(%rdi), %rcx
+; NO512-NEXT:    xorq 56(%rsi), %rcx
+; NO512-NEXT:    movq 120(%rdi), %rax
+; NO512-NEXT:    xorq 120(%rsi), %rax
+; NO512-NEXT:    orq %rcx, %rax
+; NO512-NEXT:    movq 88(%rdi), %rcx
+; NO512-NEXT:    xorq 88(%rsi), %rcx
+; NO512-NEXT:    orq %rcx, %rax
+; NO512-NEXT:    movq 24(%rdi), %rcx
+; NO512-NEXT:    xorq 24(%rsi), %rcx
+; NO512-NEXT:    xorq 40(%rsi), %rdx
+; NO512-NEXT:    orq %rcx, %rax
+; NO512-NEXT:    movq 104(%rdi), %rcx
+; NO512-NEXT:    xorq 104(%rsi), %rcx
+; NO512-NEXT:    orq %rdx, %rcx
+; NO512-NEXT:    movq 72(%rdi), %rdx
+; NO512-NEXT:    xorq 72(%rsi), %rdx
+; NO512-NEXT:    orq %rdx, %rcx
+; NO512-NEXT:    movq 16(%rdi), %r10
+; NO512-NEXT:    orq %rax, %rcx
+; NO512-NEXT:    movq 8(%rdi), %rax
+; NO512-NEXT:    xorq 8(%rsi), %rax
+; NO512-NEXT:    xorq 48(%rsi), %r9
+; NO512-NEXT:    orq %rax, %rcx
+; NO512-NEXT:    movq 112(%rdi), %rax
+; NO512-NEXT:    xorq 112(%rsi), %rax
+; NO512-NEXT:    orq %r9, %rax
+; NO512-NEXT:    movq 80(%rdi), %rdx
+; NO512-NEXT:    xorq 80(%rsi), %rdx
+; NO512-NEXT:    orq %rdx, %rax
+; NO512-NEXT:    movq (%rdi), %r9
+; NO512-NEXT:    xorq 16(%rsi), %r10
+; NO512-NEXT:    xorq (%rsi), %r9
+; NO512-NEXT:    xorq 32(%rsi), %r8
+; NO512-NEXT:    orq %r10, %rax
+; NO512-NEXT:    movq 96(%rdi), %rdx
+; NO512-NEXT:    movq 64(%rdi), %rdi
+; NO512-NEXT:    xorq 64(%rsi), %rdi
+; NO512-NEXT:    xorq 96(%rsi), %rdx
+; NO512-NEXT:    orq %r8, %rdx
+; NO512-NEXT:    orq %rdi, %rdx
+; NO512-NEXT:    orq %rax, %rdx
+; NO512-NEXT:    orq %r9, %rdx
+; NO512-NEXT:    xorl %eax, %eax
+; NO512-NEXT:    orq %rcx, %rdx
+; NO512-NEXT:    sete %al
+; NO512-NEXT:    retq
+;
+; AVX512-LABEL: eq_i512_pair:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0
+; AVX512-NEXT:    vmovdqu64 64(%rdi), %zmm1
+; AVX512-NEXT:    vpcmpeqd (%rsi), %zmm0, %k1
+; AVX512-NEXT:    vpcmpeqd 64(%rsi), %zmm1, %k0 {%k1}
+; AVX512-NEXT:    xorl %eax, %eax
+; AVX512-NEXT:    kortestw %k0, %k0
+; AVX512-NEXT:    setb %al
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %a0 = load i512, i512* %a
   %b0 = load i512, i512* %b
   %xor1 = xor i512 %a0, %b0




More information about the llvm-commits mailing list