[llvm] r338907 - [X86] Add a DAG combine for the __builtin_parity idiom used by clang to enable better codegen

Fri Aug 3 11:00:29 PDT 2018

Author: ctopper
Date: Fri Aug  3 11:00:29 2018
New Revision: 338907

URL: http://llvm.org/viewvc/llvm-project?rev=338907&view=rev
Log:
[X86] Add a DAG combine for the __builtin_parity idiom used by clang to enable better codegen

Clang uses "ctpop & 1" to implement __builtin_parity. If the popcnt instruction isn't supported, this generates a large amount of code to calculate the population count. Instead, we can bisect the data down to a single byte using xor and then check the parity flag.

Even when popcnt is supported, it's still a good idea to split 64-bit data on 32-bit targets by using an xor in front of a single popcnt. Otherwise we get two popcnts and an add before the and.

I've specifically targeted this at the sizes supported by clang builtins, but we could generalize this if we think that's useful.

Differential Revision: https://reviews.llvm.org/D50165

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/parity.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=338907&r1=338906&r2=338907&view=diff
==============================================================================

--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Fri Aug  3 11:00:29 2018
@@ -34710,6 +34710,73 @@ static SDValue combineAndLoadToBZHI(SDNo
   return SDValue();
 }
 
+// Look for (and (ctpop X), 1) which is the IR form of __builtin_parity.
+// Turn it into a series of XORs and a setnp.
+static SDValue combineParity(SDNode *N, SelectionDAG &DAG,
+                             const X86Subtarget &Subtarget) {
+  EVT VT = N->getValueType(0);
+
+  // We only support 64-bit and 32-bit. 64-bit requires special handling
+  // unless the 64-bit popcnt instruction is legal.
+  if (VT != MVT::i32 && VT != MVT::i64)
+    return SDValue();
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (TLI.isTypeLegal(VT) && TLI.isOperationLegal(ISD::CTPOP, VT))
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  // LHS needs to be a single-use CTPOP.
+  if (N0.getOpcode() != ISD::CTPOP || !N0.hasOneUse())
+    return SDValue();
+
+  // RHS needs to be 1.
+  if (!isOneConstant(N1))
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue X = N0.getOperand(0);
+
+  // If this is 64-bit, it's always best to xor the two 32-bit pieces together
+  // even if we have popcnt.
+  if (VT == MVT::i64) {
+    SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
+                             DAG.getNode(ISD::SRL, DL, VT, X,
+                                         DAG.getConstant(32, DL, MVT::i8)));
+    SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
+    X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
+    // Generate a 32-bit parity idiom. This will bring us back here if we need
+    // to expand it too.
+    SDValue Parity = DAG.getNode(ISD::AND, DL, MVT::i32,
+                                 DAG.getNode(ISD::CTPOP, DL, MVT::i32, X),
+                                 DAG.getConstant(1, DL, MVT::i32));
+    return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Parity);
+  }
+  assert(VT == MVT::i32 && "Unexpected VT!");
+
+  // Xor the high and low 16 bits together using a 32-bit operation.
+  SDValue Hi16 = DAG.getNode(ISD::SRL, DL, VT, X,
+                             DAG.getConstant(16, DL, MVT::i8));
+  X = DAG.getNode(ISD::XOR, DL, VT, X, Hi16);
+
+  // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
+  // This should allow an h-reg to be used to save a shift.
+  // FIXME: We only get an h-reg in 32-bit mode.
+  SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
+                           DAG.getNode(ISD::SRL, DL, VT, X,
+                                       DAG.getConstant(8, DL, MVT::i8)));
+  SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
+  SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
+  SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
+
+  // setnp copies the inverse of PF (set on even bit count) into a register.
+  SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
+  // Zero extend to the original type.
+  return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0), Setnp);
+}
+
 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
@@ -34737,6 +34804,10 @@ static SDValue combineAnd(SDNode *N, Sel
     }
   }
 
+  // This must be done before legalization has expanded the ctpop.
+  if (SDValue V = combineParity(N, DAG, Subtarget))
+    return V;
+
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 

Modified: llvm/trunk/test/CodeGen/X86/parity.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/parity.ll?rev=338907&r1=338906&r2=338907&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/parity.ll (original)
+++ llvm/trunk/test/CodeGen/X86/parity.ll Fri Aug  3 11:00:29 2018
@@ -9,41 +9,23 @@ define i32 @parity_32(i32 %x) {
 ; X86-NOPOPCNT:       # %bb.0:
 ; X86-NOPOPCNT-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOPOPCNT-NEXT:    movl %eax, %ecx
-; X86-NOPOPCNT-NEXT:    shrl %ecx
-; X86-NOPOPCNT-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NOPOPCNT-NEXT:    subl %ecx, %eax
-; X86-NOPOPCNT-NEXT:    movl %eax, %ecx
-; X86-NOPOPCNT-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NOPOPCNT-NEXT:    shrl $2, %eax
-; X86-NOPOPCNT-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X86-NOPOPCNT-NEXT:    addl %ecx, %eax
-; X86-NOPOPCNT-NEXT:    movl %eax, %ecx
-; X86-NOPOPCNT-NEXT:    shrl $4, %ecx
-; X86-NOPOPCNT-NEXT:    addl %eax, %ecx
-; X86-NOPOPCNT-NEXT:    andl $17764111, %ecx # imm = 0x10F0F0F
-; X86-NOPOPCNT-NEXT:    imull $16843009, %ecx, %eax # imm = 0x1010101
-; X86-NOPOPCNT-NEXT:    shrl $24, %eax
-; X86-NOPOPCNT-NEXT:    andl $1, %eax
+; X86-NOPOPCNT-NEXT:    shrl $16, %ecx
+; X86-NOPOPCNT-NEXT:    xorl %eax, %ecx
+; X86-NOPOPCNT-NEXT:    xorl %eax, %eax
+; X86-NOPOPCNT-NEXT:    xorb %ch, %cl
+; X86-NOPOPCNT-NEXT:    setnp %al
 ; X86-NOPOPCNT-NEXT:    retl
 ;
 ; X64-NOPOPCNT-LABEL: parity_32:
 ; X64-NOPOPCNT:       # %bb.0:
-; X64-NOPOPCNT-NEXT:    movl %edi, %eax
-; X64-NOPOPCNT-NEXT:    shrl %eax
-; X64-NOPOPCNT-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X64-NOPOPCNT-NEXT:    subl %eax, %edi
-; X64-NOPOPCNT-NEXT:    movl %edi, %eax
-; X64-NOPOPCNT-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X64-NOPOPCNT-NEXT:    shrl $2, %edi
-; X64-NOPOPCNT-NEXT:    andl $858993459, %edi # imm = 0x33333333
-; X64-NOPOPCNT-NEXT:    addl %eax, %edi
-; X64-NOPOPCNT-NEXT:    movl %edi, %eax
-; X64-NOPOPCNT-NEXT:    shrl $4, %eax
-; X64-NOPOPCNT-NEXT:    addl %edi, %eax
-; X64-NOPOPCNT-NEXT:    andl $17764111, %eax # imm = 0x10F0F0F
-; X64-NOPOPCNT-NEXT:    imull $16843009, %eax, %eax # imm = 0x1010101
-; X64-NOPOPCNT-NEXT:    shrl $24, %eax
-; X64-NOPOPCNT-NEXT:    andl $1, %eax
+; X64-NOPOPCNT-NEXT:    movl %edi, %ecx
+; X64-NOPOPCNT-NEXT:    shrl $16, %ecx
+; X64-NOPOPCNT-NEXT:    xorl %edi, %ecx
+; X64-NOPOPCNT-NEXT:    movl %ecx, %edx
+; X64-NOPOPCNT-NEXT:    shrl $8, %edx
+; X64-NOPOPCNT-NEXT:    xorl %eax, %eax
+; X64-NOPOPCNT-NEXT:    xorb %cl, %dl
+; X64-NOPOPCNT-NEXT:    setnp %al
 ; X64-NOPOPCNT-NEXT:    retq
 ;
 ; X86-POPCNT-LABEL: parity_32:
@@ -66,71 +48,36 @@ define i64 @parity_64(i64 %x) {
 ; X86-NOPOPCNT-LABEL: parity_64:
 ; X86-NOPOPCNT:       # %bb.0:
 ; X86-NOPOPCNT-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOPOPCNT-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NOPOPCNT-NEXT:    movl %ecx, %edx
-; X86-NOPOPCNT-NEXT:    shrl %edx
-; X86-NOPOPCNT-NEXT:    andl $1431655765, %edx # imm = 0x55555555
-; X86-NOPOPCNT-NEXT:    subl %edx, %ecx
-; X86-NOPOPCNT-NEXT:    movl %ecx, %edx
-; X86-NOPOPCNT-NEXT:    andl $858993459, %edx # imm = 0x33333333
-; X86-NOPOPCNT-NEXT:    shrl $2, %ecx
-; X86-NOPOPCNT-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NOPOPCNT-NEXT:    addl %edx, %ecx
-; X86-NOPOPCNT-NEXT:    movl %ecx, %edx
-; X86-NOPOPCNT-NEXT:    shrl $4, %edx
-; X86-NOPOPCNT-NEXT:    addl %ecx, %edx
-; X86-NOPOPCNT-NEXT:    andl $17764111, %edx # imm = 0x10F0F0F
-; X86-NOPOPCNT-NEXT:    imull $16843009, %edx, %ecx # imm = 0x1010101
-; X86-NOPOPCNT-NEXT:    shrl $24, %ecx
-; X86-NOPOPCNT-NEXT:    movl %eax, %edx
-; X86-NOPOPCNT-NEXT:    shrl %edx
-; X86-NOPOPCNT-NEXT:    andl $1431655765, %edx # imm = 0x55555555
-; X86-NOPOPCNT-NEXT:    subl %edx, %eax
-; X86-NOPOPCNT-NEXT:    movl %eax, %edx
-; X86-NOPOPCNT-NEXT:    andl $858993459, %edx # imm = 0x33333333
-; X86-NOPOPCNT-NEXT:    shrl $2, %eax
-; X86-NOPOPCNT-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X86-NOPOPCNT-NEXT:    addl %edx, %eax
-; X86-NOPOPCNT-NEXT:    movl %eax, %edx
-; X86-NOPOPCNT-NEXT:    shrl $4, %edx
-; X86-NOPOPCNT-NEXT:    addl %eax, %edx
-; X86-NOPOPCNT-NEXT:    andl $17764111, %edx # imm = 0x10F0F0F
-; X86-NOPOPCNT-NEXT:    imull $16843009, %edx, %eax # imm = 0x1010101
-; X86-NOPOPCNT-NEXT:    shrl $24, %eax
-; X86-NOPOPCNT-NEXT:    addl %ecx, %eax
-; X86-NOPOPCNT-NEXT:    andl $1, %eax
+; X86-NOPOPCNT-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; X86-NOPOPCNT-NEXT:    movl %eax, %ecx
+; X86-NOPOPCNT-NEXT:    shrl $16, %ecx
+; X86-NOPOPCNT-NEXT:    xorl %eax, %ecx
+; X86-NOPOPCNT-NEXT:    xorl %eax, %eax
+; X86-NOPOPCNT-NEXT:    xorb %ch, %cl
+; X86-NOPOPCNT-NEXT:    setnp %al
 ; X86-NOPOPCNT-NEXT:    xorl %edx, %edx
 ; X86-NOPOPCNT-NEXT:    retl
 ;
 ; X64-NOPOPCNT-LABEL: parity_64:
 ; X64-NOPOPCNT:       # %bb.0:
 ; X64-NOPOPCNT-NEXT:    movq %rdi, %rax
-; X64-NOPOPCNT-NEXT:    shrq %rax
-; X64-NOPOPCNT-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
-; X64-NOPOPCNT-NEXT:    andq %rax, %rcx
-; X64-NOPOPCNT-NEXT:    subq %rcx, %rdi
-; X64-NOPOPCNT-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
-; X64-NOPOPCNT-NEXT:    movq %rdi, %rcx
-; X64-NOPOPCNT-NEXT:    andq %rax, %rcx
-; X64-NOPOPCNT-NEXT:    shrq $2, %rdi
-; X64-NOPOPCNT-NEXT:    andq %rax, %rdi
-; X64-NOPOPCNT-NEXT:    addq %rcx, %rdi
-; X64-NOPOPCNT-NEXT:    movq %rdi, %rax
-; X64-NOPOPCNT-NEXT:    shrq $4, %rax
-; X64-NOPOPCNT-NEXT:    leaq (%rax,%rdi), %rax
-; X64-NOPOPCNT-NEXT:    movabsq $76296276040158991, %rcx # imm = 0x10F0F0F0F0F0F0F
-; X64-NOPOPCNT-NEXT:    andq %rax, %rcx
-; X64-NOPOPCNT-NEXT:    movabsq $72340172838076673, %rax # imm = 0x101010101010101
-; X64-NOPOPCNT-NEXT:    imulq %rcx, %rax
-; X64-NOPOPCNT-NEXT:    shrq $56, %rax
-; X64-NOPOPCNT-NEXT:    andl $1, %eax
+; X64-NOPOPCNT-NEXT:    shrq $32, %rax
+; X64-NOPOPCNT-NEXT:    xorl %edi, %eax
+; X64-NOPOPCNT-NEXT:    movl %eax, %ecx
+; X64-NOPOPCNT-NEXT:    shrl $16, %ecx
+; X64-NOPOPCNT-NEXT:    xorl %eax, %ecx
+; X64-NOPOPCNT-NEXT:    movl %ecx, %edx
+; X64-NOPOPCNT-NEXT:    shrl $8, %edx
+; X64-NOPOPCNT-NEXT:    xorl %eax, %eax
+; X64-NOPOPCNT-NEXT:    xorb %cl, %dl
+; X64-NOPOPCNT-NEXT:    setnp %al
 ; X64-NOPOPCNT-NEXT:    retq
 ;
 ; X86-POPCNT-LABEL: parity_64:
 ; X86-POPCNT:       # %bb.0:
-; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %ecx
-; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %eax
-; X86-POPCNT-NEXT:    addl %ecx, %eax
+; X86-POPCNT-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-POPCNT-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; X86-POPCNT-NEXT:    popcntl %eax, %eax
 ; X86-POPCNT-NEXT:    andl $1, %eax
 ; X86-POPCNT-NEXT:    xorl %edx, %edx
 ; X86-POPCNT-NEXT:    retl