[llvm] 98a07f7 - [X86] LowerCTPOP - "ctpop(i2 x) --> sub(x, (x >> 1))"

Wed Feb 21 05:55:45 PST 2024

Author: Simon Pilgrim
Date: 2024-02-21T13:53:47Z
New Revision: 98a07f72eefb43476ca9e7af3178879d6ef71464

URL: https://github.com/llvm/llvm-project/commit/98a07f72eefb43476ca9e7af3178879d6ef71464
DIFF: https://github.com/llvm/llvm-project/commit/98a07f72eefb43476ca9e7af3178879d6ef71464.diff

LOG: [X86] LowerCTPOP - "ctpop(i2 x) --> sub(x, (x >> 1))"

If the CTPOP operand has at most 2 active bits (after shifting out any known trailing zero bits), then we can avoid the i8 CTPOP multiply expansion entirely.

Another expansion pulled from #79823.

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/ctpop-mask.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ac2d1c76980adf..834b470a4a867c 100644

--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -31053,6 +31053,18 @@ static SDValue LowerCTPOP(SDValue N, const X86Subtarget &Subtarget,
     unsigned ActiveBits = Known.getBitWidth() - LZ;
     unsigned ShiftedActiveBits = Known.getBitWidth() - (LZ + TZ);
 
+    // i2 CTPOP - "ctpop(x) --> sub(x, (x >> 1))".
+    if (ShiftedActiveBits <= 2) {
+      if (ActiveBits > 2)
+        Op = DAG.getNode(ISD::SRL, DL, VT, Op,
+                         DAG.getShiftAmountConstant(TZ, VT, DL));
+      Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
+      Op = DAG.getNode(ISD::SUB, DL, MVT::i32, Op,
+                       DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
+                                   DAG.getShiftAmountConstant(1, VT, DL)));
+      return DAG.getZExtOrTrunc(Op, DL, VT);
+    }
+
     // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
     if (ShiftedActiveBits <= 8) {
       SDValue Mask11 = DAG.getConstant(0x11111111U, DL, MVT::i32);

diff  --git a/llvm/test/CodeGen/X86/ctpop-mask.ll b/llvm/test/CodeGen/X86/ctpop-mask.ll
index 602d9b511cdc06..4b03563fd9924a 100644
--- a/llvm/test/CodeGen/X86/ctpop-mask.ll
+++ b/llvm/test/CodeGen/X86/ctpop-mask.ll
@@ -33,22 +33,19 @@ define i64 @ctpop_mask2(i64 %x) nounwind readnone {
 ; X86-NO-POPCOUNT:       # %bb.0:
 ; X86-NO-POPCOUNT-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-POPCOUNT-NEXT:    andl $3, %eax
-; X86-NO-POPCOUNT-NEXT:    imull $134480385, %eax, %eax # imm = 0x8040201
-; X86-NO-POPCOUNT-NEXT:    shrl $3, %eax
-; X86-NO-POPCOUNT-NEXT:    andl $17895697, %eax # imm = 0x1111111
-; X86-NO-POPCOUNT-NEXT:    imull $286331153, %eax, %eax # imm = 0x11111111
-; X86-NO-POPCOUNT-NEXT:    shrl $28, %eax
+; X86-NO-POPCOUNT-NEXT:    movl %eax, %ecx
+; X86-NO-POPCOUNT-NEXT:    shrl %ecx
+; X86-NO-POPCOUNT-NEXT:    subl %ecx, %eax
 ; X86-NO-POPCOUNT-NEXT:    xorl %edx, %edx
 ; X86-NO-POPCOUNT-NEXT:    retl
 ;
 ; X64-NO-POPCOUNT-LABEL: ctpop_mask2:
 ; X64-NO-POPCOUNT:       # %bb.0:
-; X64-NO-POPCOUNT-NEXT:    andl $3, %edi
-; X64-NO-POPCOUNT-NEXT:    imull $134480385, %edi, %eax # imm = 0x8040201
-; X64-NO-POPCOUNT-NEXT:    shrl $3, %eax
-; X64-NO-POPCOUNT-NEXT:    andl $17895697, %eax # imm = 0x1111111
-; X64-NO-POPCOUNT-NEXT:    imull $286331153, %eax, %eax # imm = 0x11111111
-; X64-NO-POPCOUNT-NEXT:    shrl $28, %eax
+; X64-NO-POPCOUNT-NEXT:    movq %rdi, %rax
+; X64-NO-POPCOUNT-NEXT:    andl $3, %eax
+; X64-NO-POPCOUNT-NEXT:    movl %eax, %ecx
+; X64-NO-POPCOUNT-NEXT:    shrl %ecx
+; X64-NO-POPCOUNT-NEXT:    subl %ecx, %eax
 ; X64-NO-POPCOUNT-NEXT:    retq
   %mask = and i64 %x, 3
   %count = tail call i64 @llvm.ctpop.i64(i64 %mask)
@@ -71,25 +68,22 @@ define i32 @ctpop_shifted_mask2(i32 %x) nounwind readnone {
 ;
 ; X86-NO-POPCOUNT-LABEL: ctpop_shifted_mask2:
 ; X86-NO-POPCOUNT:       # %bb.0:
-; X86-NO-POPCOUNT-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-POPCOUNT-NEXT:    movl $1572864, %eax # imm = 0x180000
+; X86-NO-POPCOUNT-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NO-POPCOUNT-NEXT:    movl %eax, %ecx
+; X86-NO-POPCOUNT-NEXT:    shrl $20, %ecx
 ; X86-NO-POPCOUNT-NEXT:    shrl $19, %eax
-; X86-NO-POPCOUNT-NEXT:    andl $3, %eax
-; X86-NO-POPCOUNT-NEXT:    imull $134480385, %eax, %eax # imm = 0x8040201
-; X86-NO-POPCOUNT-NEXT:    shrl $3, %eax
-; X86-NO-POPCOUNT-NEXT:    andl $17895697, %eax # imm = 0x1111111
-; X86-NO-POPCOUNT-NEXT:    imull $286331153, %eax, %eax # imm = 0x11111111
-; X86-NO-POPCOUNT-NEXT:    shrl $28, %eax
+; X86-NO-POPCOUNT-NEXT:    subl %ecx, %eax
 ; X86-NO-POPCOUNT-NEXT:    retl
 ;
 ; X64-NO-POPCOUNT-LABEL: ctpop_shifted_mask2:
 ; X64-NO-POPCOUNT:       # %bb.0:
-; X64-NO-POPCOUNT-NEXT:    shrl $19, %edi
-; X64-NO-POPCOUNT-NEXT:    andl $3, %edi
-; X64-NO-POPCOUNT-NEXT:    imull $134480385, %edi, %eax # imm = 0x8040201
-; X64-NO-POPCOUNT-NEXT:    shrl $3, %eax
-; X64-NO-POPCOUNT-NEXT:    andl $17895697, %eax # imm = 0x1111111
-; X64-NO-POPCOUNT-NEXT:    imull $286331153, %eax, %eax # imm = 0x11111111
-; X64-NO-POPCOUNT-NEXT:    shrl $28, %eax
+; X64-NO-POPCOUNT-NEXT:    movl %edi, %eax
+; X64-NO-POPCOUNT-NEXT:    andl $1572864, %eax # imm = 0x180000
+; X64-NO-POPCOUNT-NEXT:    movl %eax, %ecx
+; X64-NO-POPCOUNT-NEXT:    shrl $20, %ecx
+; X64-NO-POPCOUNT-NEXT:    shrl $19, %eax
+; X64-NO-POPCOUNT-NEXT:    subl %ecx, %eax
 ; X64-NO-POPCOUNT-NEXT:    retq
   %mask = and i32 %x, 1572864 ; 3 << 19
   %count = tail call i32 @llvm.ctpop.i32(i32 %mask)