[llvm] [DAG] Add generic i8 CTPOP lowering using i32 MUL (PR #79989)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 1 03:25:18 PST 2024


https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/79989

>From 65b452ea62a789ea5b0a0f509ddab7dc45d4f500 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Tue, 30 Jan 2024 11:50:44 +0000
Subject: [PATCH] [DAG] Add generic i8 CTPOP lowering using i32 MUL

Limit this behind a TLI.shouldAllowMultiplyInBitCounts callback, as so far only x86 really benefits from this.

Fixes #79823
---
 llvm/include/llvm/CodeGen/TargetLowering.h    |   6 +
 .../CodeGen/SelectionDAG/TargetLowering.cpp   |  20 +-
 llvm/lib/Target/X86/X86ISelLowering.h         |   4 +
 llvm/test/CodeGen/X86/ctpop-combine.ll        |  21 +-
 llvm/test/CodeGen/X86/popcnt.ll               | 701 +++++++++---------
 5 files changed, 373 insertions(+), 379 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index d39094aa7fed7..f4e9d7baf82b6 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3258,6 +3258,12 @@ class TargetLoweringBase {
     return false;
   }
 
+  /// Return true if CTPOP/CTTZ/CTLZ/PARITY expansions should try to use integer
+  /// multiplies should the input value be suitable.
+  virtual bool shouldAllowMultiplyInBitCounts(EVT CntVT, EVT MulVT) const {
+    return false;
+  }
+
   // Should we fold (select_cc seteq (and x, y), 0, 0, A) -> (and (sra (shl x))
   // A) where y has a single bit set?
   virtual bool shouldFoldSelectWithSingleBitTest(EVT VT,
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 03b2a66989bd4..d0f0f4f6e89db 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8639,7 +8639,25 @@ SDValue TargetLowering::expandCTPOP(SDNode *Node, SelectionDAG &DAG) const {
   if (VT.isVector() && !canExpandVectorCTPOP(*this, VT))
     return SDValue();
 
-  // This is the "best" algorithm from
+  // i8 CTPOP - with efficient i32 MUL, attempt multiply-mask-multiply.
+  if (VT == MVT::i8 && shouldAllowMultiplyInBitCounts(MVT::i8, MVT::i32) &&
+      isOperationLegal(ISD::AND, MVT::i32) &&
+      isOperationLegal(ISD::SRL, MVT::i32) &&
+      isOperationLegal(ISD::MUL, MVT::i32)) {
+    SDValue Mask11 = DAG.getConstant(0x11111111U, dl, MVT::i32);
+    Op = DAG.getZExtOrTrunc(Op, dl, MVT::i32);
+    Op = DAG.getNode(ISD::MUL, dl, MVT::i32, Op,
+                     DAG.getConstant(0x08040201U, dl, MVT::i32));
+    Op = DAG.getNode(ISD::SRL, dl, MVT::i32, Op,
+                     DAG.getShiftAmountConstant(3, MVT::i32, dl));
+    Op = DAG.getNode(ISD::AND, dl, MVT::i32, Op, Mask11);
+    Op = DAG.getNode(ISD::MUL, dl, MVT::i32, Op, Mask11);
+    Op = DAG.getNode(ISD::SRL, dl, MVT::i32, Op,
+                     DAG.getShiftAmountConstant(28, MVT::i32, dl));
+    return DAG.getZExtOrTrunc(Op, dl, MVT::i8);
+  }
+
+  // This is the "best" fallback algorithm from
   // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
   SDValue Mask55 =
       DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), dl, VT);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 32745400a38b7..c87e29dc46db9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1174,6 +1174,10 @@ namespace llvm {
 
     bool shouldSplatInsEltVarIndex(EVT VT) const override;
 
+    bool shouldAllowMultiplyInBitCounts(EVT CntVT, EVT MulVT) const override {
+      return CntVT.isScalarInteger() && isOperationLegal(ISD::MUL, MulVT);
+    }
+
     bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override {
       // Converting to sat variants holds little benefit on X86 as we will just
       // need to saturate the value back using fp arithmatic.
diff --git a/llvm/test/CodeGen/X86/ctpop-combine.ll b/llvm/test/CodeGen/X86/ctpop-combine.ll
index fba44218e0572..73152e9f909cf 100644
--- a/llvm/test/CodeGen/X86/ctpop-combine.ll
+++ b/llvm/test/CodeGen/X86/ctpop-combine.ll
@@ -88,20 +88,13 @@ define i8 @test4(i8 %x) nounwind readnone {
 ;
 ; NO-POPCOUNT-LABEL: test4:
 ; NO-POPCOUNT:       # %bb.0:
-; NO-POPCOUNT-NEXT:    movl %edi, %ecx
-; NO-POPCOUNT-NEXT:    andb $127, %cl
-; NO-POPCOUNT-NEXT:    shrb %dil
-; NO-POPCOUNT-NEXT:    andb $21, %dil
-; NO-POPCOUNT-NEXT:    subb %dil, %cl
-; NO-POPCOUNT-NEXT:    movl %ecx, %eax
-; NO-POPCOUNT-NEXT:    andb $51, %al
-; NO-POPCOUNT-NEXT:    shrb $2, %cl
-; NO-POPCOUNT-NEXT:    andb $51, %cl
-; NO-POPCOUNT-NEXT:    addb %al, %cl
-; NO-POPCOUNT-NEXT:    movl %ecx, %eax
-; NO-POPCOUNT-NEXT:    shrb $4, %al
-; NO-POPCOUNT-NEXT:    addb %cl, %al
-; NO-POPCOUNT-NEXT:    andb $15, %al
+; NO-POPCOUNT-NEXT:    andl $127, %edi
+; NO-POPCOUNT-NEXT:    imull $134480385, %edi, %eax # imm = 0x8040201
+; NO-POPCOUNT-NEXT:    shrl $3, %eax
+; NO-POPCOUNT-NEXT:    andl $286331153, %eax # imm = 0x11111111
+; NO-POPCOUNT-NEXT:    imull $286331153, %eax, %eax # imm = 0x11111111
+; NO-POPCOUNT-NEXT:    shrl $28, %eax
+; NO-POPCOUNT-NEXT:    # kill: def $al killed $al killed $eax
 ; NO-POPCOUNT-NEXT:    retq
   %x2 = and i8 %x, 127
   %count = tail call i8 @llvm.ctpop.i8(i8 %x2)
diff --git a/llvm/test/CodeGen/X86/popcnt.ll b/llvm/test/CodeGen/X86/popcnt.ll
index a9d77fd2c0a61..37c7b051de7b1 100644
--- a/llvm/test/CodeGen/X86/popcnt.ll
+++ b/llvm/test/CodeGen/X86/popcnt.ll
@@ -1,46 +1,33 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefixes=X86,X86-NOSSE
-; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64
-; RUN: llc < %s -mtriple=i686-unknown -mattr=+popcnt | FileCheck %s --check-prefix=X86-POPCNT
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+popcnt | FileCheck %s --check-prefix=X64-POPCNT
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd | FileCheck %s --check-prefix=X64-NDD
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefixes=X64,X64-BASE
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+popcnt | FileCheck %s --check-prefixes=X86-POPCNT
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+popcnt | FileCheck %s --check-prefixes=X64-POPCNT
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd | FileCheck %s --check-prefixes=X64,X64-NDD
 ; RUN: llc < %s -mtriple=i686-unknown -mattr=sse2 | FileCheck %s --check-prefixes=X86,X86-SSE2
 ; RUN: llc < %s -mtriple=i686-unknown -mattr=ssse3 | FileCheck %s --check-prefixes=X86,X86-SSSE3
 
 define i8 @cnt8(i8 %x) nounwind readnone {
 ; X86-LABEL: cnt8:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    shrb %al
-; X86-NEXT:    andb $85, %al
-; X86-NEXT:    subb %al, %cl
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andb $51, %al
-; X86-NEXT:    shrb $2, %cl
-; X86-NEXT:    andb $51, %cl
-; X86-NEXT:    addb %al, %cl
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    shrb $4, %al
-; X86-NEXT:    addb %cl, %al
-; X86-NEXT:    andb $15, %al
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull $134480385, %eax, %eax # imm = 0x8040201
+; X86-NEXT:    shrl $3, %eax
+; X86-NEXT:    andl $286331153, %eax # imm = 0x11111111
+; X86-NEXT:    imull $286331153, %eax, %eax # imm = 0x11111111
+; X86-NEXT:    shrl $28, %eax
+; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: cnt8:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shrb %al
-; X64-NEXT:    andb $85, %al
-; X64-NEXT:    subb %al, %dil
-; X64-NEXT:    movl %edi, %ecx
-; X64-NEXT:    andb $51, %cl
-; X64-NEXT:    shrb $2, %dil
-; X64-NEXT:    andb $51, %dil
-; X64-NEXT:    addb %dil, %cl
-; X64-NEXT:    movl %ecx, %eax
-; X64-NEXT:    shrb $4, %al
-; X64-NEXT:    addb %cl, %al
-; X64-NEXT:    andb $15, %al
+; X64-NEXT:    movzbl %dil, %eax
+; X64-NEXT:    imull $134480385, %eax, %eax # imm = 0x8040201
+; X64-NEXT:    shrl $3, %eax
+; X64-NEXT:    andl $286331153, %eax # imm = 0x11111111
+; X64-NEXT:    imull $286331153, %eax, %eax # imm = 0x11111111
+; X64-NEXT:    shrl $28, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 ;
 ; X86-POPCNT-LABEL: cnt8:
@@ -56,20 +43,6 @@ define i8 @cnt8(i8 %x) nounwind readnone {
 ; X64-POPCNT-NEXT:    popcntl %eax, %eax
 ; X64-POPCNT-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-POPCNT-NEXT:    retq
-;
-; X64-NDD-LABEL: cnt8:
-; X64-NDD:       # %bb.0:
-; X64-NDD-NEXT:    shrb %dil, %al
-; X64-NDD-NEXT:    andb $85, %al
-; X64-NDD-NEXT:    subb %al, %dil, %al
-; X64-NDD-NEXT:    andb $51, %al, %cl
-; X64-NDD-NEXT:    shrb $2, %al
-; X64-NDD-NEXT:    andb $51, %al
-; X64-NDD-NEXT:    addb %cl, %al
-; X64-NDD-NEXT:    shrb $4, %al, %cl
-; X64-NDD-NEXT:    addb %cl, %al
-; X64-NDD-NEXT:    andb $15, %al
-; X64-NDD-NEXT:    retq
   %cnt = tail call i8 @llvm.ctpop.i8(i8 %x)
   ret i8 %cnt
 }
@@ -98,27 +71,27 @@ define i16 @cnt16(i16 %x) nounwind readnone {
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: cnt16:
-; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shrl %eax
-; X64-NEXT:    andl $21845, %eax # imm = 0x5555
-; X64-NEXT:    subl %eax, %edi
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    andl $13107, %eax # imm = 0x3333
-; X64-NEXT:    shrl $2, %edi
-; X64-NEXT:    andl $13107, %edi # imm = 0x3333
-; X64-NEXT:    addl %eax, %edi
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shrl $4, %eax
-; X64-NEXT:    addl %edi, %eax
-; X64-NEXT:    andl $3855, %eax # imm = 0xF0F
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    shrl $8, %ecx
-; X64-NEXT:    addl %eax, %ecx
-; X64-NEXT:    movzbl %cl, %eax
-; X64-NEXT:    # kill: def $ax killed $ax killed $eax
-; X64-NEXT:    retq
+; X64-BASE-LABEL: cnt16:
+; X64-BASE:       # %bb.0:
+; X64-BASE-NEXT:    movl %edi, %eax
+; X64-BASE-NEXT:    shrl %eax
+; X64-BASE-NEXT:    andl $21845, %eax # imm = 0x5555
+; X64-BASE-NEXT:    subl %eax, %edi
+; X64-BASE-NEXT:    movl %edi, %eax
+; X64-BASE-NEXT:    andl $13107, %eax # imm = 0x3333
+; X64-BASE-NEXT:    shrl $2, %edi
+; X64-BASE-NEXT:    andl $13107, %edi # imm = 0x3333
+; X64-BASE-NEXT:    addl %eax, %edi
+; X64-BASE-NEXT:    movl %edi, %eax
+; X64-BASE-NEXT:    shrl $4, %eax
+; X64-BASE-NEXT:    addl %edi, %eax
+; X64-BASE-NEXT:    andl $3855, %eax # imm = 0xF0F
+; X64-BASE-NEXT:    movl %eax, %ecx
+; X64-BASE-NEXT:    shrl $8, %ecx
+; X64-BASE-NEXT:    addl %eax, %ecx
+; X64-BASE-NEXT:    movzbl %cl, %eax
+; X64-BASE-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-BASE-NEXT:    retq
 ;
 ; X86-POPCNT-LABEL: cnt16:
 ; X86-POPCNT:       # %bb.0:
@@ -176,24 +149,24 @@ define i32 @cnt32(i32 %x) nounwind readnone {
 ; X86-NEXT:    shrl $24, %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: cnt32:
-; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shrl %eax
-; X64-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X64-NEXT:    subl %eax, %edi
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X64-NEXT:    shrl $2, %edi
-; X64-NEXT:    andl $858993459, %edi # imm = 0x33333333
-; X64-NEXT:    addl %eax, %edi
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shrl $4, %eax
-; X64-NEXT:    addl %edi, %eax
-; X64-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X64-NEXT:    imull $16843009, %eax, %eax # imm = 0x1010101
-; X64-NEXT:    shrl $24, %eax
-; X64-NEXT:    retq
+; X64-BASE-LABEL: cnt32:
+; X64-BASE:       # %bb.0:
+; X64-BASE-NEXT:    movl %edi, %eax
+; X64-BASE-NEXT:    shrl %eax
+; X64-BASE-NEXT:    andl $1431655765, %eax # imm = 0x55555555
+; X64-BASE-NEXT:    subl %eax, %edi
+; X64-BASE-NEXT:    movl %edi, %eax
+; X64-BASE-NEXT:    andl $858993459, %eax # imm = 0x33333333
+; X64-BASE-NEXT:    shrl $2, %edi
+; X64-BASE-NEXT:    andl $858993459, %edi # imm = 0x33333333
+; X64-BASE-NEXT:    addl %eax, %edi
+; X64-BASE-NEXT:    movl %edi, %eax
+; X64-BASE-NEXT:    shrl $4, %eax
+; X64-BASE-NEXT:    addl %edi, %eax
+; X64-BASE-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
+; X64-BASE-NEXT:    imull $16843009, %eax, %eax # imm = 0x1010101
+; X64-BASE-NEXT:    shrl $24, %eax
+; X64-BASE-NEXT:    retq
 ;
 ; X86-POPCNT-LABEL: cnt32:
 ; X86-POPCNT:       # %bb.0:
@@ -263,28 +236,28 @@ define i64 @cnt64(i64 %x) nounwind readnone {
 ; X86-NOSSE-NEXT:    xorl %edx, %edx
 ; X86-NOSSE-NEXT:    retl
 ;
-; X64-LABEL: cnt64:
-; X64:       # %bb.0:
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    shrq %rax
-; X64-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
-; X64-NEXT:    andq %rax, %rcx
-; X64-NEXT:    subq %rcx, %rdi
-; X64-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
-; X64-NEXT:    movq %rdi, %rcx
-; X64-NEXT:    andq %rax, %rcx
-; X64-NEXT:    shrq $2, %rdi
-; X64-NEXT:    andq %rdi, %rax
-; X64-NEXT:    addq %rcx, %rax
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    shrq $4, %rcx
-; X64-NEXT:    addq %rax, %rcx
-; X64-NEXT:    movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F
-; X64-NEXT:    andq %rcx, %rdx
-; X64-NEXT:    movabsq $72340172838076673, %rax # imm = 0x101010101010101
-; X64-NEXT:    imulq %rdx, %rax
-; X64-NEXT:    shrq $56, %rax
-; X64-NEXT:    retq
+; X64-BASE-LABEL: cnt64:
+; X64-BASE:       # %bb.0:
+; X64-BASE-NEXT:    movq %rdi, %rax
+; X64-BASE-NEXT:    shrq %rax
+; X64-BASE-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; X64-BASE-NEXT:    andq %rax, %rcx
+; X64-BASE-NEXT:    subq %rcx, %rdi
+; X64-BASE-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; X64-BASE-NEXT:    movq %rdi, %rcx
+; X64-BASE-NEXT:    andq %rax, %rcx
+; X64-BASE-NEXT:    shrq $2, %rdi
+; X64-BASE-NEXT:    andq %rdi, %rax
+; X64-BASE-NEXT:    addq %rcx, %rax
+; X64-BASE-NEXT:    movq %rax, %rcx
+; X64-BASE-NEXT:    shrq $4, %rcx
+; X64-BASE-NEXT:    addq %rax, %rcx
+; X64-BASE-NEXT:    movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F
+; X64-BASE-NEXT:    andq %rcx, %rdx
+; X64-BASE-NEXT:    movabsq $72340172838076673, %rax # imm = 0x101010101010101
+; X64-BASE-NEXT:    imulq %rdx, %rax
+; X64-BASE-NEXT:    shrq $56, %rax
+; X64-BASE-NEXT:    retq
 ;
 ; X86-POPCNT-LABEL: cnt64:
 ; X86-POPCNT:       # %bb.0:
@@ -447,45 +420,45 @@ define i128 @cnt128(i128 %x) nounwind readnone {
 ; X86-NOSSE-NEXT:    popl %ebx
 ; X86-NOSSE-NEXT:    retl $4
 ;
-; X64-LABEL: cnt128:
-; X64:       # %bb.0:
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    shrq %rax
-; X64-NEXT:    movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555
-; X64-NEXT:    andq %r8, %rax
-; X64-NEXT:    subq %rax, %rsi
-; X64-NEXT:    movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    andq %rcx, %rax
-; X64-NEXT:    shrq $2, %rsi
-; X64-NEXT:    andq %rcx, %rsi
-; X64-NEXT:    addq %rsi, %rax
-; X64-NEXT:    movq %rax, %rdx
-; X64-NEXT:    shrq $4, %rdx
-; X64-NEXT:    addq %rax, %rdx
-; X64-NEXT:    movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F
-; X64-NEXT:    andq %rsi, %rdx
-; X64-NEXT:    movabsq $72340172838076673, %r9 # imm = 0x101010101010101
-; X64-NEXT:    imulq %r9, %rdx
-; X64-NEXT:    shrq $56, %rdx
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    shrq %rax
-; X64-NEXT:    andq %r8, %rax
-; X64-NEXT:    subq %rax, %rdi
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    andq %rcx, %rax
-; X64-NEXT:    shrq $2, %rdi
-; X64-NEXT:    andq %rdi, %rcx
-; X64-NEXT:    addq %rax, %rcx
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    shrq $4, %rax
-; X64-NEXT:    addq %rcx, %rax
-; X64-NEXT:    andq %rsi, %rax
-; X64-NEXT:    imulq %r9, %rax
-; X64-NEXT:    shrq $56, %rax
-; X64-NEXT:    addq %rdx, %rax
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    retq
+; X64-BASE-LABEL: cnt128:
+; X64-BASE:       # %bb.0:
+; X64-BASE-NEXT:    movq %rsi, %rax
+; X64-BASE-NEXT:    shrq %rax
+; X64-BASE-NEXT:    movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555
+; X64-BASE-NEXT:    andq %r8, %rax
+; X64-BASE-NEXT:    subq %rax, %rsi
+; X64-BASE-NEXT:    movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333
+; X64-BASE-NEXT:    movq %rsi, %rax
+; X64-BASE-NEXT:    andq %rcx, %rax
+; X64-BASE-NEXT:    shrq $2, %rsi
+; X64-BASE-NEXT:    andq %rcx, %rsi
+; X64-BASE-NEXT:    addq %rsi, %rax
+; X64-BASE-NEXT:    movq %rax, %rdx
+; X64-BASE-NEXT:    shrq $4, %rdx
+; X64-BASE-NEXT:    addq %rax, %rdx
+; X64-BASE-NEXT:    movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F
+; X64-BASE-NEXT:    andq %rsi, %rdx
+; X64-BASE-NEXT:    movabsq $72340172838076673, %r9 # imm = 0x101010101010101
+; X64-BASE-NEXT:    imulq %r9, %rdx
+; X64-BASE-NEXT:    shrq $56, %rdx
+; X64-BASE-NEXT:    movq %rdi, %rax
+; X64-BASE-NEXT:    shrq %rax
+; X64-BASE-NEXT:    andq %r8, %rax
+; X64-BASE-NEXT:    subq %rax, %rdi
+; X64-BASE-NEXT:    movq %rdi, %rax
+; X64-BASE-NEXT:    andq %rcx, %rax
+; X64-BASE-NEXT:    shrq $2, %rdi
+; X64-BASE-NEXT:    andq %rdi, %rcx
+; X64-BASE-NEXT:    addq %rax, %rcx
+; X64-BASE-NEXT:    movq %rcx, %rax
+; X64-BASE-NEXT:    shrq $4, %rax
+; X64-BASE-NEXT:    addq %rcx, %rax
+; X64-BASE-NEXT:    andq %rsi, %rax
+; X64-BASE-NEXT:    imulq %r9, %rax
+; X64-BASE-NEXT:    shrq $56, %rax
+; X64-BASE-NEXT:    addq %rdx, %rax
+; X64-BASE-NEXT:    xorl %edx, %edx
+; X64-BASE-NEXT:    retq
 ;
 ; X86-POPCNT-LABEL: cnt128:
 ; X86-POPCNT:       # %bb.0:
@@ -671,28 +644,28 @@ define i64 @cnt64_noimplicitfloat(i64 %x) nounwind readnone noimplicitfloat  {
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: cnt64_noimplicitfloat:
-; X64:       # %bb.0:
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    shrq %rax
-; X64-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
-; X64-NEXT:    andq %rax, %rcx
-; X64-NEXT:    subq %rcx, %rdi
-; X64-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
-; X64-NEXT:    movq %rdi, %rcx
-; X64-NEXT:    andq %rax, %rcx
-; X64-NEXT:    shrq $2, %rdi
-; X64-NEXT:    andq %rdi, %rax
-; X64-NEXT:    addq %rcx, %rax
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    shrq $4, %rcx
-; X64-NEXT:    addq %rax, %rcx
-; X64-NEXT:    movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F
-; X64-NEXT:    andq %rcx, %rdx
-; X64-NEXT:    movabsq $72340172838076673, %rax # imm = 0x101010101010101
-; X64-NEXT:    imulq %rdx, %rax
-; X64-NEXT:    shrq $56, %rax
-; X64-NEXT:    retq
+; X64-BASE-LABEL: cnt64_noimplicitfloat:
+; X64-BASE:       # %bb.0:
+; X64-BASE-NEXT:    movq %rdi, %rax
+; X64-BASE-NEXT:    shrq %rax
+; X64-BASE-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; X64-BASE-NEXT:    andq %rax, %rcx
+; X64-BASE-NEXT:    subq %rcx, %rdi
+; X64-BASE-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; X64-BASE-NEXT:    movq %rdi, %rcx
+; X64-BASE-NEXT:    andq %rax, %rcx
+; X64-BASE-NEXT:    shrq $2, %rdi
+; X64-BASE-NEXT:    andq %rdi, %rax
+; X64-BASE-NEXT:    addq %rcx, %rax
+; X64-BASE-NEXT:    movq %rax, %rcx
+; X64-BASE-NEXT:    shrq $4, %rcx
+; X64-BASE-NEXT:    addq %rax, %rcx
+; X64-BASE-NEXT:    movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F
+; X64-BASE-NEXT:    andq %rcx, %rdx
+; X64-BASE-NEXT:    movabsq $72340172838076673, %rax # imm = 0x101010101010101
+; X64-BASE-NEXT:    imulq %rdx, %rax
+; X64-BASE-NEXT:    shrq $56, %rax
+; X64-BASE-NEXT:    retq
 ;
 ; X86-POPCNT-LABEL: cnt64_noimplicitfloat:
 ; X86-POPCNT:       # %bb.0:
@@ -752,25 +725,25 @@ define i32 @cnt32_optsize(i32 %x) nounwind readnone optsize {
 ; X86-NEXT:    shrl $24, %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: cnt32_optsize:
-; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shrl %eax
-; X64-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X64-NEXT:    subl %eax, %edi
-; X64-NEXT:    movl $858993459, %eax # imm = 0x33333333
-; X64-NEXT:    movl %edi, %ecx
-; X64-NEXT:    andl %eax, %ecx
-; X64-NEXT:    shrl $2, %edi
-; X64-NEXT:    andl %eax, %edi
-; X64-NEXT:    addl %ecx, %edi
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shrl $4, %eax
-; X64-NEXT:    addl %edi, %eax
-; X64-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X64-NEXT:    imull $16843009, %eax, %eax # imm = 0x1010101
-; X64-NEXT:    shrl $24, %eax
-; X64-NEXT:    retq
+; X64-BASE-LABEL: cnt32_optsize:
+; X64-BASE:       # %bb.0:
+; X64-BASE-NEXT:    movl %edi, %eax
+; X64-BASE-NEXT:    shrl %eax
+; X64-BASE-NEXT:    andl $1431655765, %eax # imm = 0x55555555
+; X64-BASE-NEXT:    subl %eax, %edi
+; X64-BASE-NEXT:    movl $858993459, %eax # imm = 0x33333333
+; X64-BASE-NEXT:    movl %edi, %ecx
+; X64-BASE-NEXT:    andl %eax, %ecx
+; X64-BASE-NEXT:    shrl $2, %edi
+; X64-BASE-NEXT:    andl %eax, %edi
+; X64-BASE-NEXT:    addl %ecx, %edi
+; X64-BASE-NEXT:    movl %edi, %eax
+; X64-BASE-NEXT:    shrl $4, %eax
+; X64-BASE-NEXT:    addl %edi, %eax
+; X64-BASE-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
+; X64-BASE-NEXT:    imull $16843009, %eax, %eax # imm = 0x1010101
+; X64-BASE-NEXT:    shrl $24, %eax
+; X64-BASE-NEXT:    retq
 ;
 ; X86-POPCNT-LABEL: cnt32_optsize:
 ; X86-POPCNT:       # %bb.0:
@@ -850,28 +823,28 @@ define i64 @cnt64_optsize(i64 %x) nounwind readnone optsize {
 ; X86-NOSSE-NEXT:    popl %ebx
 ; X86-NOSSE-NEXT:    retl
 ;
-; X64-LABEL: cnt64_optsize:
-; X64:       # %bb.0:
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    shrq %rax
-; X64-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
-; X64-NEXT:    andq %rax, %rcx
-; X64-NEXT:    subq %rcx, %rdi
-; X64-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
-; X64-NEXT:    movq %rdi, %rcx
-; X64-NEXT:    andq %rax, %rcx
-; X64-NEXT:    shrq $2, %rdi
-; X64-NEXT:    andq %rdi, %rax
-; X64-NEXT:    addq %rcx, %rax
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    shrq $4, %rcx
-; X64-NEXT:    addq %rax, %rcx
-; X64-NEXT:    movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F
-; X64-NEXT:    andq %rcx, %rdx
-; X64-NEXT:    movabsq $72340172838076673, %rax # imm = 0x101010101010101
-; X64-NEXT:    imulq %rdx, %rax
-; X64-NEXT:    shrq $56, %rax
-; X64-NEXT:    retq
+; X64-BASE-LABEL: cnt64_optsize:
+; X64-BASE:       # %bb.0:
+; X64-BASE-NEXT:    movq %rdi, %rax
+; X64-BASE-NEXT:    shrq %rax
+; X64-BASE-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; X64-BASE-NEXT:    andq %rax, %rcx
+; X64-BASE-NEXT:    subq %rcx, %rdi
+; X64-BASE-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; X64-BASE-NEXT:    movq %rdi, %rcx
+; X64-BASE-NEXT:    andq %rax, %rcx
+; X64-BASE-NEXT:    shrq $2, %rdi
+; X64-BASE-NEXT:    andq %rdi, %rax
+; X64-BASE-NEXT:    addq %rcx, %rax
+; X64-BASE-NEXT:    movq %rax, %rcx
+; X64-BASE-NEXT:    shrq $4, %rcx
+; X64-BASE-NEXT:    addq %rax, %rcx
+; X64-BASE-NEXT:    movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F
+; X64-BASE-NEXT:    andq %rcx, %rdx
+; X64-BASE-NEXT:    movabsq $72340172838076673, %rax # imm = 0x101010101010101
+; X64-BASE-NEXT:    imulq %rdx, %rax
+; X64-BASE-NEXT:    shrq $56, %rax
+; X64-BASE-NEXT:    retq
 ;
 ; X86-POPCNT-LABEL: cnt64_optsize:
 ; X86-POPCNT:       # %bb.0:
@@ -1042,45 +1015,45 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize {
 ; X86-NOSSE-NEXT:    popl %ebp
 ; X86-NOSSE-NEXT:    retl $4
 ;
-; X64-LABEL: cnt128_optsize:
-; X64:       # %bb.0:
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    shrq %rax
-; X64-NEXT:    movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555
-; X64-NEXT:    andq %r8, %rax
-; X64-NEXT:    subq %rax, %rsi
-; X64-NEXT:    movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    andq %rcx, %rax
-; X64-NEXT:    shrq $2, %rsi
-; X64-NEXT:    andq %rcx, %rsi
-; X64-NEXT:    addq %rsi, %rax
-; X64-NEXT:    movq %rax, %rdx
-; X64-NEXT:    shrq $4, %rdx
-; X64-NEXT:    addq %rax, %rdx
-; X64-NEXT:    movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F
-; X64-NEXT:    andq %rsi, %rdx
-; X64-NEXT:    movabsq $72340172838076673, %r9 # imm = 0x101010101010101
-; X64-NEXT:    imulq %r9, %rdx
-; X64-NEXT:    shrq $56, %rdx
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    shrq %rax
-; X64-NEXT:    andq %r8, %rax
-; X64-NEXT:    subq %rax, %rdi
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    andq %rcx, %rax
-; X64-NEXT:    shrq $2, %rdi
-; X64-NEXT:    andq %rdi, %rcx
-; X64-NEXT:    addq %rax, %rcx
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    shrq $4, %rax
-; X64-NEXT:    addq %rcx, %rax
-; X64-NEXT:    andq %rsi, %rax
-; X64-NEXT:    imulq %r9, %rax
-; X64-NEXT:    shrq $56, %rax
-; X64-NEXT:    addq %rdx, %rax
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    retq
+; X64-BASE-LABEL: cnt128_optsize:
+; X64-BASE:       # %bb.0:
+; X64-BASE-NEXT:    movq %rsi, %rax
+; X64-BASE-NEXT:    shrq %rax
+; X64-BASE-NEXT:    movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555
+; X64-BASE-NEXT:    andq %r8, %rax
+; X64-BASE-NEXT:    subq %rax, %rsi
+; X64-BASE-NEXT:    movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333
+; X64-BASE-NEXT:    movq %rsi, %rax
+; X64-BASE-NEXT:    andq %rcx, %rax
+; X64-BASE-NEXT:    shrq $2, %rsi
+; X64-BASE-NEXT:    andq %rcx, %rsi
+; X64-BASE-NEXT:    addq %rsi, %rax
+; X64-BASE-NEXT:    movq %rax, %rdx
+; X64-BASE-NEXT:    shrq $4, %rdx
+; X64-BASE-NEXT:    addq %rax, %rdx
+; X64-BASE-NEXT:    movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F
+; X64-BASE-NEXT:    andq %rsi, %rdx
+; X64-BASE-NEXT:    movabsq $72340172838076673, %r9 # imm = 0x101010101010101
+; X64-BASE-NEXT:    imulq %r9, %rdx
+; X64-BASE-NEXT:    shrq $56, %rdx
+; X64-BASE-NEXT:    movq %rdi, %rax
+; X64-BASE-NEXT:    shrq %rax
+; X64-BASE-NEXT:    andq %r8, %rax
+; X64-BASE-NEXT:    subq %rax, %rdi
+; X64-BASE-NEXT:    movq %rdi, %rax
+; X64-BASE-NEXT:    andq %rcx, %rax
+; X64-BASE-NEXT:    shrq $2, %rdi
+; X64-BASE-NEXT:    andq %rdi, %rcx
+; X64-BASE-NEXT:    addq %rax, %rcx
+; X64-BASE-NEXT:    movq %rcx, %rax
+; X64-BASE-NEXT:    shrq $4, %rax
+; X64-BASE-NEXT:    addq %rcx, %rax
+; X64-BASE-NEXT:    andq %rsi, %rax
+; X64-BASE-NEXT:    imulq %r9, %rax
+; X64-BASE-NEXT:    shrq $56, %rax
+; X64-BASE-NEXT:    addq %rdx, %rax
+; X64-BASE-NEXT:    xorl %edx, %edx
+; X64-BASE-NEXT:    retq
 ;
 ; X86-POPCNT-LABEL: cnt128_optsize:
 ; X86-POPCNT:       # %bb.0:
@@ -1251,24 +1224,24 @@ define i32 @cnt32_pgso(i32 %x) nounwind readnone !prof !14 {
 ; X86-NEXT:    shrl $24, %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: cnt32_pgso:
-; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shrl %eax
-; X64-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X64-NEXT:    subl %eax, %edi
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X64-NEXT:    shrl $2, %edi
-; X64-NEXT:    andl $858993459, %edi # imm = 0x33333333
-; X64-NEXT:    addl %eax, %edi
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shrl $4, %eax
-; X64-NEXT:    addl %edi, %eax
-; X64-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X64-NEXT:    imull $16843009, %eax, %eax # imm = 0x1010101
-; X64-NEXT:    shrl $24, %eax
-; X64-NEXT:    retq
+; X64-BASE-LABEL: cnt32_pgso:
+; X64-BASE:       # %bb.0:
+; X64-BASE-NEXT:    movl %edi, %eax
+; X64-BASE-NEXT:    shrl %eax
+; X64-BASE-NEXT:    andl $1431655765, %eax # imm = 0x55555555
+; X64-BASE-NEXT:    subl %eax, %edi
+; X64-BASE-NEXT:    movl %edi, %eax
+; X64-BASE-NEXT:    andl $858993459, %eax # imm = 0x33333333
+; X64-BASE-NEXT:    shrl $2, %edi
+; X64-BASE-NEXT:    andl $858993459, %edi # imm = 0x33333333
+; X64-BASE-NEXT:    addl %eax, %edi
+; X64-BASE-NEXT:    movl %edi, %eax
+; X64-BASE-NEXT:    shrl $4, %eax
+; X64-BASE-NEXT:    addl %edi, %eax
+; X64-BASE-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
+; X64-BASE-NEXT:    imull $16843009, %eax, %eax # imm = 0x1010101
+; X64-BASE-NEXT:    shrl $24, %eax
+; X64-BASE-NEXT:    retq
 ;
 ; X86-POPCNT-LABEL: cnt32_pgso:
 ; X86-POPCNT:       # %bb.0:
@@ -1338,28 +1311,28 @@ define i64 @cnt64_pgso(i64 %x) nounwind readnone !prof !14 {
 ; X86-NOSSE-NEXT:    xorl %edx, %edx
 ; X86-NOSSE-NEXT:    retl
 ;
-; X64-LABEL: cnt64_pgso:
-; X64:       # %bb.0:
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    shrq %rax
-; X64-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
-; X64-NEXT:    andq %rax, %rcx
-; X64-NEXT:    subq %rcx, %rdi
-; X64-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
-; X64-NEXT:    movq %rdi, %rcx
-; X64-NEXT:    andq %rax, %rcx
-; X64-NEXT:    shrq $2, %rdi
-; X64-NEXT:    andq %rdi, %rax
-; X64-NEXT:    addq %rcx, %rax
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    shrq $4, %rcx
-; X64-NEXT:    addq %rax, %rcx
-; X64-NEXT:    movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F
-; X64-NEXT:    andq %rcx, %rdx
-; X64-NEXT:    movabsq $72340172838076673, %rax # imm = 0x101010101010101
-; X64-NEXT:    imulq %rdx, %rax
-; X64-NEXT:    shrq $56, %rax
-; X64-NEXT:    retq
+; X64-BASE-LABEL: cnt64_pgso:
+; X64-BASE:       # %bb.0:
+; X64-BASE-NEXT:    movq %rdi, %rax
+; X64-BASE-NEXT:    shrq %rax
+; X64-BASE-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; X64-BASE-NEXT:    andq %rax, %rcx
+; X64-BASE-NEXT:    subq %rcx, %rdi
+; X64-BASE-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; X64-BASE-NEXT:    movq %rdi, %rcx
+; X64-BASE-NEXT:    andq %rax, %rcx
+; X64-BASE-NEXT:    shrq $2, %rdi
+; X64-BASE-NEXT:    andq %rdi, %rax
+; X64-BASE-NEXT:    addq %rcx, %rax
+; X64-BASE-NEXT:    movq %rax, %rcx
+; X64-BASE-NEXT:    shrq $4, %rcx
+; X64-BASE-NEXT:    addq %rax, %rcx
+; X64-BASE-NEXT:    movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F
+; X64-BASE-NEXT:    andq %rcx, %rdx
+; X64-BASE-NEXT:    movabsq $72340172838076673, %rax # imm = 0x101010101010101
+; X64-BASE-NEXT:    imulq %rdx, %rax
+; X64-BASE-NEXT:    shrq $56, %rax
+; X64-BASE-NEXT:    retq
 ;
 ; X86-POPCNT-LABEL: cnt64_pgso:
 ; X86-POPCNT:       # %bb.0:
@@ -1523,45 +1496,45 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 {
 ; X86-NOSSE-NEXT:    popl %ebx
 ; X86-NOSSE-NEXT:    retl $4
 ;
-; X64-LABEL: cnt128_pgso:
-; X64:       # %bb.0:
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    shrq %rax
-; X64-NEXT:    movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555
-; X64-NEXT:    andq %r8, %rax
-; X64-NEXT:    subq %rax, %rsi
-; X64-NEXT:    movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    andq %rcx, %rax
-; X64-NEXT:    shrq $2, %rsi
-; X64-NEXT:    andq %rcx, %rsi
-; X64-NEXT:    addq %rsi, %rax
-; X64-NEXT:    movq %rax, %rdx
-; X64-NEXT:    shrq $4, %rdx
-; X64-NEXT:    addq %rax, %rdx
-; X64-NEXT:    movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F
-; X64-NEXT:    andq %rsi, %rdx
-; X64-NEXT:    movabsq $72340172838076673, %r9 # imm = 0x101010101010101
-; X64-NEXT:    imulq %r9, %rdx
-; X64-NEXT:    shrq $56, %rdx
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    shrq %rax
-; X64-NEXT:    andq %r8, %rax
-; X64-NEXT:    subq %rax, %rdi
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    andq %rcx, %rax
-; X64-NEXT:    shrq $2, %rdi
-; X64-NEXT:    andq %rdi, %rcx
-; X64-NEXT:    addq %rax, %rcx
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    shrq $4, %rax
-; X64-NEXT:    addq %rcx, %rax
-; X64-NEXT:    andq %rsi, %rax
-; X64-NEXT:    imulq %r9, %rax
-; X64-NEXT:    shrq $56, %rax
-; X64-NEXT:    addq %rdx, %rax
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    retq
+; X64-BASE-LABEL: cnt128_pgso:
+; X64-BASE:       # %bb.0:
+; X64-BASE-NEXT:    movq %rsi, %rax
+; X64-BASE-NEXT:    shrq %rax
+; X64-BASE-NEXT:    movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555
+; X64-BASE-NEXT:    andq %r8, %rax
+; X64-BASE-NEXT:    subq %rax, %rsi
+; X64-BASE-NEXT:    movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333
+; X64-BASE-NEXT:    movq %rsi, %rax
+; X64-BASE-NEXT:    andq %rcx, %rax
+; X64-BASE-NEXT:    shrq $2, %rsi
+; X64-BASE-NEXT:    andq %rcx, %rsi
+; X64-BASE-NEXT:    addq %rsi, %rax
+; X64-BASE-NEXT:    movq %rax, %rdx
+; X64-BASE-NEXT:    shrq $4, %rdx
+; X64-BASE-NEXT:    addq %rax, %rdx
+; X64-BASE-NEXT:    movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F
+; X64-BASE-NEXT:    andq %rsi, %rdx
+; X64-BASE-NEXT:    movabsq $72340172838076673, %r9 # imm = 0x101010101010101
+; X64-BASE-NEXT:    imulq %r9, %rdx
+; X64-BASE-NEXT:    shrq $56, %rdx
+; X64-BASE-NEXT:    movq %rdi, %rax
+; X64-BASE-NEXT:    shrq %rax
+; X64-BASE-NEXT:    andq %r8, %rax
+; X64-BASE-NEXT:    subq %rax, %rdi
+; X64-BASE-NEXT:    movq %rdi, %rax
+; X64-BASE-NEXT:    andq %rcx, %rax
+; X64-BASE-NEXT:    shrq $2, %rdi
+; X64-BASE-NEXT:    andq %rdi, %rcx
+; X64-BASE-NEXT:    addq %rax, %rcx
+; X64-BASE-NEXT:    movq %rcx, %rax
+; X64-BASE-NEXT:    shrq $4, %rax
+; X64-BASE-NEXT:    addq %rcx, %rax
+; X64-BASE-NEXT:    andq %rsi, %rax
+; X64-BASE-NEXT:    imulq %r9, %rax
+; X64-BASE-NEXT:    shrq $56, %rax
+; X64-BASE-NEXT:    addq %rdx, %rax
+; X64-BASE-NEXT:    xorl %edx, %edx
+; X64-BASE-NEXT:    retq
 ;
 ; X86-POPCNT-LABEL: cnt128_pgso:
 ; X86-POPCNT:       # %bb.0:
@@ -1732,24 +1705,24 @@ define i32 @popcount_zext_i32(i16 zeroext %x) {
 ; X86-NEXT:    shrl $24, %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: popcount_zext_i32:
-; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shrl %eax
-; X64-NEXT:    andl $21845, %eax # imm = 0x5555
-; X64-NEXT:    subl %eax, %edi
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X64-NEXT:    shrl $2, %edi
-; X64-NEXT:    andl $858993459, %edi # imm = 0x33333333
-; X64-NEXT:    addl %eax, %edi
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shrl $4, %eax
-; X64-NEXT:    addl %edi, %eax
-; X64-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X64-NEXT:    imull $16843009, %eax, %eax # imm = 0x1010101
-; X64-NEXT:    shrl $24, %eax
-; X64-NEXT:    retq
+; X64-BASE-LABEL: popcount_zext_i32:
+; X64-BASE:       # %bb.0:
+; X64-BASE-NEXT:    movl %edi, %eax
+; X64-BASE-NEXT:    shrl %eax
+; X64-BASE-NEXT:    andl $21845, %eax # imm = 0x5555
+; X64-BASE-NEXT:    subl %eax, %edi
+; X64-BASE-NEXT:    movl %edi, %eax
+; X64-BASE-NEXT:    andl $858993459, %eax # imm = 0x33333333
+; X64-BASE-NEXT:    shrl $2, %edi
+; X64-BASE-NEXT:    andl $858993459, %edi # imm = 0x33333333
+; X64-BASE-NEXT:    addl %eax, %edi
+; X64-BASE-NEXT:    movl %edi, %eax
+; X64-BASE-NEXT:    shrl $4, %eax
+; X64-BASE-NEXT:    addl %edi, %eax
+; X64-BASE-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
+; X64-BASE-NEXT:    imull $16843009, %eax, %eax # imm = 0x1010101
+; X64-BASE-NEXT:    shrl $24, %eax
+; X64-BASE-NEXT:    retq
 ;
 ; X86-POPCNT-LABEL: popcount_zext_i32:
 ; X86-POPCNT:       # %bb.0:
@@ -1805,26 +1778,26 @@ define i32 @popcount_i16_zext(i16 zeroext %x) {
 ; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: popcount_i16_zext:
-; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shrl %eax
-; X64-NEXT:    andl $21845, %eax # imm = 0x5555
-; X64-NEXT:    subl %eax, %edi
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    andl $13107, %eax # imm = 0x3333
-; X64-NEXT:    shrl $2, %edi
-; X64-NEXT:    andl $13107, %edi # imm = 0x3333
-; X64-NEXT:    addl %eax, %edi
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shrl $4, %eax
-; X64-NEXT:    addl %edi, %eax
-; X64-NEXT:    andl $3855, %eax # imm = 0xF0F
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    shrl $8, %ecx
-; X64-NEXT:    addl %eax, %ecx
-; X64-NEXT:    movzbl %cl, %eax
-; X64-NEXT:    retq
+; X64-BASE-LABEL: popcount_i16_zext:
+; X64-BASE:       # %bb.0:
+; X64-BASE-NEXT:    movl %edi, %eax
+; X64-BASE-NEXT:    shrl %eax
+; X64-BASE-NEXT:    andl $21845, %eax # imm = 0x5555
+; X64-BASE-NEXT:    subl %eax, %edi
+; X64-BASE-NEXT:    movl %edi, %eax
+; X64-BASE-NEXT:    andl $13107, %eax # imm = 0x3333
+; X64-BASE-NEXT:    shrl $2, %edi
+; X64-BASE-NEXT:    andl $13107, %edi # imm = 0x3333
+; X64-BASE-NEXT:    addl %eax, %edi
+; X64-BASE-NEXT:    movl %edi, %eax
+; X64-BASE-NEXT:    shrl $4, %eax
+; X64-BASE-NEXT:    addl %edi, %eax
+; X64-BASE-NEXT:    andl $3855, %eax # imm = 0xF0F
+; X64-BASE-NEXT:    movl %eax, %ecx
+; X64-BASE-NEXT:    shrl $8, %ecx
+; X64-BASE-NEXT:    addl %eax, %ecx
+; X64-BASE-NEXT:    movzbl %cl, %eax
+; X64-BASE-NEXT:    retq
 ;
 ; X86-POPCNT-LABEL: popcount_i16_zext:
 ; X86-POPCNT:       # %bb.0:



More information about the llvm-commits mailing list