[llvm] r356808 - [X86] Use xmm registers to implement 64-bit popcnt on 32-bit targets if possible if popcnt instruction is not available
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 22 13:47:02 PDT 2019
Author: ctopper
Date: Fri Mar 22 13:47:02 2019
New Revision: 356808
URL: http://llvm.org/viewvc/llvm-project?rev=356808&view=rev
Log:
[X86] Use xmm registers to implement 64-bit popcnt on 32-bit targets if possible if popcnt instruction is not available
On 32-bit targets without popcnt, we currently expand 64-bit popcnt to sequences of arithmetic and logic ops for each 32-bit half and then add the 32-bit halves together. If we have xmm registers, we can use those to implement the operation instead. This results in fewer instructions than doing two separate 32-bit popcnt sequences.
This mitigates some of PR41151 for the i64 on i686 case when we have SSE2.
Differential Revision: https://reviews.llvm.org/D59662
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/popcnt.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=356808&r1=356807&r2=356808&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Fri Mar 22 13:47:02 2019
@@ -414,6 +414,8 @@ X86TargetLowering::X86TargetLowering(con
setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
if (Subtarget.is64Bit())
setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
+ else
+ setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
}
setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
@@ -26715,6 +26717,26 @@ void X86TargetLowering::ReplaceNodeResul
switch (N->getOpcode()) {
default:
llvm_unreachable("Do not know how to custom type legalize this operation!");
+ case ISD::CTPOP: {
+ assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
+ // Use a v2i64 if possible.
+ bool NoImplicitFloatOps =
+ DAG.getMachineFunction().getFunction().hasFnAttribute(
+ Attribute::NoImplicitFloat);
+ if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
+ SDValue Wide =
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
+ Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
+ // Bit count should fit in 32-bits, extract it as that and then zero
+ // extend to i64. Otherwise we end up extracting bits 63:32 separately.
+ Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
+ Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
+ DAG.getIntPtrConstant(0, dl));
+ Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
+ Results.push_back(Wide);
+ }
+ return;
+ }
case ISD::MUL: {
EVT VT = N->getValueType(0);
assert(VT.isVector() && "Unexpected VT");
Modified: llvm/trunk/test/CodeGen/X86/popcnt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/popcnt.ll?rev=356808&r1=356807&r2=356808&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/popcnt.ll (original)
+++ llvm/trunk/test/CodeGen/X86/popcnt.ll Fri Mar 22 13:47:02 2019
@@ -1,8 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefixes=X32,X32-NOSSE
; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64
; RUN: llc < %s -mtriple=i686-unknown -mattr=+popcnt | FileCheck %s --check-prefix=X32-POPCNT
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+popcnt | FileCheck %s --check-prefix=X64-POPCNT
+; RUN: llc < %s -mtriple=i686-unknown -mattr=sse2 | FileCheck %s --check-prefixes=X32,X32-SSE2
+; RUN: llc < %s -mtriple=i686-unknown -mattr=ssse3 | FileCheck %s --check-prefixes=X32,X32-SSSE3
define i8 @cnt8(i8 %x) nounwind readnone {
; X32-LABEL: cnt8:
@@ -172,7 +174,127 @@ define i32 @cnt32(i32 %x) nounwind readn
}
define i64 @cnt64(i64 %x) nounwind readnone {
-; X32-LABEL: cnt64:
+; X32-NOSSE-LABEL: cnt64:
+; X32-NOSSE: # %bb.0:
+; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NOSSE-NEXT: movl %ecx, %edx
+; X32-NOSSE-NEXT: shrl %edx
+; X32-NOSSE-NEXT: andl $1431655765, %edx # imm = 0x55555555
+; X32-NOSSE-NEXT: subl %edx, %ecx
+; X32-NOSSE-NEXT: movl %ecx, %edx
+; X32-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333
+; X32-NOSSE-NEXT: shrl $2, %ecx
+; X32-NOSSE-NEXT: andl $858993459, %ecx # imm = 0x33333333
+; X32-NOSSE-NEXT: addl %edx, %ecx
+; X32-NOSSE-NEXT: movl %ecx, %edx
+; X32-NOSSE-NEXT: shrl $4, %edx
+; X32-NOSSE-NEXT: addl %ecx, %edx
+; X32-NOSSE-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F
+; X32-NOSSE-NEXT: imull $16843009, %edx, %ecx # imm = 0x1010101
+; X32-NOSSE-NEXT: shrl $24, %ecx
+; X32-NOSSE-NEXT: movl %eax, %edx
+; X32-NOSSE-NEXT: shrl %edx
+; X32-NOSSE-NEXT: andl $1431655765, %edx # imm = 0x55555555
+; X32-NOSSE-NEXT: subl %edx, %eax
+; X32-NOSSE-NEXT: movl %eax, %edx
+; X32-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333
+; X32-NOSSE-NEXT: shrl $2, %eax
+; X32-NOSSE-NEXT: andl $858993459, %eax # imm = 0x33333333
+; X32-NOSSE-NEXT: addl %edx, %eax
+; X32-NOSSE-NEXT: movl %eax, %edx
+; X32-NOSSE-NEXT: shrl $4, %edx
+; X32-NOSSE-NEXT: addl %eax, %edx
+; X32-NOSSE-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F
+; X32-NOSSE-NEXT: imull $16843009, %edx, %eax # imm = 0x1010101
+; X32-NOSSE-NEXT: shrl $24, %eax
+; X32-NOSSE-NEXT: addl %ecx, %eax
+; X32-NOSSE-NEXT: xorl %edx, %edx
+; X32-NOSSE-NEXT: retl
+;
+; X64-LABEL: cnt64:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: shrq %rax
+; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; X64-NEXT: andq %rax, %rcx
+; X64-NEXT: subq %rcx, %rdi
+; X64-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; X64-NEXT: movq %rdi, %rcx
+; X64-NEXT: andq %rax, %rcx
+; X64-NEXT: shrq $2, %rdi
+; X64-NEXT: andq %rax, %rdi
+; X64-NEXT: addq %rcx, %rdi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: shrq $4, %rax
+; X64-NEXT: leaq (%rax,%rdi), %rax
+; X64-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
+; X64-NEXT: andq %rax, %rcx
+; X64-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
+; X64-NEXT: imulq %rcx, %rax
+; X64-NEXT: shrq $56, %rax
+; X64-NEXT: retq
+;
+; X32-POPCNT-LABEL: cnt64:
+; X32-POPCNT: # %bb.0:
+; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx
+; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax
+; X32-POPCNT-NEXT: addl %ecx, %eax
+; X32-POPCNT-NEXT: xorl %edx, %edx
+; X32-POPCNT-NEXT: retl
+;
+; X64-POPCNT-LABEL: cnt64:
+; X64-POPCNT: # %bb.0:
+; X64-POPCNT-NEXT: popcntq %rdi, %rax
+; X64-POPCNT-NEXT: retq
+;
+; X32-SSE2-LABEL: cnt64:
+; X32-SSE2: # %bb.0:
+; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X32-SSE2-NEXT: psrlw $1, %xmm1
+; X32-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm1
+; X32-SSE2-NEXT: psubb %xmm1, %xmm0
+; X32-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; X32-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X32-SSE2-NEXT: pand %xmm1, %xmm2
+; X32-SSE2-NEXT: psrlw $2, %xmm0
+; X32-SSE2-NEXT: pand %xmm1, %xmm0
+; X32-SSE2-NEXT: paddb %xmm2, %xmm0
+; X32-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X32-SSE2-NEXT: psrlw $4, %xmm1
+; X32-SSE2-NEXT: paddb %xmm0, %xmm1
+; X32-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm1
+; X32-SSE2-NEXT: pxor %xmm0, %xmm0
+; X32-SSE2-NEXT: psadbw %xmm1, %xmm0
+; X32-SSE2-NEXT: movd %xmm0, %eax
+; X32-SSE2-NEXT: xorl %edx, %edx
+; X32-SSE2-NEXT: retl
+;
+; X32-SSSE3-LABEL: cnt64:
+; X32-SSSE3: # %bb.0:
+; X32-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X32-SSSE3-NEXT: movdqa %xmm1, %xmm2
+; X32-SSSE3-NEXT: pand %xmm0, %xmm2
+; X32-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-SSSE3-NEXT: movdqa %xmm3, %xmm4
+; X32-SSSE3-NEXT: pshufb %xmm2, %xmm4
+; X32-SSSE3-NEXT: psrlw $4, %xmm1
+; X32-SSSE3-NEXT: pand %xmm0, %xmm1
+; X32-SSSE3-NEXT: pshufb %xmm1, %xmm3
+; X32-SSSE3-NEXT: paddb %xmm4, %xmm3
+; X32-SSSE3-NEXT: pxor %xmm0, %xmm0
+; X32-SSSE3-NEXT: psadbw %xmm3, %xmm0
+; X32-SSSE3-NEXT: movd %xmm0, %eax
+; X32-SSSE3-NEXT: xorl %edx, %edx
+; X32-SSSE3-NEXT: retl
+ %cnt = tail call i64 @llvm.ctpop.i64(i64 %x)
+ ret i64 %cnt
+}
+
+define i64 @cnt64_noimplicitfloat(i64 %x) nounwind readnone noimplicitfloat {
+; X32-LABEL: cnt64_noimplicitfloat:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -210,7 +332,7 @@ define i64 @cnt64(i64 %x) nounwind readn
; X32-NEXT: xorl %edx, %edx
; X32-NEXT: retl
;
-; X64-LABEL: cnt64:
+; X64-LABEL: cnt64_noimplicitfloat:
; X64: # %bb.0:
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: shrq %rax
@@ -233,7 +355,7 @@ define i64 @cnt64(i64 %x) nounwind readn
; X64-NEXT: shrq $56, %rax
; X64-NEXT: retq
;
-; X32-POPCNT-LABEL: cnt64:
+; X32-POPCNT-LABEL: cnt64_noimplicitfloat:
; X32-POPCNT: # %bb.0:
; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx
; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax
@@ -241,7 +363,7 @@ define i64 @cnt64(i64 %x) nounwind readn
; X32-POPCNT-NEXT: xorl %edx, %edx
; X32-POPCNT-NEXT: retl
;
-; X64-POPCNT-LABEL: cnt64:
+; X64-POPCNT-LABEL: cnt64_noimplicitfloat:
; X64-POPCNT: # %bb.0:
; X64-POPCNT-NEXT: popcntq %rdi, %rax
; X64-POPCNT-NEXT: retq
More information about the llvm-commits
mailing list