[llvm] 1c4880a - [TargetLowering] Expand the last stage of i16 popcnt using shift+add+and instead of mul+shift.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Mon May 16 09:38:24 PDT 2022
Author: Craig Topper
Date: 2022-05-16T09:27:44-07:00
New Revision: 1c4880a2d39fbd95edced0dd97c34a9f53bf62ff
URL: https://github.com/llvm/llvm-project/commit/1c4880a2d39fbd95edced0dd97c34a9f53bf62ff
DIFF: https://github.com/llvm/llvm-project/commit/1c4880a2d39fbd95edced0dd97c34a9f53bf62ff.diff
LOG: [TargetLowering] Expand the last stage of i16 popcnt using shift+add+and instead of mul+shift.
If we used a multiply, it would be by 0x0101, which is 1 more than a power
of 2. On some targets we would expand this to shl+add. By avoiding the
multiply earlier, we can generate better code.
Note: PowerPC doesn't do the shl+add expansion of multiply, so one of
its tests increased in instruction count.
This is limited to scalars because the new expansion almost always increased
the number of instructions in vector tests.
Reviewed By: RKSimon
Differential Revision: https://reviews.llvm.org/D125638
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
llvm/test/CodeGen/PowerPC/popcnt-zext.ll
llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
llvm/test/CodeGen/X86/parity-vec.ll
llvm/test/CodeGen/X86/popcnt.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 4eeec85bbd976..72d2216d69289 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -7716,6 +7716,18 @@ SDValue TargetLowering::expandCTPOP(SDNode *Node, SelectionDAG &DAG) const {
if (Len <= 8)
return Op;
+ // Avoid the multiply if we only have 2 bytes to add.
+ // TODO: Only doing this for scalars because vectors weren't as obviously
+ // improved.
+ if (Len == 16 && !VT.isVector()) {
+ // v = (v + (v >> 8)) & 0x00FF;
+ return DAG.getNode(ISD::AND, dl, VT,
+ DAG.getNode(ISD::ADD, dl, VT, Op,
+ DAG.getNode(ISD::SRL, dl, VT, Op,
+ DAG.getConstant(8, dl, ShVT))),
+ DAG.getConstant(0xFF, dl, VT));
+ }
+
// v = (v * 0x01010101...) >> (Len - 8)
SDValue Mask01 =
DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x01)), dl, VT);
diff --git a/llvm/test/CodeGen/PowerPC/popcnt-zext.ll b/llvm/test/CodeGen/PowerPC/popcnt-zext.ll
index 21f902f510e97..fccf671e4c197 100644
--- a/llvm/test/CodeGen/PowerPC/popcnt-zext.ll
+++ b/llvm/test/CodeGen/PowerPC/popcnt-zext.ll
@@ -23,9 +23,9 @@ define i16 @zpop_i8_i16(i8 %x) {
; SLOW-NEXT: add 3, 4, 3
; SLOW-NEXT: srwi 4, 3, 4
; SLOW-NEXT: add 3, 3, 4
-; SLOW-NEXT: andi. 3, 3, 3855
-; SLOW-NEXT: mulli 3, 3, 257
-; SLOW-NEXT: rlwinm 3, 3, 24, 24, 31
+; SLOW-NEXT: rlwinm 4, 3, 24, 28, 31
+; SLOW-NEXT: clrlwi 3, 3, 28
+; SLOW-NEXT: add 3, 3, 4
; SLOW-NEXT: blr
%z = zext i8 %x to i16
%pop = tail call i16 @llvm.ctpop.i16(i16 %z)
@@ -172,9 +172,10 @@ define i32 @popz_i16_32(i16 %x) {
; SLOW-NEXT: add 3, 4, 3
; SLOW-NEXT: srwi 4, 3, 4
; SLOW-NEXT: add 3, 3, 4
-; SLOW-NEXT: andi. 3, 3, 3855
-; SLOW-NEXT: mulli 3, 3, 257
-; SLOW-NEXT: rlwinm 3, 3, 24, 24, 31
+; SLOW-NEXT: rlwinm 4, 3, 24, 28, 31
+; SLOW-NEXT: clrlwi 3, 3, 28
+; SLOW-NEXT: add 3, 3, 4
+; SLOW-NEXT: clrldi 3, 3, 32
; SLOW-NEXT: blr
%pop = tail call i16 @llvm.ctpop.i16(i16 %x)
%z = zext i16 %pop to i32
@@ -276,9 +277,9 @@ define i64 @popa_i16_i64(i16 %x) {
; SLOW-NEXT: add 3, 4, 3
; SLOW-NEXT: srwi 4, 3, 4
; SLOW-NEXT: add 3, 3, 4
-; SLOW-NEXT: andi. 3, 3, 3855
-; SLOW-NEXT: mulli 3, 3, 257
-; SLOW-NEXT: srwi 3, 3, 8
+; SLOW-NEXT: rlwinm 4, 3, 24, 28, 31
+; SLOW-NEXT: clrlwi 3, 3, 28
+; SLOW-NEXT: add 3, 3, 4
; SLOW-NEXT: rlwinm 3, 3, 0, 27, 27
; SLOW-NEXT: blr
%pop = call i16 @llvm.ctpop.i16(i16 %x)
diff --git a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
index d58505d43d4bc..5b96398fb5b27 100644
--- a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
@@ -110,13 +110,10 @@ define i16 @test_cttz_i16(i16 %a) nounwind {
; RV32_NOZBB-NEXT: add a0, a2, a0
; RV32_NOZBB-NEXT: srli a1, a0, 4
; RV32_NOZBB-NEXT: add a0, a0, a1
-; RV32_NOZBB-NEXT: lui a1, 1
-; RV32_NOZBB-NEXT: addi a1, a1, -241
-; RV32_NOZBB-NEXT: and a0, a0, a1
-; RV32_NOZBB-NEXT: slli a1, a0, 8
+; RV32_NOZBB-NEXT: andi a1, a0, 15
+; RV32_NOZBB-NEXT: slli a0, a0, 20
+; RV32_NOZBB-NEXT: srli a0, a0, 28
; RV32_NOZBB-NEXT: add a0, a1, a0
-; RV32_NOZBB-NEXT: slli a0, a0, 19
-; RV32_NOZBB-NEXT: srli a0, a0, 27
; RV32_NOZBB-NEXT: ret
; RV32_NOZBB-NEXT: .LBB1_2:
; RV32_NOZBB-NEXT: li a0, 16
@@ -143,14 +140,11 @@ define i16 @test_cttz_i16(i16 %a) nounwind {
; RV64NOZBB-NEXT: and a0, a0, a1
; RV64NOZBB-NEXT: add a0, a2, a0
; RV64NOZBB-NEXT: srli a1, a0, 4
-; RV64NOZBB-NEXT: add a0, a0, a1
-; RV64NOZBB-NEXT: lui a1, 1
-; RV64NOZBB-NEXT: addiw a1, a1, -241
-; RV64NOZBB-NEXT: and a0, a0, a1
-; RV64NOZBB-NEXT: slliw a1, a0, 8
-; RV64NOZBB-NEXT: addw a0, a1, a0
-; RV64NOZBB-NEXT: slli a0, a0, 51
-; RV64NOZBB-NEXT: srli a0, a0, 59
+; RV64NOZBB-NEXT: addw a0, a0, a1
+; RV64NOZBB-NEXT: andi a1, a0, 15
+; RV64NOZBB-NEXT: slli a0, a0, 52
+; RV64NOZBB-NEXT: srli a0, a0, 60
+; RV64NOZBB-NEXT: add a0, a1, a0
; RV64NOZBB-NEXT: ret
; RV64NOZBB-NEXT: .LBB1_2:
; RV64NOZBB-NEXT: li a0, 16
@@ -606,13 +600,10 @@ define i16 @test_cttz_i16_zero_undef(i16 %a) nounwind {
; RV32_NOZBB-NEXT: add a0, a2, a0
; RV32_NOZBB-NEXT: srli a1, a0, 4
; RV32_NOZBB-NEXT: add a0, a0, a1
-; RV32_NOZBB-NEXT: lui a1, 1
-; RV32_NOZBB-NEXT: addi a1, a1, -241
-; RV32_NOZBB-NEXT: and a0, a0, a1
-; RV32_NOZBB-NEXT: slli a1, a0, 8
+; RV32_NOZBB-NEXT: andi a1, a0, 15
+; RV32_NOZBB-NEXT: slli a0, a0, 20
+; RV32_NOZBB-NEXT: srli a0, a0, 28
; RV32_NOZBB-NEXT: add a0, a1, a0
-; RV32_NOZBB-NEXT: slli a0, a0, 19
-; RV32_NOZBB-NEXT: srli a0, a0, 27
; RV32_NOZBB-NEXT: ret
;
; RV64NOZBB-LABEL: test_cttz_i16_zero_undef:
@@ -632,14 +623,11 @@ define i16 @test_cttz_i16_zero_undef(i16 %a) nounwind {
; RV64NOZBB-NEXT: and a0, a0, a1
; RV64NOZBB-NEXT: add a0, a2, a0
; RV64NOZBB-NEXT: srli a1, a0, 4
-; RV64NOZBB-NEXT: add a0, a0, a1
-; RV64NOZBB-NEXT: lui a1, 1
-; RV64NOZBB-NEXT: addiw a1, a1, -241
-; RV64NOZBB-NEXT: and a0, a0, a1
-; RV64NOZBB-NEXT: slliw a1, a0, 8
-; RV64NOZBB-NEXT: addw a0, a1, a0
-; RV64NOZBB-NEXT: slli a0, a0, 51
-; RV64NOZBB-NEXT: srli a0, a0, 59
+; RV64NOZBB-NEXT: addw a0, a0, a1
+; RV64NOZBB-NEXT: andi a1, a0, 15
+; RV64NOZBB-NEXT: slli a0, a0, 52
+; RV64NOZBB-NEXT: srli a0, a0, 60
+; RV64NOZBB-NEXT: add a0, a1, a0
; RV64NOZBB-NEXT: ret
;
; RV32ZBB-LABEL: test_cttz_i16_zero_undef:
@@ -1096,13 +1084,10 @@ define i16 @test_ctlz_i16(i16 %a) nounwind {
; RV32_NOZBB-NEXT: add a0, a2, a0
; RV32_NOZBB-NEXT: srli a1, a0, 4
; RV32_NOZBB-NEXT: add a0, a0, a1
-; RV32_NOZBB-NEXT: lui a1, 1
-; RV32_NOZBB-NEXT: addi a1, a1, -241
-; RV32_NOZBB-NEXT: and a0, a0, a1
-; RV32_NOZBB-NEXT: slli a1, a0, 8
+; RV32_NOZBB-NEXT: andi a1, a0, 15
+; RV32_NOZBB-NEXT: slli a0, a0, 20
+; RV32_NOZBB-NEXT: srli a0, a0, 28
; RV32_NOZBB-NEXT: add a0, a1, a0
-; RV32_NOZBB-NEXT: slli a0, a0, 19
-; RV32_NOZBB-NEXT: srli a0, a0, 27
; RV32_NOZBB-NEXT: ret
; RV32_NOZBB-NEXT: .LBB9_2:
; RV32_NOZBB-NEXT: li a0, 16
@@ -1138,14 +1123,11 @@ define i16 @test_ctlz_i16(i16 %a) nounwind {
; RV64NOZBB-NEXT: and a0, a0, a1
; RV64NOZBB-NEXT: add a0, a2, a0
; RV64NOZBB-NEXT: srli a1, a0, 4
-; RV64NOZBB-NEXT: add a0, a0, a1
-; RV64NOZBB-NEXT: lui a1, 1
-; RV64NOZBB-NEXT: addiw a1, a1, -241
-; RV64NOZBB-NEXT: and a0, a0, a1
-; RV64NOZBB-NEXT: slliw a1, a0, 8
-; RV64NOZBB-NEXT: addw a0, a1, a0
-; RV64NOZBB-NEXT: slli a0, a0, 51
-; RV64NOZBB-NEXT: srli a0, a0, 59
+; RV64NOZBB-NEXT: addw a0, a0, a1
+; RV64NOZBB-NEXT: andi a1, a0, 15
+; RV64NOZBB-NEXT: slli a0, a0, 52
+; RV64NOZBB-NEXT: srli a0, a0, 60
+; RV64NOZBB-NEXT: add a0, a1, a0
; RV64NOZBB-NEXT: ret
; RV64NOZBB-NEXT: .LBB9_2:
; RV64NOZBB-NEXT: li a0, 16
@@ -1713,13 +1695,10 @@ define i16 @test_ctlz_i16_zero_undef(i16 %a) nounwind {
; RV32_NOZBB-NEXT: add a0, a2, a0
; RV32_NOZBB-NEXT: srli a1, a0, 4
; RV32_NOZBB-NEXT: add a0, a0, a1
-; RV32_NOZBB-NEXT: lui a1, 1
-; RV32_NOZBB-NEXT: addi a1, a1, -241
-; RV32_NOZBB-NEXT: and a0, a0, a1
-; RV32_NOZBB-NEXT: slli a1, a0, 8
+; RV32_NOZBB-NEXT: andi a1, a0, 15
+; RV32_NOZBB-NEXT: slli a0, a0, 20
+; RV32_NOZBB-NEXT: srli a0, a0, 28
; RV32_NOZBB-NEXT: add a0, a1, a0
-; RV32_NOZBB-NEXT: slli a0, a0, 19
-; RV32_NOZBB-NEXT: srli a0, a0, 27
; RV32_NOZBB-NEXT: ret
;
; RV64NOZBB-LABEL: test_ctlz_i16_zero_undef:
@@ -1749,14 +1728,11 @@ define i16 @test_ctlz_i16_zero_undef(i16 %a) nounwind {
; RV64NOZBB-NEXT: and a0, a0, a1
; RV64NOZBB-NEXT: add a0, a2, a0
; RV64NOZBB-NEXT: srli a1, a0, 4
-; RV64NOZBB-NEXT: add a0, a0, a1
-; RV64NOZBB-NEXT: lui a1, 1
-; RV64NOZBB-NEXT: addiw a1, a1, -241
-; RV64NOZBB-NEXT: and a0, a0, a1
-; RV64NOZBB-NEXT: slliw a1, a0, 8
-; RV64NOZBB-NEXT: addw a0, a1, a0
-; RV64NOZBB-NEXT: slli a0, a0, 51
-; RV64NOZBB-NEXT: srli a0, a0, 59
+; RV64NOZBB-NEXT: addw a0, a0, a1
+; RV64NOZBB-NEXT: andi a1, a0, 15
+; RV64NOZBB-NEXT: slli a0, a0, 52
+; RV64NOZBB-NEXT: srli a0, a0, 60
+; RV64NOZBB-NEXT: add a0, a1, a0
; RV64NOZBB-NEXT: ret
;
; RV32ZBB-LABEL: test_ctlz_i16_zero_undef:
@@ -2251,13 +2227,10 @@ define i16 @test_ctpop_i16(i16 %a) nounwind {
; RV32_NOZBB-NEXT: add a0, a2, a0
; RV32_NOZBB-NEXT: srli a1, a0, 4
; RV32_NOZBB-NEXT: add a0, a0, a1
-; RV32_NOZBB-NEXT: lui a1, 1
-; RV32_NOZBB-NEXT: addi a1, a1, -241
-; RV32_NOZBB-NEXT: and a0, a0, a1
-; RV32_NOZBB-NEXT: slli a1, a0, 8
+; RV32_NOZBB-NEXT: andi a1, a0, 15
+; RV32_NOZBB-NEXT: slli a0, a0, 20
+; RV32_NOZBB-NEXT: srli a0, a0, 28
; RV32_NOZBB-NEXT: add a0, a1, a0
-; RV32_NOZBB-NEXT: slli a0, a0, 19
-; RV32_NOZBB-NEXT: srli a0, a0, 27
; RV32_NOZBB-NEXT: ret
;
; RV64NOZBB-LABEL: test_ctpop_i16:
@@ -2274,14 +2247,11 @@ define i16 @test_ctpop_i16(i16 %a) nounwind {
; RV64NOZBB-NEXT: and a0, a0, a1
; RV64NOZBB-NEXT: add a0, a2, a0
; RV64NOZBB-NEXT: srli a1, a0, 4
-; RV64NOZBB-NEXT: add a0, a0, a1
-; RV64NOZBB-NEXT: lui a1, 1
-; RV64NOZBB-NEXT: addiw a1, a1, -241
-; RV64NOZBB-NEXT: and a0, a0, a1
-; RV64NOZBB-NEXT: slliw a1, a0, 8
-; RV64NOZBB-NEXT: addw a0, a1, a0
-; RV64NOZBB-NEXT: slli a0, a0, 51
-; RV64NOZBB-NEXT: srli a0, a0, 59
+; RV64NOZBB-NEXT: addw a0, a0, a1
+; RV64NOZBB-NEXT: andi a1, a0, 15
+; RV64NOZBB-NEXT: slli a0, a0, 52
+; RV64NOZBB-NEXT: srli a0, a0, 60
+; RV64NOZBB-NEXT: add a0, a1, a0
; RV64NOZBB-NEXT: ret
;
; RV32ZBB-LABEL: test_ctpop_i16:
diff --git a/llvm/test/CodeGen/X86/parity-vec.ll b/llvm/test/CodeGen/X86/parity-vec.ll
index ed64bb5eddf49..f9a2411465141 100644
--- a/llvm/test/CodeGen/X86/parity-vec.ll
+++ b/llvm/test/CodeGen/X86/parity-vec.ll
@@ -64,9 +64,8 @@ define i1 @canonical_parity_noncanonical_pred(<16 x i1> %x) {
; NOPOPCNT-NEXT: addl %eax, %ecx
; NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F
; NOPOPCNT-NEXT: movl %ecx, %eax
-; NOPOPCNT-NEXT: shll $8, %eax
-; NOPOPCNT-NEXT: addl %ecx, %eax
; NOPOPCNT-NEXT: shrl $8, %eax
+; NOPOPCNT-NEXT: addl %ecx, %eax
; NOPOPCNT-NEXT: # kill: def $al killed $al killed $eax
; NOPOPCNT-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/popcnt.ll b/llvm/test/CodeGen/X86/popcnt.ll
index 010a615eef755..de1cb22fd402b 100644
--- a/llvm/test/CodeGen/X86/popcnt.ll
+++ b/llvm/test/CodeGen/X86/popcnt.ll
@@ -77,9 +77,9 @@ define i16 @cnt16(i16 %x) nounwind readnone {
; X86-NEXT: addl %eax, %ecx
; X86-NEXT: andl $3855, %ecx # imm = 0xF0F
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shll $8, %eax
+; X86-NEXT: shrl $8, %eax
; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: movzbl %ah, %eax
+; X86-NEXT: movzbl %al, %eax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
@@ -99,9 +99,9 @@ define i16 @cnt16(i16 %x) nounwind readnone {
; X64-NEXT: addl %edi, %eax
; X64-NEXT: andl $3855, %eax # imm = 0xF0F
; X64-NEXT: movl %eax, %ecx
-; X64-NEXT: shll $8, %ecx
+; X64-NEXT: shrl $8, %ecx
; X64-NEXT: addl %eax, %ecx
-; X64-NEXT: movzbl %ch, %eax
+; X64-NEXT: movzbl %cl, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
;
@@ -1540,9 +1540,9 @@ define i32 @popcount_i16_zext(i16 zeroext %x) {
; X86-NEXT: addl %eax, %ecx
; X86-NEXT: andl $3855, %ecx # imm = 0xF0F
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shll $8, %eax
+; X86-NEXT: shrl $8, %eax
; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: movzbl %ah, %eax
+; X86-NEXT: movzbl %al, %eax
; X86-NEXT: retl
;
; X64-LABEL: popcount_i16_zext:
@@ -1561,9 +1561,9 @@ define i32 @popcount_i16_zext(i16 zeroext %x) {
; X64-NEXT: addl %edi, %eax
; X64-NEXT: andl $3855, %eax # imm = 0xF0F
; X64-NEXT: movl %eax, %ecx
-; X64-NEXT: shll $8, %ecx
+; X64-NEXT: shrl $8, %ecx
; X64-NEXT: addl %eax, %ecx
-; X64-NEXT: movzbl %ch, %eax
+; X64-NEXT: movzbl %cl, %eax
; X64-NEXT: retq
;
; X86-POPCNT-LABEL: popcount_i16_zext:
More information about the llvm-commits
mailing list