[llvm] [DAG] Add generic i8 CTPOP lowering using i32 MUL (PR #79989)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 30 04:18:18 PST 2024
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/79989
From 6e95ebf3a789b735f1b49f2099d99c24d74af359 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Tue, 30 Jan 2024 11:50:44 +0000
Subject: [PATCH] [DAG] Add generic i8 CTPOP lowering using i32 MUL
Fixes #79823
---
.../CodeGen/SelectionDAG/TargetLowering.cpp | 15 +
.../test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll | 20 +-
llvm/test/CodeGen/PowerPC/popcnt-zext.ll | 44 +--
llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll | 314 ++++++++++++------
llvm/test/CodeGen/X86/ctpop-combine.ll | 21 +-
llvm/test/CodeGen/X86/popcnt.ll | 58 ++--
6 files changed, 288 insertions(+), 184 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index b8ed02e268b18..2eb68485c777a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8639,6 +8639,21 @@ SDValue TargetLowering::expandCTPOP(SDNode *Node, SelectionDAG &DAG) const {
if (VT.isVector() && !canExpandVectorCTPOP(*this, VT))
return SDValue();
+ if (VT == MVT::i8 && isOperationLegal(ISD::SRL, MVT::i32) &&
+ isOperationLegal(ISD::MUL, MVT::i32)) {
+ SDValue Mask11 = DAG.getConstant(0x11111111U, dl, MVT::i32);
+ Op = DAG.getZExtOrTrunc(Op, dl, MVT::i32);
+ Op = DAG.getNode(ISD::MUL, dl, MVT::i32, Op,
+ DAG.getConstant(0x08040201U, dl, MVT::i32));
+ Op = DAG.getNode(ISD::SRL, dl, MVT::i32, Op,
+ DAG.getShiftAmountConstant(3, MVT::i32, dl));
+ Op = DAG.getNode(ISD::AND, dl, MVT::i32, Op, Mask11);
+ Op = DAG.getNode(ISD::MUL, dl, MVT::i32, Op, Mask11);
+ Op = DAG.getNode(ISD::SRL, dl, MVT::i32, Op,
+ DAG.getShiftAmountConstant(28, MVT::i32, dl));
+ return DAG.getZExtOrTrunc(Op, dl, MVT::i8);
+ }
+
// This is the "best" algorithm from
// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
SDValue Mask55 =
diff --git a/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
index fa4fda9b8972b..b60aab1e059d2 100644
--- a/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
@@ -169,16 +169,16 @@ define i64 @test_not_ctlz_i64(i64 %a) nounwind {
define i8 @test_ctpop_i8(i8 %a) nounwind {
; LA32-LABEL: test_ctpop_i8:
; LA32: # %bb.0:
-; LA32-NEXT: srli.w $a1, $a0, 1
-; LA32-NEXT: andi $a1, $a1, 85
-; LA32-NEXT: sub.w $a0, $a0, $a1
-; LA32-NEXT: andi $a1, $a0, 51
-; LA32-NEXT: srli.w $a0, $a0, 2
-; LA32-NEXT: andi $a0, $a0, 51
-; LA32-NEXT: add.w $a0, $a1, $a0
-; LA32-NEXT: srli.w $a1, $a0, 4
-; LA32-NEXT: add.w $a0, $a0, $a1
-; LA32-NEXT: andi $a0, $a0, 15
+; LA32-NEXT: lu12i.w $a1, 32832
+; LA32-NEXT: ori $a1, $a1, 513
+; LA32-NEXT: andi $a0, $a0, 255
+; LA32-NEXT: mul.w $a0, $a0, $a1
+; LA32-NEXT: srli.w $a0, $a0, 3
+; LA32-NEXT: lu12i.w $a1, 69905
+; LA32-NEXT: ori $a1, $a1, 273
+; LA32-NEXT: and $a0, $a0, $a1
+; LA32-NEXT: mul.w $a0, $a0, $a1
+; LA32-NEXT: srli.w $a0, $a0, 28
; LA32-NEXT: ret
;
; LA64-LABEL: test_ctpop_i8:
diff --git a/llvm/test/CodeGen/PowerPC/popcnt-zext.ll b/llvm/test/CodeGen/PowerPC/popcnt-zext.ll
index fccf671e4c197..48438ad0139a1 100644
--- a/llvm/test/CodeGen/PowerPC/popcnt-zext.ll
+++ b/llvm/test/CodeGen/PowerPC/popcnt-zext.ll
@@ -41,16 +41,18 @@ define i16 @popz_i8_i16(i8 %x) {
;
; SLOW-LABEL: popz_i8_i16:
; SLOW: # %bb.0:
-; SLOW-NEXT: rotlwi 4, 3, 31
-; SLOW-NEXT: andi. 4, 4, 85
-; SLOW-NEXT: sub 3, 3, 4
-; SLOW-NEXT: rlwinm 4, 3, 30, 30, 31
-; SLOW-NEXT: rlwimi 4, 3, 30, 26, 27
-; SLOW-NEXT: andi. 3, 3, 51
-; SLOW-NEXT: add 3, 3, 4
-; SLOW-NEXT: srwi 4, 3, 4
-; SLOW-NEXT: add 3, 3, 4
-; SLOW-NEXT: clrlwi 3, 3, 28
+; SLOW-NEXT: lis 4, 2052
+; SLOW-NEXT: clrlwi 3, 3, 24
+; SLOW-NEXT: ori 4, 4, 513
+; SLOW-NEXT: mullw 3, 3, 4
+; SLOW-NEXT: rotlwi 3, 3, 29
+; SLOW-NEXT: andis. 4, 3, 4369
+; SLOW-NEXT: andi. 3, 3, 4369
+; SLOW-NEXT: or 3, 3, 4
+; SLOW-NEXT: lis 4, 4369
+; SLOW-NEXT: ori 4, 4, 4369
+; SLOW-NEXT: mullw 3, 3, 4
+; SLOW-NEXT: rlwinm 3, 3, 4, 28, 31
; SLOW-NEXT: blr
%pop = tail call i8 @llvm.ctpop.i8(i8 %x)
%z = zext i8 %pop to i16
@@ -102,16 +104,18 @@ define i32 @popz_i8_32(i8 %x) {
;
; SLOW-LABEL: popz_i8_32:
; SLOW: # %bb.0:
-; SLOW-NEXT: rotlwi 4, 3, 31
-; SLOW-NEXT: andi. 4, 4, 85
-; SLOW-NEXT: sub 3, 3, 4
-; SLOW-NEXT: rlwinm 4, 3, 30, 30, 31
-; SLOW-NEXT: rlwimi 4, 3, 30, 26, 27
-; SLOW-NEXT: andi. 3, 3, 51
-; SLOW-NEXT: add 3, 3, 4
-; SLOW-NEXT: srwi 4, 3, 4
-; SLOW-NEXT: add 3, 3, 4
-; SLOW-NEXT: clrlwi 3, 3, 28
+; SLOW-NEXT: lis 4, 2052
+; SLOW-NEXT: clrlwi 3, 3, 24
+; SLOW-NEXT: ori 4, 4, 513
+; SLOW-NEXT: mullw 3, 3, 4
+; SLOW-NEXT: rotlwi 3, 3, 29
+; SLOW-NEXT: andis. 4, 3, 4369
+; SLOW-NEXT: andi. 3, 3, 4369
+; SLOW-NEXT: or 3, 3, 4
+; SLOW-NEXT: lis 4, 4369
+; SLOW-NEXT: ori 4, 4, 4369
+; SLOW-NEXT: mullw 3, 3, 4
+; SLOW-NEXT: rlwinm 3, 3, 4, 28, 31
; SLOW-NEXT: blr
%pop = tail call i8 @llvm.ctpop.i8(i8 %x)
%z = zext i8 %pop to i32
diff --git a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
index 455e6e54c9b39..2d26dacbe0edc 100644
--- a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
@@ -30,28 +30,28 @@ declare i32 @llvm.ctpop.i32(i32)
declare i64 @llvm.ctpop.i64(i64)
define i8 @test_cttz_i8(i8 %a) nounwind {
-; RV32_NOZBB-LABEL: test_cttz_i8:
-; RV32_NOZBB: # %bb.0:
-; RV32_NOZBB-NEXT: andi a1, a0, 255
-; RV32_NOZBB-NEXT: beqz a1, .LBB0_2
-; RV32_NOZBB-NEXT: # %bb.1: # %cond.false
-; RV32_NOZBB-NEXT: addi a1, a0, -1
-; RV32_NOZBB-NEXT: not a0, a0
-; RV32_NOZBB-NEXT: and a0, a0, a1
-; RV32_NOZBB-NEXT: srli a1, a0, 1
-; RV32_NOZBB-NEXT: andi a1, a1, 85
-; RV32_NOZBB-NEXT: sub a0, a0, a1
-; RV32_NOZBB-NEXT: andi a1, a0, 51
-; RV32_NOZBB-NEXT: srli a0, a0, 2
-; RV32_NOZBB-NEXT: andi a0, a0, 51
-; RV32_NOZBB-NEXT: add a0, a1, a0
-; RV32_NOZBB-NEXT: srli a1, a0, 4
-; RV32_NOZBB-NEXT: add a0, a0, a1
-; RV32_NOZBB-NEXT: andi a0, a0, 15
-; RV32_NOZBB-NEXT: ret
-; RV32_NOZBB-NEXT: .LBB0_2:
-; RV32_NOZBB-NEXT: li a0, 8
-; RV32_NOZBB-NEXT: ret
+; RV32I-LABEL: test_cttz_i8:
+; RV32I: # %bb.0:
+; RV32I-NEXT: andi a1, a0, 255
+; RV32I-NEXT: beqz a1, .LBB0_2
+; RV32I-NEXT: # %bb.1: # %cond.false
+; RV32I-NEXT: addi a1, a0, -1
+; RV32I-NEXT: not a0, a0
+; RV32I-NEXT: and a0, a0, a1
+; RV32I-NEXT: srli a1, a0, 1
+; RV32I-NEXT: andi a1, a1, 85
+; RV32I-NEXT: sub a0, a0, a1
+; RV32I-NEXT: andi a1, a0, 51
+; RV32I-NEXT: srli a0, a0, 2
+; RV32I-NEXT: andi a0, a0, 51
+; RV32I-NEXT: add a0, a1, a0
+; RV32I-NEXT: srli a1, a0, 4
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: andi a0, a0, 15
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB0_2:
+; RV32I-NEXT: li a0, 8
+; RV32I-NEXT: ret
;
; RV64NOZBB-LABEL: test_cttz_i8:
; RV64NOZBB: # %bb.0:
@@ -76,6 +76,29 @@ define i8 @test_cttz_i8(i8 %a) nounwind {
; RV64NOZBB-NEXT: li a0, 8
; RV64NOZBB-NEXT: ret
;
+; RV32M-LABEL: test_cttz_i8:
+; RV32M: # %bb.0:
+; RV32M-NEXT: andi a1, a0, 255
+; RV32M-NEXT: beqz a1, .LBB0_2
+; RV32M-NEXT: # %bb.1: # %cond.false
+; RV32M-NEXT: addi a1, a0, -1
+; RV32M-NEXT: not a0, a0
+; RV32M-NEXT: and a0, a0, a1
+; RV32M-NEXT: andi a0, a0, 255
+; RV32M-NEXT: lui a1, 32832
+; RV32M-NEXT: addi a1, a1, 513
+; RV32M-NEXT: mul a0, a0, a1
+; RV32M-NEXT: srli a0, a0, 3
+; RV32M-NEXT: lui a1, 69905
+; RV32M-NEXT: addi a1, a1, 273
+; RV32M-NEXT: and a0, a0, a1
+; RV32M-NEXT: mul a0, a0, a1
+; RV32M-NEXT: srli a0, a0, 28
+; RV32M-NEXT: ret
+; RV32M-NEXT: .LBB0_2:
+; RV32M-NEXT: li a0, 8
+; RV32M-NEXT: ret
+;
; RV32ZBB-LABEL: test_cttz_i8:
; RV32ZBB: # %bb.0:
; RV32ZBB-NEXT: ori a0, a0, 256
@@ -548,22 +571,22 @@ define i64 @test_cttz_i64(i64 %a) nounwind {
}
define i8 @test_cttz_i8_zero_undef(i8 %a) nounwind {
-; RV32_NOZBB-LABEL: test_cttz_i8_zero_undef:
-; RV32_NOZBB: # %bb.0:
-; RV32_NOZBB-NEXT: addi a1, a0, -1
-; RV32_NOZBB-NEXT: not a0, a0
-; RV32_NOZBB-NEXT: and a0, a0, a1
-; RV32_NOZBB-NEXT: srli a1, a0, 1
-; RV32_NOZBB-NEXT: andi a1, a1, 85
-; RV32_NOZBB-NEXT: sub a0, a0, a1
-; RV32_NOZBB-NEXT: andi a1, a0, 51
-; RV32_NOZBB-NEXT: srli a0, a0, 2
-; RV32_NOZBB-NEXT: andi a0, a0, 51
-; RV32_NOZBB-NEXT: add a0, a1, a0
-; RV32_NOZBB-NEXT: srli a1, a0, 4
-; RV32_NOZBB-NEXT: add a0, a0, a1
-; RV32_NOZBB-NEXT: andi a0, a0, 15
-; RV32_NOZBB-NEXT: ret
+; RV32I-LABEL: test_cttz_i8_zero_undef:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi a1, a0, -1
+; RV32I-NEXT: not a0, a0
+; RV32I-NEXT: and a0, a0, a1
+; RV32I-NEXT: srli a1, a0, 1
+; RV32I-NEXT: andi a1, a1, 85
+; RV32I-NEXT: sub a0, a0, a1
+; RV32I-NEXT: andi a1, a0, 51
+; RV32I-NEXT: srli a0, a0, 2
+; RV32I-NEXT: andi a0, a0, 51
+; RV32I-NEXT: add a0, a1, a0
+; RV32I-NEXT: srli a1, a0, 4
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: andi a0, a0, 15
+; RV32I-NEXT: ret
;
; RV64NOZBB-LABEL: test_cttz_i8_zero_undef:
; RV64NOZBB: # %bb.0:
@@ -582,6 +605,23 @@ define i8 @test_cttz_i8_zero_undef(i8 %a) nounwind {
; RV64NOZBB-NEXT: andi a0, a0, 15
; RV64NOZBB-NEXT: ret
;
+; RV32M-LABEL: test_cttz_i8_zero_undef:
+; RV32M: # %bb.0:
+; RV32M-NEXT: addi a1, a0, -1
+; RV32M-NEXT: not a0, a0
+; RV32M-NEXT: and a0, a0, a1
+; RV32M-NEXT: andi a0, a0, 255
+; RV32M-NEXT: lui a1, 32832
+; RV32M-NEXT: addi a1, a1, 513
+; RV32M-NEXT: mul a0, a0, a1
+; RV32M-NEXT: srli a0, a0, 3
+; RV32M-NEXT: lui a1, 69905
+; RV32M-NEXT: addi a1, a1, 273
+; RV32M-NEXT: and a0, a0, a1
+; RV32M-NEXT: mul a0, a0, a1
+; RV32M-NEXT: srli a0, a0, 28
+; RV32M-NEXT: ret
+;
; RV32ZBB-LABEL: test_cttz_i8_zero_undef:
; RV32ZBB: # %bb.0:
; RV32ZBB-NEXT: ctz a0, a0
@@ -954,35 +994,35 @@ define i64 @test_cttz_i64_zero_undef(i64 %a) nounwind {
}
define i8 @test_ctlz_i8(i8 %a) nounwind {
-; RV32_NOZBB-LABEL: test_ctlz_i8:
-; RV32_NOZBB: # %bb.0:
-; RV32_NOZBB-NEXT: andi a1, a0, 255
-; RV32_NOZBB-NEXT: beqz a1, .LBB8_2
-; RV32_NOZBB-NEXT: # %bb.1: # %cond.false
-; RV32_NOZBB-NEXT: slli a1, a0, 24
-; RV32_NOZBB-NEXT: srli a1, a1, 25
-; RV32_NOZBB-NEXT: or a0, a0, a1
-; RV32_NOZBB-NEXT: slli a1, a0, 24
-; RV32_NOZBB-NEXT: srli a1, a1, 26
-; RV32_NOZBB-NEXT: or a0, a0, a1
-; RV32_NOZBB-NEXT: slli a1, a0, 24
-; RV32_NOZBB-NEXT: srli a1, a1, 28
-; RV32_NOZBB-NEXT: or a0, a0, a1
-; RV32_NOZBB-NEXT: not a0, a0
-; RV32_NOZBB-NEXT: srli a1, a0, 1
-; RV32_NOZBB-NEXT: andi a1, a1, 85
-; RV32_NOZBB-NEXT: sub a0, a0, a1
-; RV32_NOZBB-NEXT: andi a1, a0, 51
-; RV32_NOZBB-NEXT: srli a0, a0, 2
-; RV32_NOZBB-NEXT: andi a0, a0, 51
-; RV32_NOZBB-NEXT: add a0, a1, a0
-; RV32_NOZBB-NEXT: srli a1, a0, 4
-; RV32_NOZBB-NEXT: add a0, a0, a1
-; RV32_NOZBB-NEXT: andi a0, a0, 15
-; RV32_NOZBB-NEXT: ret
-; RV32_NOZBB-NEXT: .LBB8_2:
-; RV32_NOZBB-NEXT: li a0, 8
-; RV32_NOZBB-NEXT: ret
+; RV32I-LABEL: test_ctlz_i8:
+; RV32I: # %bb.0:
+; RV32I-NEXT: andi a1, a0, 255
+; RV32I-NEXT: beqz a1, .LBB8_2
+; RV32I-NEXT: # %bb.1: # %cond.false
+; RV32I-NEXT: slli a1, a0, 24
+; RV32I-NEXT: srli a1, a1, 25
+; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: slli a1, a0, 24
+; RV32I-NEXT: srli a1, a1, 26
+; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: slli a1, a0, 24
+; RV32I-NEXT: srli a1, a1, 28
+; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: not a0, a0
+; RV32I-NEXT: srli a1, a0, 1
+; RV32I-NEXT: andi a1, a1, 85
+; RV32I-NEXT: sub a0, a0, a1
+; RV32I-NEXT: andi a1, a0, 51
+; RV32I-NEXT: srli a0, a0, 2
+; RV32I-NEXT: andi a0, a0, 51
+; RV32I-NEXT: add a0, a1, a0
+; RV32I-NEXT: srli a1, a0, 4
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: andi a0, a0, 15
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB8_2:
+; RV32I-NEXT: li a0, 8
+; RV32I-NEXT: ret
;
; RV64NOZBB-LABEL: test_ctlz_i8:
; RV64NOZBB: # %bb.0:
@@ -1014,6 +1054,36 @@ define i8 @test_ctlz_i8(i8 %a) nounwind {
; RV64NOZBB-NEXT: li a0, 8
; RV64NOZBB-NEXT: ret
;
+; RV32M-LABEL: test_ctlz_i8:
+; RV32M: # %bb.0:
+; RV32M-NEXT: andi a1, a0, 255
+; RV32M-NEXT: beqz a1, .LBB8_2
+; RV32M-NEXT: # %bb.1: # %cond.false
+; RV32M-NEXT: slli a1, a0, 24
+; RV32M-NEXT: srli a1, a1, 25
+; RV32M-NEXT: or a0, a0, a1
+; RV32M-NEXT: slli a1, a0, 24
+; RV32M-NEXT: srli a1, a1, 26
+; RV32M-NEXT: or a0, a0, a1
+; RV32M-NEXT: slli a1, a0, 24
+; RV32M-NEXT: srli a1, a1, 28
+; RV32M-NEXT: or a0, a0, a1
+; RV32M-NEXT: not a0, a0
+; RV32M-NEXT: andi a0, a0, 255
+; RV32M-NEXT: lui a1, 32832
+; RV32M-NEXT: addi a1, a1, 513
+; RV32M-NEXT: mul a0, a0, a1
+; RV32M-NEXT: srli a0, a0, 3
+; RV32M-NEXT: lui a1, 69905
+; RV32M-NEXT: addi a1, a1, 273
+; RV32M-NEXT: and a0, a0, a1
+; RV32M-NEXT: mul a0, a0, a1
+; RV32M-NEXT: srli a0, a0, 28
+; RV32M-NEXT: ret
+; RV32M-NEXT: .LBB8_2:
+; RV32M-NEXT: li a0, 8
+; RV32M-NEXT: ret
+;
; RV32ZBB-LABEL: test_ctlz_i8:
; RV32ZBB: # %bb.0:
; RV32ZBB-NEXT: andi a0, a0, 255
@@ -1649,29 +1719,29 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
}
define i8 @test_ctlz_i8_zero_undef(i8 %a) nounwind {
-; RV32_NOZBB-LABEL: test_ctlz_i8_zero_undef:
-; RV32_NOZBB: # %bb.0:
-; RV32_NOZBB-NEXT: slli a1, a0, 24
-; RV32_NOZBB-NEXT: srli a1, a1, 25
-; RV32_NOZBB-NEXT: or a0, a0, a1
-; RV32_NOZBB-NEXT: slli a1, a0, 24
-; RV32_NOZBB-NEXT: srli a1, a1, 26
-; RV32_NOZBB-NEXT: or a0, a0, a1
-; RV32_NOZBB-NEXT: slli a1, a0, 24
-; RV32_NOZBB-NEXT: srli a1, a1, 28
-; RV32_NOZBB-NEXT: or a0, a0, a1
-; RV32_NOZBB-NEXT: not a0, a0
-; RV32_NOZBB-NEXT: srli a1, a0, 1
-; RV32_NOZBB-NEXT: andi a1, a1, 85
-; RV32_NOZBB-NEXT: sub a0, a0, a1
-; RV32_NOZBB-NEXT: andi a1, a0, 51
-; RV32_NOZBB-NEXT: srli a0, a0, 2
-; RV32_NOZBB-NEXT: andi a0, a0, 51
-; RV32_NOZBB-NEXT: add a0, a1, a0
-; RV32_NOZBB-NEXT: srli a1, a0, 4
-; RV32_NOZBB-NEXT: add a0, a0, a1
-; RV32_NOZBB-NEXT: andi a0, a0, 15
-; RV32_NOZBB-NEXT: ret
+; RV32I-LABEL: test_ctlz_i8_zero_undef:
+; RV32I: # %bb.0:
+; RV32I-NEXT: slli a1, a0, 24
+; RV32I-NEXT: srli a1, a1, 25
+; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: slli a1, a0, 24
+; RV32I-NEXT: srli a1, a1, 26
+; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: slli a1, a0, 24
+; RV32I-NEXT: srli a1, a1, 28
+; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: not a0, a0
+; RV32I-NEXT: srli a1, a0, 1
+; RV32I-NEXT: andi a1, a1, 85
+; RV32I-NEXT: sub a0, a0, a1
+; RV32I-NEXT: andi a1, a0, 51
+; RV32I-NEXT: srli a0, a0, 2
+; RV32I-NEXT: andi a0, a0, 51
+; RV32I-NEXT: add a0, a1, a0
+; RV32I-NEXT: srli a1, a0, 4
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: andi a0, a0, 15
+; RV32I-NEXT: ret
;
; RV64NOZBB-LABEL: test_ctlz_i8_zero_undef:
; RV64NOZBB: # %bb.0:
@@ -1697,6 +1767,30 @@ define i8 @test_ctlz_i8_zero_undef(i8 %a) nounwind {
; RV64NOZBB-NEXT: andi a0, a0, 15
; RV64NOZBB-NEXT: ret
;
+; RV32M-LABEL: test_ctlz_i8_zero_undef:
+; RV32M: # %bb.0:
+; RV32M-NEXT: slli a1, a0, 24
+; RV32M-NEXT: srli a1, a1, 25
+; RV32M-NEXT: or a0, a0, a1
+; RV32M-NEXT: slli a1, a0, 24
+; RV32M-NEXT: srli a1, a1, 26
+; RV32M-NEXT: or a0, a0, a1
+; RV32M-NEXT: slli a1, a0, 24
+; RV32M-NEXT: srli a1, a1, 28
+; RV32M-NEXT: or a0, a0, a1
+; RV32M-NEXT: not a0, a0
+; RV32M-NEXT: andi a0, a0, 255
+; RV32M-NEXT: lui a1, 32832
+; RV32M-NEXT: addi a1, a1, 513
+; RV32M-NEXT: mul a0, a0, a1
+; RV32M-NEXT: srli a0, a0, 3
+; RV32M-NEXT: lui a1, 69905
+; RV32M-NEXT: addi a1, a1, 273
+; RV32M-NEXT: and a0, a0, a1
+; RV32M-NEXT: mul a0, a0, a1
+; RV32M-NEXT: srli a0, a0, 28
+; RV32M-NEXT: ret
+;
; RV32ZBB-LABEL: test_ctlz_i8_zero_undef:
; RV32ZBB: # %bb.0:
; RV32ZBB-NEXT: andi a0, a0, 255
@@ -2290,19 +2384,19 @@ define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind {
}
define i8 @test_ctpop_i8(i8 %a) nounwind {
-; RV32_NOZBB-LABEL: test_ctpop_i8:
-; RV32_NOZBB: # %bb.0:
-; RV32_NOZBB-NEXT: srli a1, a0, 1
-; RV32_NOZBB-NEXT: andi a1, a1, 85
-; RV32_NOZBB-NEXT: sub a0, a0, a1
-; RV32_NOZBB-NEXT: andi a1, a0, 51
-; RV32_NOZBB-NEXT: srli a0, a0, 2
-; RV32_NOZBB-NEXT: andi a0, a0, 51
-; RV32_NOZBB-NEXT: add a0, a1, a0
-; RV32_NOZBB-NEXT: srli a1, a0, 4
-; RV32_NOZBB-NEXT: add a0, a0, a1
-; RV32_NOZBB-NEXT: andi a0, a0, 15
-; RV32_NOZBB-NEXT: ret
+; RV32I-LABEL: test_ctpop_i8:
+; RV32I: # %bb.0:
+; RV32I-NEXT: srli a1, a0, 1
+; RV32I-NEXT: andi a1, a1, 85
+; RV32I-NEXT: sub a0, a0, a1
+; RV32I-NEXT: andi a1, a0, 51
+; RV32I-NEXT: srli a0, a0, 2
+; RV32I-NEXT: andi a0, a0, 51
+; RV32I-NEXT: add a0, a1, a0
+; RV32I-NEXT: srli a1, a0, 4
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: andi a0, a0, 15
+; RV32I-NEXT: ret
;
; RV64NOZBB-LABEL: test_ctpop_i8:
; RV64NOZBB: # %bb.0:
@@ -2318,6 +2412,20 @@ define i8 @test_ctpop_i8(i8 %a) nounwind {
; RV64NOZBB-NEXT: andi a0, a0, 15
; RV64NOZBB-NEXT: ret
;
+; RV32M-LABEL: test_ctpop_i8:
+; RV32M: # %bb.0:
+; RV32M-NEXT: andi a0, a0, 255
+; RV32M-NEXT: lui a1, 32832
+; RV32M-NEXT: addi a1, a1, 513
+; RV32M-NEXT: mul a0, a0, a1
+; RV32M-NEXT: srli a0, a0, 3
+; RV32M-NEXT: lui a1, 69905
+; RV32M-NEXT: addi a1, a1, 273
+; RV32M-NEXT: and a0, a0, a1
+; RV32M-NEXT: mul a0, a0, a1
+; RV32M-NEXT: srli a0, a0, 28
+; RV32M-NEXT: ret
+;
; RV32ZBB-LABEL: test_ctpop_i8:
; RV32ZBB: # %bb.0:
; RV32ZBB-NEXT: andi a0, a0, 255
diff --git a/llvm/test/CodeGen/X86/ctpop-combine.ll b/llvm/test/CodeGen/X86/ctpop-combine.ll
index fba44218e0572..73152e9f909cf 100644
--- a/llvm/test/CodeGen/X86/ctpop-combine.ll
+++ b/llvm/test/CodeGen/X86/ctpop-combine.ll
@@ -88,20 +88,13 @@ define i8 @test4(i8 %x) nounwind readnone {
;
; NO-POPCOUNT-LABEL: test4:
; NO-POPCOUNT: # %bb.0:
-; NO-POPCOUNT-NEXT: movl %edi, %ecx
-; NO-POPCOUNT-NEXT: andb $127, %cl
-; NO-POPCOUNT-NEXT: shrb %dil
-; NO-POPCOUNT-NEXT: andb $21, %dil
-; NO-POPCOUNT-NEXT: subb %dil, %cl
-; NO-POPCOUNT-NEXT: movl %ecx, %eax
-; NO-POPCOUNT-NEXT: andb $51, %al
-; NO-POPCOUNT-NEXT: shrb $2, %cl
-; NO-POPCOUNT-NEXT: andb $51, %cl
-; NO-POPCOUNT-NEXT: addb %al, %cl
-; NO-POPCOUNT-NEXT: movl %ecx, %eax
-; NO-POPCOUNT-NEXT: shrb $4, %al
-; NO-POPCOUNT-NEXT: addb %cl, %al
-; NO-POPCOUNT-NEXT: andb $15, %al
+; NO-POPCOUNT-NEXT: andl $127, %edi
+; NO-POPCOUNT-NEXT: imull $134480385, %edi, %eax # imm = 0x8040201
+; NO-POPCOUNT-NEXT: shrl $3, %eax
+; NO-POPCOUNT-NEXT: andl $286331153, %eax # imm = 0x11111111
+; NO-POPCOUNT-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111
+; NO-POPCOUNT-NEXT: shrl $28, %eax
+; NO-POPCOUNT-NEXT: # kill: def $al killed $al killed $eax
; NO-POPCOUNT-NEXT: retq
%x2 = and i8 %x, 127
%count = tail call i8 @llvm.ctpop.i8(i8 %x2)
diff --git a/llvm/test/CodeGen/X86/popcnt.ll b/llvm/test/CodeGen/X86/popcnt.ll
index a9d77fd2c0a61..c8d060dfee182 100644
--- a/llvm/test/CodeGen/X86/popcnt.ll
+++ b/llvm/test/CodeGen/X86/popcnt.ll
@@ -10,37 +10,24 @@
define i8 @cnt8(i8 %x) nounwind readnone {
; X86-LABEL: cnt8:
; X86: # %bb.0:
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shrb %al
-; X86-NEXT: andb $85, %al
-; X86-NEXT: subb %al, %cl
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: andb $51, %al
-; X86-NEXT: shrb $2, %cl
-; X86-NEXT: andb $51, %cl
-; X86-NEXT: addb %al, %cl
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shrb $4, %al
-; X86-NEXT: addb %cl, %al
-; X86-NEXT: andb $15, %al
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201
+; X86-NEXT: shrl $3, %eax
+; X86-NEXT: andl $286331153, %eax # imm = 0x11111111
+; X86-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111
+; X86-NEXT: shrl $28, %eax
+; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: retl
;
; X64-LABEL: cnt8:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: shrb %al
-; X64-NEXT: andb $85, %al
-; X64-NEXT: subb %al, %dil
-; X64-NEXT: movl %edi, %ecx
-; X64-NEXT: andb $51, %cl
-; X64-NEXT: shrb $2, %dil
-; X64-NEXT: andb $51, %dil
-; X64-NEXT: addb %dil, %cl
-; X64-NEXT: movl %ecx, %eax
-; X64-NEXT: shrb $4, %al
-; X64-NEXT: addb %cl, %al
-; X64-NEXT: andb $15, %al
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201
+; X64-NEXT: shrl $3, %eax
+; X64-NEXT: andl $286331153, %eax # imm = 0x11111111
+; X64-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111
+; X64-NEXT: shrl $28, %eax
+; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
;
; X86-POPCNT-LABEL: cnt8:
@@ -59,16 +46,13 @@ define i8 @cnt8(i8 %x) nounwind readnone {
;
; X64-NDD-LABEL: cnt8:
; X64-NDD: # %bb.0:
-; X64-NDD-NEXT: shrb %dil, %al
-; X64-NDD-NEXT: andb $85, %al
-; X64-NDD-NEXT: subb %al, %dil, %al
-; X64-NDD-NEXT: andb $51, %al, %cl
-; X64-NDD-NEXT: shrb $2, %al
-; X64-NDD-NEXT: andb $51, %al
-; X64-NDD-NEXT: addb %cl, %al
-; X64-NDD-NEXT: shrb $4, %al, %cl
-; X64-NDD-NEXT: addb %cl, %al
-; X64-NDD-NEXT: andb $15, %al
+; X64-NDD-NEXT: movzbl %dil, %eax
+; X64-NDD-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201
+; X64-NDD-NEXT: shrl $3, %eax
+; X64-NDD-NEXT: andl $286331153, %eax # imm = 0x11111111
+; X64-NDD-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111
+; X64-NDD-NEXT: shrl $28, %eax
+; X64-NDD-NEXT: # kill: def $al killed $al killed $eax
; X64-NDD-NEXT: retq
%cnt = tail call i8 @llvm.ctpop.i8(i8 %x)
ret i8 %cnt
More information about the llvm-commits mailing list