[llvm] [DAG] Add generic i8 CTPOP lowering using i32 MUL (PR #79989)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Tue Jan 30 04:18:18 PST 2024


https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/79989

>From 6e95ebf3a789b735f1b49f2099d99c24d74af359 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Tue, 30 Jan 2024 11:50:44 +0000
Subject: [PATCH] [DAG] Add generic i8 CTPOP lowering using i32 MUL

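When SRL and MUL are both legal for i32, expand an i8 CTPOP in i32 instead
of using the generic shift/mask/add sequence: zero-extend the operand,
multiply by 0x08040201 to replicate the byte, shift right by 3 and mask
with 0x11111111 so that each source bit ends up in the low bit of its own
nibble, then multiply by 0x11111111 and shift right by 28 to sum the eight
nibbles into the result.

Fixes #79823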
---
 .../CodeGen/SelectionDAG/TargetLowering.cpp   |  15 +
 .../test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll |  20 +-
 llvm/test/CodeGen/PowerPC/popcnt-zext.ll      |  44 +--
 llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll    | 314 ++++++++++++------
 llvm/test/CodeGen/X86/ctpop-combine.ll        |  21 +-
 llvm/test/CodeGen/X86/popcnt.ll               |  58 ++--
 6 files changed, 288 insertions(+), 184 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index b8ed02e268b18..2eb68485c777a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8639,6 +8639,21 @@ SDValue TargetLowering::expandCTPOP(SDNode *Node, SelectionDAG &DAG) const {
   if (VT.isVector() && !canExpandVectorCTPOP(*this, VT))
     return SDValue();
 
+  if (VT == MVT::i8 && isOperationLegal(ISD::SRL, MVT::i32) &&
+      isOperationLegal(ISD::MUL, MVT::i32)) {
+    SDValue Mask11 = DAG.getConstant(0x11111111U, dl, MVT::i32);
+    Op = DAG.getZExtOrTrunc(Op, dl, MVT::i32);
+    Op = DAG.getNode(ISD::MUL, dl, MVT::i32, Op,
+                     DAG.getConstant(0x08040201U, dl, MVT::i32));
+    Op = DAG.getNode(ISD::SRL, dl, MVT::i32, Op,
+                     DAG.getShiftAmountConstant(3, MVT::i32, dl));
+    Op = DAG.getNode(ISD::AND, dl, MVT::i32, Op, Mask11);
+    Op = DAG.getNode(ISD::MUL, dl, MVT::i32, Op, Mask11);
+    Op = DAG.getNode(ISD::SRL, dl, MVT::i32, Op,
+                     DAG.getShiftAmountConstant(28, MVT::i32, dl));
+    return DAG.getZExtOrTrunc(Op, dl, MVT::i8);
+  }
+
   // This is the "best" algorithm from
   // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
   SDValue Mask55 =
diff --git a/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
index fa4fda9b8972b..b60aab1e059d2 100644
--- a/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
@@ -169,16 +169,16 @@ define i64 @test_not_ctlz_i64(i64 %a) nounwind {
 define i8 @test_ctpop_i8(i8 %a) nounwind {
 ; LA32-LABEL: test_ctpop_i8:
 ; LA32:       # %bb.0:
-; LA32-NEXT:    srli.w $a1, $a0, 1
-; LA32-NEXT:    andi $a1, $a1, 85
-; LA32-NEXT:    sub.w $a0, $a0, $a1
-; LA32-NEXT:    andi $a1, $a0, 51
-; LA32-NEXT:    srli.w $a0, $a0, 2
-; LA32-NEXT:    andi $a0, $a0, 51
-; LA32-NEXT:    add.w $a0, $a1, $a0
-; LA32-NEXT:    srli.w $a1, $a0, 4
-; LA32-NEXT:    add.w $a0, $a0, $a1
-; LA32-NEXT:    andi $a0, $a0, 15
+; LA32-NEXT:    lu12i.w $a1, 32832
+; LA32-NEXT:    ori $a1, $a1, 513
+; LA32-NEXT:    andi $a0, $a0, 255
+; LA32-NEXT:    mul.w $a0, $a0, $a1
+; LA32-NEXT:    srli.w $a0, $a0, 3
+; LA32-NEXT:    lu12i.w $a1, 69905
+; LA32-NEXT:    ori $a1, $a1, 273
+; LA32-NEXT:    and $a0, $a0, $a1
+; LA32-NEXT:    mul.w $a0, $a0, $a1
+; LA32-NEXT:    srli.w $a0, $a0, 28
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: test_ctpop_i8:
diff --git a/llvm/test/CodeGen/PowerPC/popcnt-zext.ll b/llvm/test/CodeGen/PowerPC/popcnt-zext.ll
index fccf671e4c197..48438ad0139a1 100644
--- a/llvm/test/CodeGen/PowerPC/popcnt-zext.ll
+++ b/llvm/test/CodeGen/PowerPC/popcnt-zext.ll
@@ -41,16 +41,18 @@ define i16 @popz_i8_i16(i8 %x) {
 ;
 ; SLOW-LABEL: popz_i8_i16:
 ; SLOW:       # %bb.0:
-; SLOW-NEXT:    rotlwi 4, 3, 31
-; SLOW-NEXT:    andi. 4, 4, 85
-; SLOW-NEXT:    sub 3, 3, 4
-; SLOW-NEXT:    rlwinm 4, 3, 30, 30, 31
-; SLOW-NEXT:    rlwimi 4, 3, 30, 26, 27
-; SLOW-NEXT:    andi. 3, 3, 51
-; SLOW-NEXT:    add 3, 3, 4
-; SLOW-NEXT:    srwi 4, 3, 4
-; SLOW-NEXT:    add 3, 3, 4
-; SLOW-NEXT:    clrlwi 3, 3, 28
+; SLOW-NEXT:    lis 4, 2052
+; SLOW-NEXT:    clrlwi 3, 3, 24
+; SLOW-NEXT:    ori 4, 4, 513
+; SLOW-NEXT:    mullw 3, 3, 4
+; SLOW-NEXT:    rotlwi 3, 3, 29
+; SLOW-NEXT:    andis. 4, 3, 4369
+; SLOW-NEXT:    andi. 3, 3, 4369
+; SLOW-NEXT:    or 3, 3, 4
+; SLOW-NEXT:    lis 4, 4369
+; SLOW-NEXT:    ori 4, 4, 4369
+; SLOW-NEXT:    mullw 3, 3, 4
+; SLOW-NEXT:    rlwinm 3, 3, 4, 28, 31
 ; SLOW-NEXT:    blr
   %pop = tail call i8 @llvm.ctpop.i8(i8 %x)
   %z = zext i8 %pop to i16
@@ -102,16 +104,18 @@ define i32 @popz_i8_32(i8 %x) {
 ;
 ; SLOW-LABEL: popz_i8_32:
 ; SLOW:       # %bb.0:
-; SLOW-NEXT:    rotlwi 4, 3, 31
-; SLOW-NEXT:    andi. 4, 4, 85
-; SLOW-NEXT:    sub 3, 3, 4
-; SLOW-NEXT:    rlwinm 4, 3, 30, 30, 31
-; SLOW-NEXT:    rlwimi 4, 3, 30, 26, 27
-; SLOW-NEXT:    andi. 3, 3, 51
-; SLOW-NEXT:    add 3, 3, 4
-; SLOW-NEXT:    srwi 4, 3, 4
-; SLOW-NEXT:    add 3, 3, 4
-; SLOW-NEXT:    clrlwi 3, 3, 28
+; SLOW-NEXT:    lis 4, 2052
+; SLOW-NEXT:    clrlwi 3, 3, 24
+; SLOW-NEXT:    ori 4, 4, 513
+; SLOW-NEXT:    mullw 3, 3, 4
+; SLOW-NEXT:    rotlwi 3, 3, 29
+; SLOW-NEXT:    andis. 4, 3, 4369
+; SLOW-NEXT:    andi. 3, 3, 4369
+; SLOW-NEXT:    or 3, 3, 4
+; SLOW-NEXT:    lis 4, 4369
+; SLOW-NEXT:    ori 4, 4, 4369
+; SLOW-NEXT:    mullw 3, 3, 4
+; SLOW-NEXT:    rlwinm 3, 3, 4, 28, 31
 ; SLOW-NEXT:    blr
   %pop = tail call i8 @llvm.ctpop.i8(i8 %x)
   %z = zext i8 %pop to i32
diff --git a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
index 455e6e54c9b39..2d26dacbe0edc 100644
--- a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
@@ -30,28 +30,28 @@ declare i32 @llvm.ctpop.i32(i32)
 declare i64 @llvm.ctpop.i64(i64)
 
 define i8 @test_cttz_i8(i8 %a) nounwind {
-; RV32_NOZBB-LABEL: test_cttz_i8:
-; RV32_NOZBB:       # %bb.0:
-; RV32_NOZBB-NEXT:    andi a1, a0, 255
-; RV32_NOZBB-NEXT:    beqz a1, .LBB0_2
-; RV32_NOZBB-NEXT:  # %bb.1: # %cond.false
-; RV32_NOZBB-NEXT:    addi a1, a0, -1
-; RV32_NOZBB-NEXT:    not a0, a0
-; RV32_NOZBB-NEXT:    and a0, a0, a1
-; RV32_NOZBB-NEXT:    srli a1, a0, 1
-; RV32_NOZBB-NEXT:    andi a1, a1, 85
-; RV32_NOZBB-NEXT:    sub a0, a0, a1
-; RV32_NOZBB-NEXT:    andi a1, a0, 51
-; RV32_NOZBB-NEXT:    srli a0, a0, 2
-; RV32_NOZBB-NEXT:    andi a0, a0, 51
-; RV32_NOZBB-NEXT:    add a0, a1, a0
-; RV32_NOZBB-NEXT:    srli a1, a0, 4
-; RV32_NOZBB-NEXT:    add a0, a0, a1
-; RV32_NOZBB-NEXT:    andi a0, a0, 15
-; RV32_NOZBB-NEXT:    ret
-; RV32_NOZBB-NEXT:  .LBB0_2:
-; RV32_NOZBB-NEXT:    li a0, 8
-; RV32_NOZBB-NEXT:    ret
+; RV32I-LABEL: test_cttz_i8:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    andi a1, a0, 255
+; RV32I-NEXT:    beqz a1, .LBB0_2
+; RV32I-NEXT:  # %bb.1: # %cond.false
+; RV32I-NEXT:    addi a1, a0, -1
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    srli a1, a0, 1
+; RV32I-NEXT:    andi a1, a1, 85
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    andi a1, a0, 51
+; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    andi a0, a0, 51
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    srli a1, a0, 4
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    andi a0, a0, 15
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB0_2:
+; RV32I-NEXT:    li a0, 8
+; RV32I-NEXT:    ret
 ;
 ; RV64NOZBB-LABEL: test_cttz_i8:
 ; RV64NOZBB:       # %bb.0:
@@ -76,6 +76,29 @@ define i8 @test_cttz_i8(i8 %a) nounwind {
 ; RV64NOZBB-NEXT:    li a0, 8
 ; RV64NOZBB-NEXT:    ret
 ;
+; RV32M-LABEL: test_cttz_i8:
+; RV32M:       # %bb.0:
+; RV32M-NEXT:    andi a1, a0, 255
+; RV32M-NEXT:    beqz a1, .LBB0_2
+; RV32M-NEXT:  # %bb.1: # %cond.false
+; RV32M-NEXT:    addi a1, a0, -1
+; RV32M-NEXT:    not a0, a0
+; RV32M-NEXT:    and a0, a0, a1
+; RV32M-NEXT:    andi a0, a0, 255
+; RV32M-NEXT:    lui a1, 32832
+; RV32M-NEXT:    addi a1, a1, 513
+; RV32M-NEXT:    mul a0, a0, a1
+; RV32M-NEXT:    srli a0, a0, 3
+; RV32M-NEXT:    lui a1, 69905
+; RV32M-NEXT:    addi a1, a1, 273
+; RV32M-NEXT:    and a0, a0, a1
+; RV32M-NEXT:    mul a0, a0, a1
+; RV32M-NEXT:    srli a0, a0, 28
+; RV32M-NEXT:    ret
+; RV32M-NEXT:  .LBB0_2:
+; RV32M-NEXT:    li a0, 8
+; RV32M-NEXT:    ret
+;
 ; RV32ZBB-LABEL: test_cttz_i8:
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    ori a0, a0, 256
@@ -548,22 +571,22 @@ define i64 @test_cttz_i64(i64 %a) nounwind {
 }
 
 define i8 @test_cttz_i8_zero_undef(i8 %a) nounwind {
-; RV32_NOZBB-LABEL: test_cttz_i8_zero_undef:
-; RV32_NOZBB:       # %bb.0:
-; RV32_NOZBB-NEXT:    addi a1, a0, -1
-; RV32_NOZBB-NEXT:    not a0, a0
-; RV32_NOZBB-NEXT:    and a0, a0, a1
-; RV32_NOZBB-NEXT:    srli a1, a0, 1
-; RV32_NOZBB-NEXT:    andi a1, a1, 85
-; RV32_NOZBB-NEXT:    sub a0, a0, a1
-; RV32_NOZBB-NEXT:    andi a1, a0, 51
-; RV32_NOZBB-NEXT:    srli a0, a0, 2
-; RV32_NOZBB-NEXT:    andi a0, a0, 51
-; RV32_NOZBB-NEXT:    add a0, a1, a0
-; RV32_NOZBB-NEXT:    srli a1, a0, 4
-; RV32_NOZBB-NEXT:    add a0, a0, a1
-; RV32_NOZBB-NEXT:    andi a0, a0, 15
-; RV32_NOZBB-NEXT:    ret
+; RV32I-LABEL: test_cttz_i8_zero_undef:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi a1, a0, -1
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    srli a1, a0, 1
+; RV32I-NEXT:    andi a1, a1, 85
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    andi a1, a0, 51
+; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    andi a0, a0, 51
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    srli a1, a0, 4
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    andi a0, a0, 15
+; RV32I-NEXT:    ret
 ;
 ; RV64NOZBB-LABEL: test_cttz_i8_zero_undef:
 ; RV64NOZBB:       # %bb.0:
@@ -582,6 +605,23 @@ define i8 @test_cttz_i8_zero_undef(i8 %a) nounwind {
 ; RV64NOZBB-NEXT:    andi a0, a0, 15
 ; RV64NOZBB-NEXT:    ret
 ;
+; RV32M-LABEL: test_cttz_i8_zero_undef:
+; RV32M:       # %bb.0:
+; RV32M-NEXT:    addi a1, a0, -1
+; RV32M-NEXT:    not a0, a0
+; RV32M-NEXT:    and a0, a0, a1
+; RV32M-NEXT:    andi a0, a0, 255
+; RV32M-NEXT:    lui a1, 32832
+; RV32M-NEXT:    addi a1, a1, 513
+; RV32M-NEXT:    mul a0, a0, a1
+; RV32M-NEXT:    srli a0, a0, 3
+; RV32M-NEXT:    lui a1, 69905
+; RV32M-NEXT:    addi a1, a1, 273
+; RV32M-NEXT:    and a0, a0, a1
+; RV32M-NEXT:    mul a0, a0, a1
+; RV32M-NEXT:    srli a0, a0, 28
+; RV32M-NEXT:    ret
+;
 ; RV32ZBB-LABEL: test_cttz_i8_zero_undef:
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    ctz a0, a0
@@ -954,35 +994,35 @@ define i64 @test_cttz_i64_zero_undef(i64 %a) nounwind {
 }
 
 define i8 @test_ctlz_i8(i8 %a) nounwind {
-; RV32_NOZBB-LABEL: test_ctlz_i8:
-; RV32_NOZBB:       # %bb.0:
-; RV32_NOZBB-NEXT:    andi a1, a0, 255
-; RV32_NOZBB-NEXT:    beqz a1, .LBB8_2
-; RV32_NOZBB-NEXT:  # %bb.1: # %cond.false
-; RV32_NOZBB-NEXT:    slli a1, a0, 24
-; RV32_NOZBB-NEXT:    srli a1, a1, 25
-; RV32_NOZBB-NEXT:    or a0, a0, a1
-; RV32_NOZBB-NEXT:    slli a1, a0, 24
-; RV32_NOZBB-NEXT:    srli a1, a1, 26
-; RV32_NOZBB-NEXT:    or a0, a0, a1
-; RV32_NOZBB-NEXT:    slli a1, a0, 24
-; RV32_NOZBB-NEXT:    srli a1, a1, 28
-; RV32_NOZBB-NEXT:    or a0, a0, a1
-; RV32_NOZBB-NEXT:    not a0, a0
-; RV32_NOZBB-NEXT:    srli a1, a0, 1
-; RV32_NOZBB-NEXT:    andi a1, a1, 85
-; RV32_NOZBB-NEXT:    sub a0, a0, a1
-; RV32_NOZBB-NEXT:    andi a1, a0, 51
-; RV32_NOZBB-NEXT:    srli a0, a0, 2
-; RV32_NOZBB-NEXT:    andi a0, a0, 51
-; RV32_NOZBB-NEXT:    add a0, a1, a0
-; RV32_NOZBB-NEXT:    srli a1, a0, 4
-; RV32_NOZBB-NEXT:    add a0, a0, a1
-; RV32_NOZBB-NEXT:    andi a0, a0, 15
-; RV32_NOZBB-NEXT:    ret
-; RV32_NOZBB-NEXT:  .LBB8_2:
-; RV32_NOZBB-NEXT:    li a0, 8
-; RV32_NOZBB-NEXT:    ret
+; RV32I-LABEL: test_ctlz_i8:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    andi a1, a0, 255
+; RV32I-NEXT:    beqz a1, .LBB8_2
+; RV32I-NEXT:  # %bb.1: # %cond.false
+; RV32I-NEXT:    slli a1, a0, 24
+; RV32I-NEXT:    srli a1, a1, 25
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    slli a1, a0, 24
+; RV32I-NEXT:    srli a1, a1, 26
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    slli a1, a0, 24
+; RV32I-NEXT:    srli a1, a1, 28
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    srli a1, a0, 1
+; RV32I-NEXT:    andi a1, a1, 85
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    andi a1, a0, 51
+; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    andi a0, a0, 51
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    srli a1, a0, 4
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    andi a0, a0, 15
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB8_2:
+; RV32I-NEXT:    li a0, 8
+; RV32I-NEXT:    ret
 ;
 ; RV64NOZBB-LABEL: test_ctlz_i8:
 ; RV64NOZBB:       # %bb.0:
@@ -1014,6 +1054,36 @@ define i8 @test_ctlz_i8(i8 %a) nounwind {
 ; RV64NOZBB-NEXT:    li a0, 8
 ; RV64NOZBB-NEXT:    ret
 ;
+; RV32M-LABEL: test_ctlz_i8:
+; RV32M:       # %bb.0:
+; RV32M-NEXT:    andi a1, a0, 255
+; RV32M-NEXT:    beqz a1, .LBB8_2
+; RV32M-NEXT:  # %bb.1: # %cond.false
+; RV32M-NEXT:    slli a1, a0, 24
+; RV32M-NEXT:    srli a1, a1, 25
+; RV32M-NEXT:    or a0, a0, a1
+; RV32M-NEXT:    slli a1, a0, 24
+; RV32M-NEXT:    srli a1, a1, 26
+; RV32M-NEXT:    or a0, a0, a1
+; RV32M-NEXT:    slli a1, a0, 24
+; RV32M-NEXT:    srli a1, a1, 28
+; RV32M-NEXT:    or a0, a0, a1
+; RV32M-NEXT:    not a0, a0
+; RV32M-NEXT:    andi a0, a0, 255
+; RV32M-NEXT:    lui a1, 32832
+; RV32M-NEXT:    addi a1, a1, 513
+; RV32M-NEXT:    mul a0, a0, a1
+; RV32M-NEXT:    srli a0, a0, 3
+; RV32M-NEXT:    lui a1, 69905
+; RV32M-NEXT:    addi a1, a1, 273
+; RV32M-NEXT:    and a0, a0, a1
+; RV32M-NEXT:    mul a0, a0, a1
+; RV32M-NEXT:    srli a0, a0, 28
+; RV32M-NEXT:    ret
+; RV32M-NEXT:  .LBB8_2:
+; RV32M-NEXT:    li a0, 8
+; RV32M-NEXT:    ret
+;
 ; RV32ZBB-LABEL: test_ctlz_i8:
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    andi a0, a0, 255
@@ -1649,29 +1719,29 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
 }
 
 define i8 @test_ctlz_i8_zero_undef(i8 %a) nounwind {
-; RV32_NOZBB-LABEL: test_ctlz_i8_zero_undef:
-; RV32_NOZBB:       # %bb.0:
-; RV32_NOZBB-NEXT:    slli a1, a0, 24
-; RV32_NOZBB-NEXT:    srli a1, a1, 25
-; RV32_NOZBB-NEXT:    or a0, a0, a1
-; RV32_NOZBB-NEXT:    slli a1, a0, 24
-; RV32_NOZBB-NEXT:    srli a1, a1, 26
-; RV32_NOZBB-NEXT:    or a0, a0, a1
-; RV32_NOZBB-NEXT:    slli a1, a0, 24
-; RV32_NOZBB-NEXT:    srli a1, a1, 28
-; RV32_NOZBB-NEXT:    or a0, a0, a1
-; RV32_NOZBB-NEXT:    not a0, a0
-; RV32_NOZBB-NEXT:    srli a1, a0, 1
-; RV32_NOZBB-NEXT:    andi a1, a1, 85
-; RV32_NOZBB-NEXT:    sub a0, a0, a1
-; RV32_NOZBB-NEXT:    andi a1, a0, 51
-; RV32_NOZBB-NEXT:    srli a0, a0, 2
-; RV32_NOZBB-NEXT:    andi a0, a0, 51
-; RV32_NOZBB-NEXT:    add a0, a1, a0
-; RV32_NOZBB-NEXT:    srli a1, a0, 4
-; RV32_NOZBB-NEXT:    add a0, a0, a1
-; RV32_NOZBB-NEXT:    andi a0, a0, 15
-; RV32_NOZBB-NEXT:    ret
+; RV32I-LABEL: test_ctlz_i8_zero_undef:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    slli a1, a0, 24
+; RV32I-NEXT:    srli a1, a1, 25
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    slli a1, a0, 24
+; RV32I-NEXT:    srli a1, a1, 26
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    slli a1, a0, 24
+; RV32I-NEXT:    srli a1, a1, 28
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    srli a1, a0, 1
+; RV32I-NEXT:    andi a1, a1, 85
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    andi a1, a0, 51
+; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    andi a0, a0, 51
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    srli a1, a0, 4
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    andi a0, a0, 15
+; RV32I-NEXT:    ret
 ;
 ; RV64NOZBB-LABEL: test_ctlz_i8_zero_undef:
 ; RV64NOZBB:       # %bb.0:
@@ -1697,6 +1767,30 @@ define i8 @test_ctlz_i8_zero_undef(i8 %a) nounwind {
 ; RV64NOZBB-NEXT:    andi a0, a0, 15
 ; RV64NOZBB-NEXT:    ret
 ;
+; RV32M-LABEL: test_ctlz_i8_zero_undef:
+; RV32M:       # %bb.0:
+; RV32M-NEXT:    slli a1, a0, 24
+; RV32M-NEXT:    srli a1, a1, 25
+; RV32M-NEXT:    or a0, a0, a1
+; RV32M-NEXT:    slli a1, a0, 24
+; RV32M-NEXT:    srli a1, a1, 26
+; RV32M-NEXT:    or a0, a0, a1
+; RV32M-NEXT:    slli a1, a0, 24
+; RV32M-NEXT:    srli a1, a1, 28
+; RV32M-NEXT:    or a0, a0, a1
+; RV32M-NEXT:    not a0, a0
+; RV32M-NEXT:    andi a0, a0, 255
+; RV32M-NEXT:    lui a1, 32832
+; RV32M-NEXT:    addi a1, a1, 513
+; RV32M-NEXT:    mul a0, a0, a1
+; RV32M-NEXT:    srli a0, a0, 3
+; RV32M-NEXT:    lui a1, 69905
+; RV32M-NEXT:    addi a1, a1, 273
+; RV32M-NEXT:    and a0, a0, a1
+; RV32M-NEXT:    mul a0, a0, a1
+; RV32M-NEXT:    srli a0, a0, 28
+; RV32M-NEXT:    ret
+;
 ; RV32ZBB-LABEL: test_ctlz_i8_zero_undef:
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    andi a0, a0, 255
@@ -2290,19 +2384,19 @@ define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind {
 }
 
 define i8 @test_ctpop_i8(i8 %a) nounwind {
-; RV32_NOZBB-LABEL: test_ctpop_i8:
-; RV32_NOZBB:       # %bb.0:
-; RV32_NOZBB-NEXT:    srli a1, a0, 1
-; RV32_NOZBB-NEXT:    andi a1, a1, 85
-; RV32_NOZBB-NEXT:    sub a0, a0, a1
-; RV32_NOZBB-NEXT:    andi a1, a0, 51
-; RV32_NOZBB-NEXT:    srli a0, a0, 2
-; RV32_NOZBB-NEXT:    andi a0, a0, 51
-; RV32_NOZBB-NEXT:    add a0, a1, a0
-; RV32_NOZBB-NEXT:    srli a1, a0, 4
-; RV32_NOZBB-NEXT:    add a0, a0, a1
-; RV32_NOZBB-NEXT:    andi a0, a0, 15
-; RV32_NOZBB-NEXT:    ret
+; RV32I-LABEL: test_ctpop_i8:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    srli a1, a0, 1
+; RV32I-NEXT:    andi a1, a1, 85
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    andi a1, a0, 51
+; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    andi a0, a0, 51
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    srli a1, a0, 4
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    andi a0, a0, 15
+; RV32I-NEXT:    ret
 ;
 ; RV64NOZBB-LABEL: test_ctpop_i8:
 ; RV64NOZBB:       # %bb.0:
@@ -2318,6 +2412,20 @@ define i8 @test_ctpop_i8(i8 %a) nounwind {
 ; RV64NOZBB-NEXT:    andi a0, a0, 15
 ; RV64NOZBB-NEXT:    ret
 ;
+; RV32M-LABEL: test_ctpop_i8:
+; RV32M:       # %bb.0:
+; RV32M-NEXT:    andi a0, a0, 255
+; RV32M-NEXT:    lui a1, 32832
+; RV32M-NEXT:    addi a1, a1, 513
+; RV32M-NEXT:    mul a0, a0, a1
+; RV32M-NEXT:    srli a0, a0, 3
+; RV32M-NEXT:    lui a1, 69905
+; RV32M-NEXT:    addi a1, a1, 273
+; RV32M-NEXT:    and a0, a0, a1
+; RV32M-NEXT:    mul a0, a0, a1
+; RV32M-NEXT:    srli a0, a0, 28
+; RV32M-NEXT:    ret
+;
 ; RV32ZBB-LABEL: test_ctpop_i8:
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    andi a0, a0, 255
diff --git a/llvm/test/CodeGen/X86/ctpop-combine.ll b/llvm/test/CodeGen/X86/ctpop-combine.ll
index fba44218e0572..73152e9f909cf 100644
--- a/llvm/test/CodeGen/X86/ctpop-combine.ll
+++ b/llvm/test/CodeGen/X86/ctpop-combine.ll
@@ -88,20 +88,13 @@ define i8 @test4(i8 %x) nounwind readnone {
 ;
 ; NO-POPCOUNT-LABEL: test4:
 ; NO-POPCOUNT:       # %bb.0:
-; NO-POPCOUNT-NEXT:    movl %edi, %ecx
-; NO-POPCOUNT-NEXT:    andb $127, %cl
-; NO-POPCOUNT-NEXT:    shrb %dil
-; NO-POPCOUNT-NEXT:    andb $21, %dil
-; NO-POPCOUNT-NEXT:    subb %dil, %cl
-; NO-POPCOUNT-NEXT:    movl %ecx, %eax
-; NO-POPCOUNT-NEXT:    andb $51, %al
-; NO-POPCOUNT-NEXT:    shrb $2, %cl
-; NO-POPCOUNT-NEXT:    andb $51, %cl
-; NO-POPCOUNT-NEXT:    addb %al, %cl
-; NO-POPCOUNT-NEXT:    movl %ecx, %eax
-; NO-POPCOUNT-NEXT:    shrb $4, %al
-; NO-POPCOUNT-NEXT:    addb %cl, %al
-; NO-POPCOUNT-NEXT:    andb $15, %al
+; NO-POPCOUNT-NEXT:    andl $127, %edi
+; NO-POPCOUNT-NEXT:    imull $134480385, %edi, %eax # imm = 0x8040201
+; NO-POPCOUNT-NEXT:    shrl $3, %eax
+; NO-POPCOUNT-NEXT:    andl $286331153, %eax # imm = 0x11111111
+; NO-POPCOUNT-NEXT:    imull $286331153, %eax, %eax # imm = 0x11111111
+; NO-POPCOUNT-NEXT:    shrl $28, %eax
+; NO-POPCOUNT-NEXT:    # kill: def $al killed $al killed $eax
 ; NO-POPCOUNT-NEXT:    retq
   %x2 = and i8 %x, 127
   %count = tail call i8 @llvm.ctpop.i8(i8 %x2)
diff --git a/llvm/test/CodeGen/X86/popcnt.ll b/llvm/test/CodeGen/X86/popcnt.ll
index a9d77fd2c0a61..c8d060dfee182 100644
--- a/llvm/test/CodeGen/X86/popcnt.ll
+++ b/llvm/test/CodeGen/X86/popcnt.ll
@@ -10,37 +10,24 @@
 define i8 @cnt8(i8 %x) nounwind readnone {
 ; X86-LABEL: cnt8:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    shrb %al
-; X86-NEXT:    andb $85, %al
-; X86-NEXT:    subb %al, %cl
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andb $51, %al
-; X86-NEXT:    shrb $2, %cl
-; X86-NEXT:    andb $51, %cl
-; X86-NEXT:    addb %al, %cl
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    shrb $4, %al
-; X86-NEXT:    addb %cl, %al
-; X86-NEXT:    andb $15, %al
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull $134480385, %eax, %eax # imm = 0x8040201
+; X86-NEXT:    shrl $3, %eax
+; X86-NEXT:    andl $286331153, %eax # imm = 0x11111111
+; X86-NEXT:    imull $286331153, %eax, %eax # imm = 0x11111111
+; X86-NEXT:    shrl $28, %eax
+; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: cnt8:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shrb %al
-; X64-NEXT:    andb $85, %al
-; X64-NEXT:    subb %al, %dil
-; X64-NEXT:    movl %edi, %ecx
-; X64-NEXT:    andb $51, %cl
-; X64-NEXT:    shrb $2, %dil
-; X64-NEXT:    andb $51, %dil
-; X64-NEXT:    addb %dil, %cl
-; X64-NEXT:    movl %ecx, %eax
-; X64-NEXT:    shrb $4, %al
-; X64-NEXT:    addb %cl, %al
-; X64-NEXT:    andb $15, %al
+; X64-NEXT:    movzbl %dil, %eax
+; X64-NEXT:    imull $134480385, %eax, %eax # imm = 0x8040201
+; X64-NEXT:    shrl $3, %eax
+; X64-NEXT:    andl $286331153, %eax # imm = 0x11111111
+; X64-NEXT:    imull $286331153, %eax, %eax # imm = 0x11111111
+; X64-NEXT:    shrl $28, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 ;
 ; X86-POPCNT-LABEL: cnt8:
@@ -59,16 +46,13 @@ define i8 @cnt8(i8 %x) nounwind readnone {
 ;
 ; X64-NDD-LABEL: cnt8:
 ; X64-NDD:       # %bb.0:
-; X64-NDD-NEXT:    shrb %dil, %al
-; X64-NDD-NEXT:    andb $85, %al
-; X64-NDD-NEXT:    subb %al, %dil, %al
-; X64-NDD-NEXT:    andb $51, %al, %cl
-; X64-NDD-NEXT:    shrb $2, %al
-; X64-NDD-NEXT:    andb $51, %al
-; X64-NDD-NEXT:    addb %cl, %al
-; X64-NDD-NEXT:    shrb $4, %al, %cl
-; X64-NDD-NEXT:    addb %cl, %al
-; X64-NDD-NEXT:    andb $15, %al
+; X64-NDD-NEXT:    movzbl %dil, %eax
+; X64-NDD-NEXT:    imull $134480385, %eax, %eax # imm = 0x8040201
+; X64-NDD-NEXT:    shrl $3, %eax
+; X64-NDD-NEXT:    andl $286331153, %eax # imm = 0x11111111
+; X64-NDD-NEXT:    imull $286331153, %eax, %eax # imm = 0x11111111
+; X64-NDD-NEXT:    shrl $28, %eax
+; X64-NDD-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NDD-NEXT:    retq
   %cnt = tail call i8 @llvm.ctpop.i8(i8 %x)
   ret i8 %cnt



More information about the llvm-commits mailing list