[llvm] [RISCV] Improve fixed vector handling in isCtpopFast. (PR #158380)

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Fri Sep 12 15:54:47 PDT 2025


https://github.com/topperc created https://github.com/llvm/llvm-project/pull/158380

Previously, we considered ctpop on fixed vectors fast if either Zvbb
or Zbb was enabled. However, Zbb only helps if the vector type will
end up being scalarized.
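
The practical effect shows up in CodeGenPrepare's power-of-two test
unfolding, which consults isCtpopFast. A minimal hand-written IR
sketch of the affected pattern (mirroring the tests below; not taken
from the patch itself):

  ; With +v but without Zvbb, ctpop on <8 x i32> is no longer
  ; reported as fast, so the "population count < 2" test is
  ; unfolded into the cheaper (x & (x-1)) == 0 form.
  declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>)

  define <8 x i1> @pow2_test(<8 x i32> %x) {
    %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %x)
    %cmp = icmp ult <8 x i32> %ctpop, splat (i32 2)
    ret <8 x i1> %cmp
  }

  ; after unfolding:
  ;   %tmp1 = add <8 x i32> %x, splat (i32 -1)
  ;   %tmp2 = and <8 x i32> %x, %tmp1
  ;   %cmp  = icmp eq <8 x i32> %tmp2, zeroinitializer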

From f7f0da9f01cadbbd006d14d63f6b6c2b0dc212f5 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Fri, 12 Sep 2025 14:49:41 -0700
Subject: [PATCH 1/2] Pre-commit test

---
 .../CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll  | 436 ++++++++++++++++--
 1 file changed, 388 insertions(+), 48 deletions(-)

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll
index 44b9331fd2caf..4ece358eeddd9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
-; RUN: llc -mtriple=riscv32 -mattr=+v,+zvbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZVBB
-; RUN: llc -mtriple=riscv64 -mattr=+v,+zvbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZVBB
+; RUN: llc -mtriple=riscv32 -mattr=+zbb,+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+zbb,+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
+; RUN: llc -mtriple=riscv32 -mattr=+zbb,+v,+zvbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZVBB
+; RUN: llc -mtriple=riscv64 -mattr=+zbb,+v,+zvbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZVBB
 
 define void @ctpop_v16i8(ptr %x, ptr %y) {
 ; CHECK-LABEL: ctpop_v16i8:
@@ -341,9 +341,27 @@ define <8 x i1> @ctpop_v8i32_ult_two(ptr %x, ptr %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vadd.vi v10, v8, -1
-; CHECK-NEXT:    vand.vv v8, v8, v10
-; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10
+; CHECK-NEXT:    vand.vx v10, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
+; CHECK-NEXT:    vadd.vv v8, v10, v8
+; CHECK-NEXT:    vsrl.vi v10, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v10
+; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 4112
+; CHECK-NEXT:    addi a0, a0, 257
+; CHECK-NEXT:    vmul.vx v8, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 24
+; CHECK-NEXT:    vmsleu.vi v0, v8, 1
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: ctpop_v8i32_ult_two:
@@ -364,9 +382,27 @@ define <8 x i1> @ctpop_v8i32_ugt_one(ptr %x, ptr %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vadd.vi v10, v8, -1
-; CHECK-NEXT:    vand.vv v8, v8, v10
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10
+; CHECK-NEXT:    vand.vx v10, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
+; CHECK-NEXT:    vadd.vv v8, v10, v8
+; CHECK-NEXT:    vsrl.vi v10, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v10
+; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 4112
+; CHECK-NEXT:    addi a0, a0, 257
+; CHECK-NEXT:    vmul.vx v8, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 24
+; CHECK-NEXT:    vmsgtu.vi v0, v8, 1
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: ctpop_v8i32_ugt_one:
@@ -387,9 +423,27 @@ define <8 x i1> @ctpop_v8i32_eq_one(ptr %x, ptr %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vadd.vi v10, v8, -1
-; CHECK-NEXT:    vxor.vv v8, v8, v10
-; CHECK-NEXT:    vmsltu.vv v0, v10, v8
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10
+; CHECK-NEXT:    vand.vx v10, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
+; CHECK-NEXT:    vadd.vv v8, v10, v8
+; CHECK-NEXT:    vsrl.vi v10, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v10
+; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 4112
+; CHECK-NEXT:    addi a0, a0, 257
+; CHECK-NEXT:    vmul.vx v8, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 24
+; CHECK-NEXT:    vmseq.vi v0, v8, 1
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: ctpop_v8i32_eq_one:
@@ -410,9 +464,27 @@ define <8 x i1> @ctpop_v8i32_ne_one(ptr %x, ptr %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vadd.vi v10, v8, -1
-; CHECK-NEXT:    vxor.vv v8, v8, v10
-; CHECK-NEXT:    vmsleu.vv v0, v8, v10
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10
+; CHECK-NEXT:    vand.vx v10, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
+; CHECK-NEXT:    vadd.vv v8, v10, v8
+; CHECK-NEXT:    vsrl.vi v10, v8, 4
+; CHECK-NEXT:    vadd.vv v8, v8, v10
+; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 4112
+; CHECK-NEXT:    addi a0, a0, 257
+; CHECK-NEXT:    vmul.vx v8, v8, a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 24
+; CHECK-NEXT:    vmsne.vi v0, v8, 1
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: ctpop_v8i32_ne_one:
@@ -521,14 +593,81 @@ define void @ctpop_v4i64(ptr %x, ptr %y) {
   ret void
 }
 define <4 x i1> @ctpop_v4i64_ult_two(ptr %x, ptr %y) {
-; CHECK-LABEL: ctpop_v4i64_ult_two:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vle64.v v8, (a0)
-; CHECK-NEXT:    vadd.vi v10, v8, -1
-; CHECK-NEXT:    vand.vv v8, v8, v10
-; CHECK-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-NEXT:    ret
+; RV32-LABEL: ctpop_v4i64_ult_two:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    lui a0, 349525
+; RV32-NEXT:    addi a0, a0, 1365
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    lui a0, 209715
+; RV32-NEXT:    addi a0, a0, 819
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vi v12, v8, 1
+; RV32-NEXT:    vand.vv v10, v12, v10
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a0
+; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    addi a0, a0, -241
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v10, v8, v12
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a0
+; RV32-NEXT:    lui a0, 4112
+; RV32-NEXT:    addi a0, a0, 257
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vadd.vv v8, v10, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v10
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vmul.vv v8, v8, v10
+; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    vmsleu.vi v0, v8, 1
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: ctpop_v4i64_ult_two:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV64-NEXT:    vle64.v v8, (a0)
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addi a0, a0, 1365
+; RV64-NEXT:    addi a1, a1, 819
+; RV64-NEXT:    addi a2, a2, -241
+; RV64-NEXT:    addi a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    vsrl.vi v10, v8, 1
+; RV64-NEXT:    vand.vx v10, v10, a0
+; RV64-NEXT:    vsub.vv v8, v8, v10
+; RV64-NEXT:    vand.vx v10, v8, a1
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vadd.vv v8, v10, v8
+; RV64-NEXT:    vsrl.vi v10, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v10
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
+; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vsrl.vx v8, v8, a0
+; RV64-NEXT:    vmsleu.vi v0, v8, 1
+; RV64-NEXT:    ret
 ;
 ; ZVBB-LABEL: ctpop_v4i64_ult_two:
 ; ZVBB:       # %bb.0:
@@ -544,14 +683,81 @@ define <4 x i1> @ctpop_v4i64_ult_two(ptr %x, ptr %y) {
   ret <4 x i1> %cmp
 }
 define <4 x i1> @ctpop_v4i64_ugt_one(ptr %x, ptr %y) {
-; CHECK-LABEL: ctpop_v4i64_ugt_one:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vle64.v v8, (a0)
-; CHECK-NEXT:    vadd.vi v10, v8, -1
-; CHECK-NEXT:    vand.vv v8, v8, v10
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-NEXT:    ret
+; RV32-LABEL: ctpop_v4i64_ugt_one:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    lui a0, 349525
+; RV32-NEXT:    addi a0, a0, 1365
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    lui a0, 209715
+; RV32-NEXT:    addi a0, a0, 819
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vi v12, v8, 1
+; RV32-NEXT:    vand.vv v10, v12, v10
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a0
+; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    addi a0, a0, -241
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v10, v8, v12
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a0
+; RV32-NEXT:    lui a0, 4112
+; RV32-NEXT:    addi a0, a0, 257
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vadd.vv v8, v10, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v10
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vmul.vv v8, v8, v10
+; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    vmsgtu.vi v0, v8, 1
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: ctpop_v4i64_ugt_one:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV64-NEXT:    vle64.v v8, (a0)
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addi a0, a0, 1365
+; RV64-NEXT:    addi a1, a1, 819
+; RV64-NEXT:    addi a2, a2, -241
+; RV64-NEXT:    addi a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    vsrl.vi v10, v8, 1
+; RV64-NEXT:    vand.vx v10, v10, a0
+; RV64-NEXT:    vsub.vv v8, v8, v10
+; RV64-NEXT:    vand.vx v10, v8, a1
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vadd.vv v8, v10, v8
+; RV64-NEXT:    vsrl.vi v10, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v10
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
+; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vsrl.vx v8, v8, a0
+; RV64-NEXT:    vmsgtu.vi v0, v8, 1
+; RV64-NEXT:    ret
 ;
 ; ZVBB-LABEL: ctpop_v4i64_ugt_one:
 ; ZVBB:       # %bb.0:
@@ -567,14 +773,81 @@ define <4 x i1> @ctpop_v4i64_ugt_one(ptr %x, ptr %y) {
   ret <4 x i1> %cmp
 }
 define <4 x i1> @ctpop_v4i64_eq_one(ptr %x, ptr %y) {
-; CHECK-LABEL: ctpop_v4i64_eq_one:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vle64.v v8, (a0)
-; CHECK-NEXT:    vadd.vi v10, v8, -1
-; CHECK-NEXT:    vxor.vv v8, v8, v10
-; CHECK-NEXT:    vmsltu.vv v0, v10, v8
-; CHECK-NEXT:    ret
+; RV32-LABEL: ctpop_v4i64_eq_one:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    lui a0, 349525
+; RV32-NEXT:    addi a0, a0, 1365
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    lui a0, 209715
+; RV32-NEXT:    addi a0, a0, 819
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vi v12, v8, 1
+; RV32-NEXT:    vand.vv v10, v12, v10
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a0
+; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    addi a0, a0, -241
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v10, v8, v12
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a0
+; RV32-NEXT:    lui a0, 4112
+; RV32-NEXT:    addi a0, a0, 257
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vadd.vv v8, v10, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v10
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vmul.vv v8, v8, v10
+; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    vmseq.vi v0, v8, 1
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: ctpop_v4i64_eq_one:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV64-NEXT:    vle64.v v8, (a0)
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addi a0, a0, 1365
+; RV64-NEXT:    addi a1, a1, 819
+; RV64-NEXT:    addi a2, a2, -241
+; RV64-NEXT:    addi a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    vsrl.vi v10, v8, 1
+; RV64-NEXT:    vand.vx v10, v10, a0
+; RV64-NEXT:    vsub.vv v8, v8, v10
+; RV64-NEXT:    vand.vx v10, v8, a1
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vadd.vv v8, v10, v8
+; RV64-NEXT:    vsrl.vi v10, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v10
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
+; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vsrl.vx v8, v8, a0
+; RV64-NEXT:    vmseq.vi v0, v8, 1
+; RV64-NEXT:    ret
 ;
 ; ZVBB-LABEL: ctpop_v4i64_eq_one:
 ; ZVBB:       # %bb.0:
@@ -590,14 +863,81 @@ define <4 x i1> @ctpop_v4i64_eq_one(ptr %x, ptr %y) {
   ret <4 x i1> %cmp
 }
 define <4 x i1> @ctpop_v4i64_ne_one(ptr %x, ptr %y) {
-; CHECK-LABEL: ctpop_v4i64_ne_one:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vle64.v v8, (a0)
-; CHECK-NEXT:    vadd.vi v10, v8, -1
-; CHECK-NEXT:    vxor.vv v8, v8, v10
-; CHECK-NEXT:    vmsleu.vv v0, v8, v10
-; CHECK-NEXT:    ret
+; RV32-LABEL: ctpop_v4i64_ne_one:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    lui a0, 349525
+; RV32-NEXT:    addi a0, a0, 1365
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    lui a0, 209715
+; RV32-NEXT:    addi a0, a0, 819
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vi v12, v8, 1
+; RV32-NEXT:    vand.vv v10, v12, v10
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a0
+; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    addi a0, a0, -241
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v10, v8, v12
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a0
+; RV32-NEXT:    lui a0, 4112
+; RV32-NEXT:    addi a0, a0, 257
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vadd.vv v8, v10, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v10
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vmul.vv v8, v8, v10
+; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    vmsne.vi v0, v8, 1
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: ctpop_v4i64_ne_one:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV64-NEXT:    vle64.v v8, (a0)
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addi a0, a0, 1365
+; RV64-NEXT:    addi a1, a1, 819
+; RV64-NEXT:    addi a2, a2, -241
+; RV64-NEXT:    addi a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    vsrl.vi v10, v8, 1
+; RV64-NEXT:    vand.vx v10, v10, a0
+; RV64-NEXT:    vsub.vv v8, v8, v10
+; RV64-NEXT:    vand.vx v10, v8, a1
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vadd.vv v8, v10, v8
+; RV64-NEXT:    vsrl.vi v10, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v10
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
+; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vsrl.vx v8, v8, a0
+; RV64-NEXT:    vmsne.vi v0, v8, 1
+; RV64-NEXT:    ret
 ;
 ; ZVBB-LABEL: ctpop_v4i64_ne_one:
 ; ZVBB:       # %bb.0:

From 96b34222845f6aacbf63512004a1ccd5d68f0e60 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Fri, 12 Sep 2025 15:04:43 -0700
Subject: [PATCH 2/2] [RISCV] Improve fixed vector handling in isCtpopFast.

Previously, we considered ctpop on fixed vectors fast if either Zvbb
or Zbb was enabled. However, Zbb only helps if the vector type will
end up being scalarized.
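
For the scalarized case, the old answer remains correct. A
hand-written illustration (assumed flags; not a test from this
patch): with -mattr=+zbb and no vector extension, a small fixed
vector is scalarized and each lane lowers to a scalar cpop/cpopw,
so reporting ctpop as fast is still right.

  ; Scalarized per element; each i32 popcount becomes a Zbb cpopw
  ; instruction on riscv64, so keeping the ctpop intrinsic is fast.
  declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>)

  define <2 x i32> @ctpop_v2i32(<2 x i32> %x) {
    %r = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %x)
    ret <2 x i32> %r
  }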
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   |  19 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll  | 428 ++----------------
 .../CodeGenPrepare/unfold-pow2-test-vec.ll    |  20 +-
 3 files changed, 68 insertions(+), 399 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index f9b484b98739f..b3c1082184162 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2752,7 +2752,7 @@ bool RISCVTargetLowering::isLegalElementTypeForRVV(EVT ScalarTy) const {
   case MVT::i8:
   case MVT::i16:
   case MVT::i32:
-    return true;
+    return Subtarget.hasVInstructions();
   case MVT::i64:
     return Subtarget.hasVInstructionsI64();
   case MVT::f16:
@@ -24840,12 +24840,17 @@ bool RISCVTargetLowering::areTwoSDNodeTargetMMOFlagsMergeable(
 }
 
 bool RISCVTargetLowering::isCtpopFast(EVT VT) const {
-  if (VT.isScalableVector())
-    return isTypeLegal(VT) && Subtarget.hasStdExtZvbb();
-  if (VT.isFixedLengthVector() && Subtarget.hasStdExtZvbb())
-    return true;
-  return Subtarget.hasCPOPLike() &&
-         (VT == MVT::i32 || VT == MVT::i64 || VT.isFixedLengthVector());
+  if (VT.isVector()) {
+    EVT SVT = VT.getVectorElementType();
+    // If the element type is legal we can use cpop.v if it is enabled.
+    if (isLegalElementTypeForRVV(SVT))
+      return Subtarget.hasStdExtZvbb();
+    // If it will be scalarized, we might be able to use cpop.
+    return VT.isFixedLengthVector() && Subtarget.hasCPOPLike() &&
+           (SVT == MVT::i32 || SVT == MVT::i64);
+  }
+
+  return Subtarget.hasCPOPLike() && (VT == MVT::i32 || VT == MVT::i64);
 }
 
 unsigned RISCVTargetLowering::getCustomCtpopCost(EVT VT,
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll
index 4ece358eeddd9..474708383b4c1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll
@@ -341,27 +341,9 @@ define <8 x i1> @ctpop_v8i32_ult_two(ptr %x, ptr %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v10
-; CHECK-NEXT:    vand.vx v10, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v10, v8
-; CHECK-NEXT:    vsrl.vi v10, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 24
-; CHECK-NEXT:    vmsleu.vi v0, v8, 1
+; CHECK-NEXT:    vadd.vi v10, v8, -1
+; CHECK-NEXT:    vand.vv v8, v8, v10
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: ctpop_v8i32_ult_two:
@@ -382,27 +364,9 @@ define <8 x i1> @ctpop_v8i32_ugt_one(ptr %x, ptr %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v10
-; CHECK-NEXT:    vand.vx v10, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v10, v8
-; CHECK-NEXT:    vsrl.vi v10, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 24
-; CHECK-NEXT:    vmsgtu.vi v0, v8, 1
+; CHECK-NEXT:    vadd.vi v10, v8, -1
+; CHECK-NEXT:    vand.vv v8, v8, v10
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: ctpop_v8i32_ugt_one:
@@ -423,27 +387,9 @@ define <8 x i1> @ctpop_v8i32_eq_one(ptr %x, ptr %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v10
-; CHECK-NEXT:    vand.vx v10, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v10, v8
-; CHECK-NEXT:    vsrl.vi v10, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 24
-; CHECK-NEXT:    vmseq.vi v0, v8, 1
+; CHECK-NEXT:    vadd.vi v10, v8, -1
+; CHECK-NEXT:    vxor.vv v8, v8, v10
+; CHECK-NEXT:    vmsltu.vv v0, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: ctpop_v8i32_eq_one:
@@ -464,27 +410,9 @@ define <8 x i1> @ctpop_v8i32_ne_one(ptr %x, ptr %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsub.vv v8, v8, v10
-; CHECK-NEXT:    vand.vx v10, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 2
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vadd.vv v8, v10, v8
-; CHECK-NEXT:    vsrl.vi v10, v8, 4
-; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    vand.vx v8, v8, a0
-; CHECK-NEXT:    lui a0, 4112
-; CHECK-NEXT:    addi a0, a0, 257
-; CHECK-NEXT:    vmul.vx v8, v8, a0
-; CHECK-NEXT:    vsrl.vi v8, v8, 24
-; CHECK-NEXT:    vmsne.vi v0, v8, 1
+; CHECK-NEXT:    vadd.vi v10, v8, -1
+; CHECK-NEXT:    vxor.vv v8, v8, v10
+; CHECK-NEXT:    vmsleu.vv v0, v8, v10
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: ctpop_v8i32_ne_one:
@@ -593,81 +521,14 @@ define void @ctpop_v4i64(ptr %x, ptr %y) {
   ret void
 }
 define <4 x i1> @ctpop_v4i64_ult_two(ptr %x, ptr %y) {
-; RV32-LABEL: ctpop_v4i64_ult_two:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    lui a0, 349525
-; RV32-NEXT:    addi a0, a0, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a0
-; RV32-NEXT:    lui a0, 209715
-; RV32-NEXT:    addi a0, a0, 819
-; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vi v12, v8, 1
-; RV32-NEXT:    vand.vv v10, v12, v10
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a0
-; RV32-NEXT:    lui a0, 61681
-; RV32-NEXT:    addi a0, a0, -241
-; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v10
-; RV32-NEXT:    vand.vv v10, v8, v12
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a0
-; RV32-NEXT:    lui a0, 4112
-; RV32-NEXT:    addi a0, a0, 257
-; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vadd.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v10
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a0
-; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vmul.vv v8, v8, v10
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    vmsleu.vi v0, v8, 1
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: ctpop_v4i64_ult_two:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT:    vle64.v v8, (a0)
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    lui a1, 209715
-; RV64-NEXT:    lui a2, 61681
-; RV64-NEXT:    lui a3, 4112
-; RV64-NEXT:    addi a0, a0, 1365
-; RV64-NEXT:    addi a1, a1, 819
-; RV64-NEXT:    addi a2, a2, -241
-; RV64-NEXT:    addi a3, a3, 257
-; RV64-NEXT:    slli a4, a0, 32
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    slli a4, a1, 32
-; RV64-NEXT:    add a1, a1, a4
-; RV64-NEXT:    slli a4, a2, 32
-; RV64-NEXT:    add a2, a2, a4
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    vand.vx v10, v10, a0
-; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    vand.vx v10, v8, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    vadd.vv v8, v10, v8
-; RV64-NEXT:    vsrl.vi v10, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vmul.vx v8, v8, a3
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    vmsleu.vi v0, v8, 1
-; RV64-NEXT:    ret
+; CHECK-LABEL: ctpop_v4i64_ult_two:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vadd.vi v10, v8, -1
+; CHECK-NEXT:    vand.vv v8, v8, v10
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: ctpop_v4i64_ult_two:
 ; ZVBB:       # %bb.0:
@@ -683,81 +544,14 @@ define <4 x i1> @ctpop_v4i64_ult_two(ptr %x, ptr %y) {
   ret <4 x i1> %cmp
 }
 define <4 x i1> @ctpop_v4i64_ugt_one(ptr %x, ptr %y) {
-; RV32-LABEL: ctpop_v4i64_ugt_one:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    lui a0, 349525
-; RV32-NEXT:    addi a0, a0, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a0
-; RV32-NEXT:    lui a0, 209715
-; RV32-NEXT:    addi a0, a0, 819
-; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vi v12, v8, 1
-; RV32-NEXT:    vand.vv v10, v12, v10
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a0
-; RV32-NEXT:    lui a0, 61681
-; RV32-NEXT:    addi a0, a0, -241
-; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v10
-; RV32-NEXT:    vand.vv v10, v8, v12
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a0
-; RV32-NEXT:    lui a0, 4112
-; RV32-NEXT:    addi a0, a0, 257
-; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vadd.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v10
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a0
-; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vmul.vv v8, v8, v10
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    vmsgtu.vi v0, v8, 1
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: ctpop_v4i64_ugt_one:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT:    vle64.v v8, (a0)
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    lui a1, 209715
-; RV64-NEXT:    lui a2, 61681
-; RV64-NEXT:    lui a3, 4112
-; RV64-NEXT:    addi a0, a0, 1365
-; RV64-NEXT:    addi a1, a1, 819
-; RV64-NEXT:    addi a2, a2, -241
-; RV64-NEXT:    addi a3, a3, 257
-; RV64-NEXT:    slli a4, a0, 32
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    slli a4, a1, 32
-; RV64-NEXT:    add a1, a1, a4
-; RV64-NEXT:    slli a4, a2, 32
-; RV64-NEXT:    add a2, a2, a4
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    vand.vx v10, v10, a0
-; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    vand.vx v10, v8, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    vadd.vv v8, v10, v8
-; RV64-NEXT:    vsrl.vi v10, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vmul.vx v8, v8, a3
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    vmsgtu.vi v0, v8, 1
-; RV64-NEXT:    ret
+; CHECK-LABEL: ctpop_v4i64_ugt_one:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vadd.vi v10, v8, -1
+; CHECK-NEXT:    vand.vv v8, v8, v10
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: ctpop_v4i64_ugt_one:
 ; ZVBB:       # %bb.0:
@@ -773,81 +567,14 @@ define <4 x i1> @ctpop_v4i64_ugt_one(ptr %x, ptr %y) {
   ret <4 x i1> %cmp
 }
 define <4 x i1> @ctpop_v4i64_eq_one(ptr %x, ptr %y) {
-; RV32-LABEL: ctpop_v4i64_eq_one:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    lui a0, 349525
-; RV32-NEXT:    addi a0, a0, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a0
-; RV32-NEXT:    lui a0, 209715
-; RV32-NEXT:    addi a0, a0, 819
-; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vi v12, v8, 1
-; RV32-NEXT:    vand.vv v10, v12, v10
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a0
-; RV32-NEXT:    lui a0, 61681
-; RV32-NEXT:    addi a0, a0, -241
-; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v10
-; RV32-NEXT:    vand.vv v10, v8, v12
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a0
-; RV32-NEXT:    lui a0, 4112
-; RV32-NEXT:    addi a0, a0, 257
-; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vadd.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v10
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a0
-; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vmul.vv v8, v8, v10
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    vmseq.vi v0, v8, 1
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: ctpop_v4i64_eq_one:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT:    vle64.v v8, (a0)
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    lui a1, 209715
-; RV64-NEXT:    lui a2, 61681
-; RV64-NEXT:    lui a3, 4112
-; RV64-NEXT:    addi a0, a0, 1365
-; RV64-NEXT:    addi a1, a1, 819
-; RV64-NEXT:    addi a2, a2, -241
-; RV64-NEXT:    addi a3, a3, 257
-; RV64-NEXT:    slli a4, a0, 32
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    slli a4, a1, 32
-; RV64-NEXT:    add a1, a1, a4
-; RV64-NEXT:    slli a4, a2, 32
-; RV64-NEXT:    add a2, a2, a4
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    vand.vx v10, v10, a0
-; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    vand.vx v10, v8, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    vadd.vv v8, v10, v8
-; RV64-NEXT:    vsrl.vi v10, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vmul.vx v8, v8, a3
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    vmseq.vi v0, v8, 1
-; RV64-NEXT:    ret
+; CHECK-LABEL: ctpop_v4i64_eq_one:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vadd.vi v10, v8, -1
+; CHECK-NEXT:    vxor.vv v8, v8, v10
+; CHECK-NEXT:    vmsltu.vv v0, v10, v8
+; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: ctpop_v4i64_eq_one:
 ; ZVBB:       # %bb.0:
@@ -863,81 +590,14 @@ define <4 x i1> @ctpop_v4i64_eq_one(ptr %x, ptr %y) {
   ret <4 x i1> %cmp
 }
 define <4 x i1> @ctpop_v4i64_ne_one(ptr %x, ptr %y) {
-; RV32-LABEL: ctpop_v4i64_ne_one:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    lui a0, 349525
-; RV32-NEXT:    addi a0, a0, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a0
-; RV32-NEXT:    lui a0, 209715
-; RV32-NEXT:    addi a0, a0, 819
-; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vi v12, v8, 1
-; RV32-NEXT:    vand.vv v10, v12, v10
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a0
-; RV32-NEXT:    lui a0, 61681
-; RV32-NEXT:    addi a0, a0, -241
-; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v10
-; RV32-NEXT:    vand.vv v10, v8, v12
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a0
-; RV32-NEXT:    lui a0, 4112
-; RV32-NEXT:    addi a0, a0, 257
-; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vadd.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v10
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a0
-; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vmul.vv v8, v8, v10
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    vmsne.vi v0, v8, 1
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: ctpop_v4i64_ne_one:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT:    vle64.v v8, (a0)
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    lui a1, 209715
-; RV64-NEXT:    lui a2, 61681
-; RV64-NEXT:    lui a3, 4112
-; RV64-NEXT:    addi a0, a0, 1365
-; RV64-NEXT:    addi a1, a1, 819
-; RV64-NEXT:    addi a2, a2, -241
-; RV64-NEXT:    addi a3, a3, 257
-; RV64-NEXT:    slli a4, a0, 32
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    slli a4, a1, 32
-; RV64-NEXT:    add a1, a1, a4
-; RV64-NEXT:    slli a4, a2, 32
-; RV64-NEXT:    add a2, a2, a4
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    vand.vx v10, v10, a0
-; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    vand.vx v10, v8, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    vadd.vv v8, v10, v8
-; RV64-NEXT:    vsrl.vi v10, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vmul.vx v8, v8, a3
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    vmsne.vi v0, v8, 1
-; RV64-NEXT:    ret
+; CHECK-LABEL: ctpop_v4i64_ne_one:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vadd.vi v10, v8, -1
+; CHECK-NEXT:    vxor.vv v8, v8, v10
+; CHECK-NEXT:    vmsleu.vv v0, v8, v10
+; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: ctpop_v4i64_ne_one:
 ; ZVBB:       # %bb.0:
diff --git a/llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test-vec.ll b/llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test-vec.ll
index 9e4a10d9eb864..ed795c223ffaa 100644
--- a/llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test-vec.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test-vec.ll
@@ -18,8 +18,9 @@ define <4 x i1> @test_ult_2(<4 x i64> %x) {
 ;
 ; FAST-LABEL: define <4 x i1> @test_ult_2(
 ; FAST-SAME: <4 x i64> [[X:%.*]]) #[[ATTR0:[0-9]+]] {
-; FAST-NEXT:    [[CTPOP:%.*]] = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> [[X]])
-; FAST-NEXT:    [[CMP1:%.*]] = icmp ult <4 x i64> [[CTPOP]], splat (i64 2)
+; FAST-NEXT:    [[TMP1:%.*]] = add <4 x i64> [[X]], splat (i64 -1)
+; FAST-NEXT:    [[TMP2:%.*]] = and <4 x i64> [[X]], [[TMP1]]
+; FAST-NEXT:    [[CMP1:%.*]] = icmp eq <4 x i64> [[TMP2]], zeroinitializer
 ; FAST-NEXT:    ret <4 x i1> [[CMP1]]
 ;
   %ctpop = call <4 x i64> @llvm.ctpop(<4 x i64> %x)
@@ -37,8 +38,9 @@ define <4 x i1> @test_ugt_1(<4 x i64> %x) {
 ;
 ; FAST-LABEL: define <4 x i1> @test_ugt_1(
 ; FAST-SAME: <4 x i64> [[X:%.*]]) #[[ATTR0]] {
-; FAST-NEXT:    [[CTPOP:%.*]] = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> [[X]])
-; FAST-NEXT:    [[CMP1:%.*]] = icmp ugt <4 x i64> [[CTPOP]], splat (i64 1)
+; FAST-NEXT:    [[TMP1:%.*]] = add <4 x i64> [[X]], splat (i64 -1)
+; FAST-NEXT:    [[TMP2:%.*]] = and <4 x i64> [[X]], [[TMP1]]
+; FAST-NEXT:    [[CMP1:%.*]] = icmp ne <4 x i64> [[TMP2]], zeroinitializer
 ; FAST-NEXT:    ret <4 x i1> [[CMP1]]
 ;
   %ctpop = call <4 x i64> @llvm.ctpop(<4 x i64> %x)
@@ -56,8 +58,9 @@ define <4 x i1> @test_eq_1(<4 x i64> %x) {
 ;
 ; FAST-LABEL: define <4 x i1> @test_eq_1(
 ; FAST-SAME: <4 x i64> [[X:%.*]]) #[[ATTR0]] {
-; FAST-NEXT:    [[CTPOP:%.*]] = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> [[X]])
-; FAST-NEXT:    [[CMP1:%.*]] = icmp eq <4 x i64> [[CTPOP]], splat (i64 1)
+; FAST-NEXT:    [[TMP1:%.*]] = add <4 x i64> [[X]], splat (i64 -1)
+; FAST-NEXT:    [[TMP2:%.*]] = xor <4 x i64> [[X]], [[TMP1]]
+; FAST-NEXT:    [[CMP1:%.*]] = icmp ugt <4 x i64> [[TMP2]], [[TMP1]]
 ; FAST-NEXT:    ret <4 x i1> [[CMP1]]
 ;
   %ctpop = call <4 x i64> @llvm.ctpop(<4 x i64> %x)
@@ -75,8 +78,9 @@ define <4 x i1> @test_ne_1(<4 x i64> %x) {
 ;
 ; FAST-LABEL: define <4 x i1> @test_ne_1(
 ; FAST-SAME: <4 x i64> [[X:%.*]]) #[[ATTR0]] {
-; FAST-NEXT:    [[CTPOP:%.*]] = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> [[X]])
-; FAST-NEXT:    [[CMP1:%.*]] = icmp ne <4 x i64> [[CTPOP]], splat (i64 1)
+; FAST-NEXT:    [[TMP1:%.*]] = add <4 x i64> [[X]], splat (i64 -1)
+; FAST-NEXT:    [[TMP2:%.*]] = xor <4 x i64> [[X]], [[TMP1]]
+; FAST-NEXT:    [[CMP1:%.*]] = icmp ule <4 x i64> [[TMP2]], [[TMP1]]
 ; FAST-NEXT:    ret <4 x i1> [[CMP1]]
 ;
   %ctpop = call <4 x i64> @llvm.ctpop(<4 x i64> %x)


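To inspect the before/after lowering locally, the updated test can be
fed through llc with the RUN-line flags above, e.g. (path assumes a
checked-out llvm-project tree):

  llc -mtriple=riscv64 -mattr=+zbb,+m,+v -verify-machineinstrs \
      llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll -o -
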
