[llvm] [LLVM][CodeGen][AArch64] Improve lowering of boolean vector popcount operations. (PR #166401)
Paul Walker via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 4 09:54:47 PST 2025
https://github.com/paulwalker-arm updated https://github.com/llvm/llvm-project/pull/166401
From 8a463bd1f03cfeae456d613934cdb2b539937b27 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Tue, 4 Nov 2025 14:42:33 +0000
Subject: [PATCH 1/3] Add tests showing codegen for vector mask popcount idiom.
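The idiom under test is a popcount of the bits of a vector comparison mask; a representative pattern, taken verbatim from the first test below, is:
  %mask = icmp slt <8 x i8> %a, %b
  %t1 = bitcast <8 x i1> %mask to i8
  %t2 = call i8 @llvm.ctpop(i8 %t1)
  %t3 = zext i8 %t2 to i32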
---
llvm/test/CodeGen/AArch64/popcount_vmask.ll | 387 ++++++++++++++++++++
1 file changed, 387 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/popcount_vmask.ll
diff --git a/llvm/test/CodeGen/AArch64/popcount_vmask.ll b/llvm/test/CodeGen/AArch64/popcount_vmask.ll
new file mode 100644
index 0000000000000..ae8a6667fdc74
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/popcount_vmask.ll
@@ -0,0 +1,387 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define i32 @vmask_popcount_i32_v8i8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: vmask_popcount_i32_v8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI0_0
+; CHECK-NEXT: cmgt v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI0_0]
+; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: addv b0, v0.8b
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: cnt v0.8b, v0.8b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+ %mask = icmp slt <8 x i8> %a, %b
+ %t1 = bitcast <8 x i1> %mask to i8
+ %t2 = call i8 @llvm.ctpop(i8 %t1)
+ %t3 = zext i8 %t2 to i32
+ ret i32 %t3
+}
+
+define i32 @vmask_popcount_i32_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: vmask_popcount_i32_v16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI1_0
+; CHECK-NEXT: cmgt v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0]
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: zip1 v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: addv h0, v0.8h
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: cnt v0.8b, v0.8b
+; CHECK-NEXT: addv b0, v0.8b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+ %mask = icmp slt <16 x i8> %a, %b
+ %t1 = bitcast <16 x i1> %mask to i16
+ %t2 = call i16 @llvm.ctpop(i16 %t1)
+ %t3 = zext i16 %t2 to i32
+ ret i32 %t3
+}
+
+define i32 @vmask_popcount_i32_v4i16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: vmask_popcount_i32_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI2_0
+; CHECK-NEXT: cmgt v0.4h, v1.4h, v0.4h
+; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI2_0]
+; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: addv h0, v0.4h
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: and w8, w8, #0xf
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: cnt v0.8b, v0.8b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+ %mask = icmp slt <4 x i16> %a, %b
+ %t1 = bitcast <4 x i1> %mask to i4
+ %t2 = call i4 @llvm.ctpop(i4 %t1)
+ %t3 = zext i4 %t2 to i32
+ ret i32 %t3
+}
+
+define i32 @vmask_popcount_i32_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: vmask_popcount_i32_v8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI3_0
+; CHECK-NEXT: cmgt v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: addv h0, v0.8h
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: and w8, w8, #0xff
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: cnt v0.8b, v0.8b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+ %mask = icmp slt <8 x i16> %a, %b
+ %t1 = bitcast <8 x i1> %mask to i8
+ %t2 = call i8 @llvm.ctpop(i8 %t1)
+ %t3 = zext i8 %t2 to i32
+ ret i32 %t3
+}
+
+define i32 @vmask_popcount_i32_v2i32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK-LABEL: vmask_popcount_i32_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI4_0
+; CHECK-NEXT: cmgt v0.2s, v1.2s, v0.2s
+; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI4_0]
+; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: and w8, w8, #0x3
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: cnt v0.8b, v0.8b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+ %mask = icmp slt <2 x i32> %a, %b
+ %t1 = bitcast <2 x i1> %mask to i2
+ %t2 = call i2 @llvm.ctpop(i2 %t1)
+ %t3 = zext i2 %t2 to i32
+ ret i32 %t3
+}
+
+define i32 @vmask_popcount_i32_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: vmask_popcount_i32_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI5_0
+; CHECK-NEXT: cmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_0]
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: addv s0, v0.4s
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: and w8, w8, #0xf
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: cnt v0.8b, v0.8b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+ %mask = icmp slt <4 x i32> %a, %b
+ %t1 = bitcast <4 x i1> %mask to i4
+ %t2 = call i4 @llvm.ctpop(i4 %t1)
+ %t3 = zext i4 %t2 to i32
+ ret i32 %t3
+}
+
+define i32 @vmask_popcount_i32_v1i64(<1 x i64> %a, <1 x i64> %b) {
+; CHECK-LABEL: vmask_popcount_i32_v1i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: fmov x9, d0
+; CHECK-NEXT: cmp x9, x8
+; CHECK-NEXT: cset w0, lt
+; CHECK-NEXT: ret
+ %mask = icmp slt <1 x i64> %a, %b
+ %t1 = bitcast <1 x i1> %mask to i1
+ %t2 = call i1 @llvm.ctpop(i1 %t1)
+ %t3 = zext i1 %t2 to i32
+ ret i32 %t3
+}
+
+define i32 @vmask_popcount_i32_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: vmask_popcount_i32_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI7_0
+; CHECK-NEXT: cmgt v0.2d, v1.2d, v0.2d
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI7_0]
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: addp d0, v0.2d
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: and w8, w8, #0x3
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: cnt v0.8b, v0.8b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+ %mask = icmp slt <2 x i64> %a, %b
+ %t1 = bitcast <2 x i1> %mask to i2
+ %t2 = call i2 @llvm.ctpop(i2 %t1)
+ %t3 = zext i2 %t2 to i32
+ ret i32 %t3
+}
+
+define i64 @vmask_popcount_i64_v8i8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: vmask_popcount_i64_v8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI8_0
+; CHECK-NEXT: cmgt v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI8_0]
+; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: addv b0, v0.8b
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: cnt v0.8b, v0.8b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+ %mask = icmp slt <8 x i8> %a, %b
+ %t1 = bitcast <8 x i1> %mask to i8
+ %t2 = call i8 @llvm.ctpop(i8 %t1)
+ %t3 = zext i8 %t2 to i64
+ ret i64 %t3
+}
+
+define i64 @vmask_popcount_i64_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: vmask_popcount_i64_v16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI9_0
+; CHECK-NEXT: cmgt v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_0]
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: zip1 v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: addv h0, v0.8h
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: cnt v0.8b, v0.8b
+; CHECK-NEXT: addv b0, v0.8b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+ %mask = icmp slt <16 x i8> %a, %b
+ %t1 = bitcast <16 x i1> %mask to i16
+ %t2 = call i16 @llvm.ctpop(i16 %t1)
+ %t3 = zext i16 %t2 to i64
+ ret i64 %t3
+}
+
+define i64 @vmask_popcount_i64_v4i16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: vmask_popcount_i64_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI10_0
+; CHECK-NEXT: cmgt v0.4h, v1.4h, v0.4h
+; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI10_0]
+; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: addv h0, v0.4h
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: and w8, w8, #0xf
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: cnt v0.8b, v0.8b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+ %mask = icmp slt <4 x i16> %a, %b
+ %t1 = bitcast <4 x i1> %mask to i4
+ %t2 = call i4 @llvm.ctpop(i4 %t1)
+ %t3 = zext i4 %t2 to i64
+ ret i64 %t3
+}
+
+define i64 @vmask_popcount_i64_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: vmask_popcount_i64_v8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI11_0
+; CHECK-NEXT: cmgt v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI11_0]
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: addv h0, v0.8h
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: and w8, w8, #0xff
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: cnt v0.8b, v0.8b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+ %mask = icmp slt <8 x i16> %a, %b
+ %t1 = bitcast <8 x i1> %mask to i8
+ %t2 = call i8 @llvm.ctpop(i8 %t1)
+ %t3 = zext i8 %t2 to i64
+ ret i64 %t3
+}
+
+define i64 @vmask_popcount_i64_v2i32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK-LABEL: vmask_popcount_i64_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI12_0
+; CHECK-NEXT: cmgt v0.2s, v1.2s, v0.2s
+; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI12_0]
+; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: and w8, w8, #0x3
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: cnt v0.8b, v0.8b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+ %mask = icmp slt <2 x i32> %a, %b
+ %t1 = bitcast <2 x i1> %mask to i2
+ %t2 = call i2 @llvm.ctpop(i2 %t1)
+ %t3 = zext i2 %t2 to i64
+ ret i64 %t3
+}
+
+define i64 @vmask_popcount_i64_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: vmask_popcount_i64_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI13_0
+; CHECK-NEXT: cmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI13_0]
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: addv s0, v0.4s
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: and w8, w8, #0xf
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: cnt v0.8b, v0.8b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+ %mask = icmp slt <4 x i32> %a, %b
+ %t1 = bitcast <4 x i1> %mask to i4
+ %t2 = call i4 @llvm.ctpop(i4 %t1)
+ %t3 = zext i4 %t2 to i64
+ ret i64 %t3
+}
+
+define i64 @vmask_popcount_i64_v1i64(<1 x i64> %a, <1 x i64> %b) {
+; CHECK-LABEL: vmask_popcount_i64_v1i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: fmov x9, d0
+; CHECK-NEXT: cmp x9, x8
+; CHECK-NEXT: cset w0, lt
+; CHECK-NEXT: ret
+ %mask = icmp slt <1 x i64> %a, %b
+ %t1 = bitcast <1 x i1> %mask to i1
+ %t2 = call i1 @llvm.ctpop(i1 %t1)
+ %t3 = zext i1 %t2 to i64
+ ret i64 %t3
+}
+
+define i64 @vmask_popcount_i64_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: vmask_popcount_i64_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI15_0
+; CHECK-NEXT: cmgt v0.2d, v1.2d, v0.2d
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_0]
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: addp d0, v0.2d
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: and w8, w8, #0x3
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: cnt v0.8b, v0.8b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+ %mask = icmp slt <2 x i64> %a, %b
+ %t1 = bitcast <2 x i1> %mask to i2
+ %t2 = call i2 @llvm.ctpop(i2 %t1)
+ %t3 = zext i2 %t2 to i64
+ ret i64 %t3
+}
+
+define i32 @non_vmask_popcount_1(half %a) {
+; CHECK-LABEL: non_vmask_popcount_1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: and w8, w8, #0xffff
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: cnt v0.8b, v0.8b
+; CHECK-NEXT: addv b0, v0.8b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+ %t1 = bitcast half %a to i16
+ %t2 = call i16 @llvm.ctpop(i16 %t1)
+ %t3 = zext i16 %t2 to i32
+ ret i32 %t3
+}
+
+define i32 @non_vmask_popcount_2(<8 x i16> %a) {
+; CHECK-LABEL: non_vmask_popcount_2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: xtn v0.8b, v0.8h
+; CHECK-NEXT: umov w8, v0.b[0]
+; CHECK-NEXT: umov w9, v0.b[1]
+; CHECK-NEXT: umov w10, v0.b[2]
+; CHECK-NEXT: and w8, w8, #0x3
+; CHECK-NEXT: bfi w8, w9, #2, #2
+; CHECK-NEXT: umov w9, v0.b[3]
+; CHECK-NEXT: bfi w8, w10, #4, #2
+; CHECK-NEXT: umov w10, v0.b[4]
+; CHECK-NEXT: bfi w8, w9, #6, #2
+; CHECK-NEXT: umov w9, v0.b[5]
+; CHECK-NEXT: bfi w8, w10, #8, #2
+; CHECK-NEXT: umov w10, v0.b[6]
+; CHECK-NEXT: bfi w8, w9, #10, #2
+; CHECK-NEXT: umov w9, v0.b[7]
+; CHECK-NEXT: bfi w8, w10, #12, #2
+; CHECK-NEXT: orr w8, w8, w9, lsl #14
+; CHECK-NEXT: and w8, w8, #0xffff
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: cnt v0.8b, v0.8b
+; CHECK-NEXT: addv b0, v0.8b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+ %mask = trunc <8 x i16> %a to <8 x i2>
+ %t1 = bitcast <8 x i2> %mask to i16
+ %t2 = call i16 @llvm.ctpop(i16 %t1)
+ %t3 = zext i16 %t2 to i32
+ ret i32 %t3
+}
From 601de5e618ffc707a2f7f9691080df5c55dd8169 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Tue, 4 Nov 2025 14:49:19 +0000
Subject: [PATCH 2/3] [LLVM][CodeGen][AArch64] Improve lowering of boolean
vector popcount operations.
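Before legalization, a CTPOP whose operand is a zero-extended bitcast of a fixed-length i1 vector mask is rewritten as the negated signed add-reduction of that mask: each true lane sign-extends to -1, so the lane sum is minus the popcount and negating it recovers the count. In rough SelectionDAG terms (an illustrative sketch of the combine, not the exact node sequence emitted):
  ctpop(zext(bitcast(<N x i1> %mask)))
    -> neg(vecreduce_add(sext <N x i1> %mask to <N x iVT>))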
---
.../Target/AArch64/AArch64ISelLowering.cpp | 35 ++++-
llvm/test/CodeGen/AArch64/popcount_vmask.ll | 148 +++++-------------
2 files changed, 72 insertions(+), 111 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 60aa61e993b26..ac55b9e368a31 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -50,6 +50,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SDPatternMatch.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
@@ -104,7 +105,6 @@
#include <vector>
using namespace llvm;
-using namespace llvm::PatternMatch;
#define DEBUG_TYPE "aarch64-lower"
@@ -1180,6 +1180,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::VECTOR_DEINTERLEAVE);
+ setTargetDAGCombine(ISD::CTPOP);
// In case of strict alignment, avoid an excessive number of byte wide stores.
MaxStoresPerMemsetOptSize = 8;
@@ -17591,6 +17592,7 @@ bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
// udot instruction.
if (SrcWidth * 4 <= DstWidth) {
if (all_of(I->users(), [&](auto *U) {
+ using namespace llvm::PatternMatch;
auto *SingleUser = cast<Instruction>(&*U);
if (match(SingleUser, m_c_Mul(m_Specific(I), m_SExt(m_Value()))))
return true;
@@ -17862,6 +17864,7 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
// into shift / and masks. For the moment we do this just for uitofp (not
// zext) to avoid issues with widening instructions.
if (Shuffles.size() == 4 && all_of(Shuffles, [](ShuffleVectorInst *SI) {
+ using namespace llvm::PatternMatch;
return SI->hasOneUse() && match(SI->user_back(), m_UIToFP(m_Value())) &&
SI->getType()->getScalarSizeInBits() * 4 ==
SI->user_back()->getType()->getScalarSizeInBits();
@@ -27878,6 +27881,34 @@ static SDValue performRNDRCombine(SDNode *N, SelectionDAG &DAG) {
{A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
}
+static SDValue performCTPOPCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG) {
+ using namespace llvm::SDPatternMatch;
+ if (!DCI.isBeforeLegalize())
+ return SDValue();
+
+ // ctpop(zext(bitcast(vector_mask))) -> neg(signed_reduce_add(vector_mask))
+ SDValue Mask;
+ if (sd_match(N->getOperand(0), m_ZExt(m_BitCast(m_Value(Mask))))) {
+ EVT VT = N->getValueType(0);
+ EVT MaskVT = Mask.getValueType();
+
+ if (VT.isVector() || !MaskVT.isFixedLengthVector() ||
+ MaskVT.getVectorElementType() != MVT::i1)
+ return SDValue();
+
+ SDLoc DL(N);
+ // Sign extend to best fit ZeroOrNegativeOneBooleanContent.
+ SDValue ExtMask =
+ DAG.getNode(ISD::SIGN_EXTEND, DL, MaskVT.changeElementType(VT), Mask);
+ SDValue NegPopCount = DAG.getNode(ISD::VECREDUCE_ADD, DL, VT, ExtMask);
+ return DAG.getNegative(NegPopCount, DL, VT);
+ }
+
+ return SDValue();
+}
+
SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -28223,6 +28254,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performScalarToVectorCombine(N, DCI, DAG);
case ISD::SHL:
return performSHLCombine(N, DCI, DAG);
+ case ISD::CTPOP:
+ return performCTPOPCombine(N, DCI, DAG);
}
return SDValue();
}
diff --git a/llvm/test/CodeGen/AArch64/popcount_vmask.ll b/llvm/test/CodeGen/AArch64/popcount_vmask.ll
index ae8a6667fdc74..e784ead2c9e5a 100644
--- a/llvm/test/CodeGen/AArch64/popcount_vmask.ll
+++ b/llvm/test/CodeGen/AArch64/popcount_vmask.ll
@@ -6,15 +6,11 @@ target triple = "aarch64-unknown-linux-gnu"
define i32 @vmask_popcount_i32_v8i8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: vmask_popcount_i32_v8i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI0_0
; CHECK-NEXT: cmgt v0.8b, v1.8b, v0.8b
-; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI0_0]
-; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: addv b0, v0.8b
+; CHECK-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-NEXT: saddlv s0, v0.8h
; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: cnt v0.8b, v0.8b
-; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: neg w0, w8
; CHECK-NEXT: ret
%mask = icmp slt <8 x i8> %a, %b
%t1 = bitcast <8 x i1> %mask to i8
@@ -26,18 +22,15 @@ define i32 @vmask_popcount_i32_v8i8(<8 x i8> %a, <8 x i8> %b) {
define i32 @vmask_popcount_i32_v16i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: vmask_popcount_i32_v16i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI1_0
; CHECK-NEXT: cmgt v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0]
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: zip1 v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: addv h0, v0.8h
+; CHECK-NEXT: sshll2 v1.8h, v0.16b, #0
+; CHECK-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-NEXT: saddl2 v2.4s, v0.8h, v1.8h
+; CHECK-NEXT: saddl v0.4s, v0.4h, v1.4h
+; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: cnt v0.8b, v0.8b
-; CHECK-NEXT: addv b0, v0.8b
-; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: neg w0, w8
; CHECK-NEXT: ret
%mask = icmp slt <16 x i8> %a, %b
%t1 = bitcast <16 x i1> %mask to i16
@@ -49,16 +42,10 @@ define i32 @vmask_popcount_i32_v16i8(<16 x i8> %a, <16 x i8> %b) {
define i32 @vmask_popcount_i32_v4i16(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: vmask_popcount_i32_v4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI2_0
; CHECK-NEXT: cmgt v0.4h, v1.4h, v0.4h
-; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI2_0]
-; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: addv h0, v0.4h
+; CHECK-NEXT: saddlv s0, v0.4h
; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: and w8, w8, #0xf
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: cnt v0.8b, v0.8b
-; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: neg w0, w8
; CHECK-NEXT: ret
%mask = icmp slt <4 x i16> %a, %b
%t1 = bitcast <4 x i1> %mask to i4
@@ -70,16 +57,10 @@ define i32 @vmask_popcount_i32_v4i16(<4 x i16> %a, <4 x i16> %b) {
define i32 @vmask_popcount_i32_v8i16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: vmask_popcount_i32_v8i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI3_0
; CHECK-NEXT: cmgt v0.8h, v1.8h, v0.8h
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: addv h0, v0.8h
+; CHECK-NEXT: saddlv s0, v0.8h
; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: and w8, w8, #0xff
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: cnt v0.8b, v0.8b
-; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: neg w0, w8
; CHECK-NEXT: ret
%mask = icmp slt <8 x i16> %a, %b
%t1 = bitcast <8 x i1> %mask to i8
@@ -91,16 +72,10 @@ define i32 @vmask_popcount_i32_v8i16(<8 x i16> %a, <8 x i16> %b) {
define i32 @vmask_popcount_i32_v2i32(<2 x i32> %a, <2 x i32> %b) {
; CHECK-LABEL: vmask_popcount_i32_v2i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI4_0
; CHECK-NEXT: cmgt v0.2s, v1.2s, v0.2s
-; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI4_0]
-; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s
; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: and w8, w8, #0x3
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: cnt v0.8b, v0.8b
-; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: neg w0, w8
; CHECK-NEXT: ret
%mask = icmp slt <2 x i32> %a, %b
%t1 = bitcast <2 x i1> %mask to i2
@@ -112,16 +87,10 @@ define i32 @vmask_popcount_i32_v2i32(<2 x i32> %a, <2 x i32> %b) {
define i32 @vmask_popcount_i32_v4i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: vmask_popcount_i32_v4i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI5_0
; CHECK-NEXT: cmgt v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_0]
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: and w8, w8, #0xf
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: cnt v0.8b, v0.8b
-; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: neg w0, w8
; CHECK-NEXT: ret
%mask = icmp slt <4 x i32> %a, %b
%t1 = bitcast <4 x i1> %mask to i4
@@ -150,16 +119,11 @@ define i32 @vmask_popcount_i32_v1i64(<1 x i64> %a, <1 x i64> %b) {
define i32 @vmask_popcount_i32_v2i64(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: vmask_popcount_i32_v2i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI7_0
; CHECK-NEXT: cmgt v0.2d, v1.2d, v0.2d
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI7_0]
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: addp d0, v0.2d
+; CHECK-NEXT: xtn v0.2s, v0.2d
+; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s
; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: and w8, w8, #0x3
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: cnt v0.8b, v0.8b
-; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: neg w0, w8
; CHECK-NEXT: ret
%mask = icmp slt <2 x i64> %a, %b
%t1 = bitcast <2 x i1> %mask to i2
@@ -171,15 +135,11 @@ define i32 @vmask_popcount_i32_v2i64(<2 x i64> %a, <2 x i64> %b) {
define i64 @vmask_popcount_i64_v8i8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: vmask_popcount_i64_v8i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI8_0
; CHECK-NEXT: cmgt v0.8b, v1.8b, v0.8b
-; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI8_0]
-; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: addv b0, v0.8b
+; CHECK-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-NEXT: saddlv s0, v0.8h
; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: cnt v0.8b, v0.8b
-; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: neg w0, w8
; CHECK-NEXT: ret
%mask = icmp slt <8 x i8> %a, %b
%t1 = bitcast <8 x i1> %mask to i8
@@ -191,18 +151,15 @@ define i64 @vmask_popcount_i64_v8i8(<8 x i8> %a, <8 x i8> %b) {
define i64 @vmask_popcount_i64_v16i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: vmask_popcount_i64_v16i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI9_0
; CHECK-NEXT: cmgt v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_0]
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: zip1 v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: addv h0, v0.8h
+; CHECK-NEXT: sshll2 v1.8h, v0.16b, #0
+; CHECK-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-NEXT: saddl2 v2.4s, v0.8h, v1.8h
+; CHECK-NEXT: saddl v0.4s, v0.4h, v1.4h
+; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: cnt v0.8b, v0.8b
-; CHECK-NEXT: addv b0, v0.8b
-; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: neg w0, w8
; CHECK-NEXT: ret
%mask = icmp slt <16 x i8> %a, %b
%t1 = bitcast <16 x i1> %mask to i16
@@ -214,16 +171,10 @@ define i64 @vmask_popcount_i64_v16i8(<16 x i8> %a, <16 x i8> %b) {
define i64 @vmask_popcount_i64_v4i16(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: vmask_popcount_i64_v4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI10_0
; CHECK-NEXT: cmgt v0.4h, v1.4h, v0.4h
-; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI10_0]
-; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: addv h0, v0.4h
+; CHECK-NEXT: saddlv s0, v0.4h
; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: and w8, w8, #0xf
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: cnt v0.8b, v0.8b
-; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: neg w0, w8
; CHECK-NEXT: ret
%mask = icmp slt <4 x i16> %a, %b
%t1 = bitcast <4 x i1> %mask to i4
@@ -235,16 +186,10 @@ define i64 @vmask_popcount_i64_v4i16(<4 x i16> %a, <4 x i16> %b) {
define i64 @vmask_popcount_i64_v8i16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: vmask_popcount_i64_v8i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI11_0
; CHECK-NEXT: cmgt v0.8h, v1.8h, v0.8h
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI11_0]
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: addv h0, v0.8h
+; CHECK-NEXT: saddlv s0, v0.8h
; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: and w8, w8, #0xff
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: cnt v0.8b, v0.8b
-; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: neg w0, w8
; CHECK-NEXT: ret
%mask = icmp slt <8 x i16> %a, %b
%t1 = bitcast <8 x i1> %mask to i8
@@ -256,16 +201,10 @@ define i64 @vmask_popcount_i64_v8i16(<8 x i16> %a, <8 x i16> %b) {
define i64 @vmask_popcount_i64_v2i32(<2 x i32> %a, <2 x i32> %b) {
; CHECK-LABEL: vmask_popcount_i64_v2i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI12_0
; CHECK-NEXT: cmgt v0.2s, v1.2s, v0.2s
-; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI12_0]
-; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s
; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: and w8, w8, #0x3
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: cnt v0.8b, v0.8b
-; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: neg w0, w8
; CHECK-NEXT: ret
%mask = icmp slt <2 x i32> %a, %b
%t1 = bitcast <2 x i1> %mask to i2
@@ -277,16 +216,10 @@ define i64 @vmask_popcount_i64_v2i32(<2 x i32> %a, <2 x i32> %b) {
define i64 @vmask_popcount_i64_v4i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: vmask_popcount_i64_v4i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI13_0
; CHECK-NEXT: cmgt v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI13_0]
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: and w8, w8, #0xf
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: cnt v0.8b, v0.8b
-; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: neg w0, w8
; CHECK-NEXT: ret
%mask = icmp slt <4 x i32> %a, %b
%t1 = bitcast <4 x i1> %mask to i4
@@ -315,16 +248,11 @@ define i64 @vmask_popcount_i64_v1i64(<1 x i64> %a, <1 x i64> %b) {
define i64 @vmask_popcount_i64_v2i64(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: vmask_popcount_i64_v2i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI15_0
; CHECK-NEXT: cmgt v0.2d, v1.2d, v0.2d
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_0]
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: addp d0, v0.2d
+; CHECK-NEXT: xtn v0.2s, v0.2d
+; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s
; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: and w8, w8, #0x3
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: cnt v0.8b, v0.8b
-; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: neg w0, w8
; CHECK-NEXT: ret
%mask = icmp slt <2 x i64> %a, %b
%t1 = bitcast <2 x i1> %mask to i2
From 8c635b3884e7a3978a90d33ed127cc05d7629526 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Tue, 4 Nov 2025 17:51:57 +0000
Subject: [PATCH 3/3] Reformat as an early exit.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 29 ++++++++++---------
1 file changed, 15 insertions(+), 14 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index ac55b9e368a31..b6342220a3159 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -27890,23 +27890,24 @@ static SDValue performCTPOPCombine(SDNode *N,
// ctpop(zext(bitcast(vector_mask))) -> neg(signed_reduce_add(vector_mask))
SDValue Mask;
- if (sd_match(N->getOperand(0), m_ZExt(m_BitCast(m_Value(Mask))))) {
- EVT VT = N->getValueType(0);
- EVT MaskVT = Mask.getValueType();
+ if (!sd_match(N->getOperand(0), m_ZExt(m_BitCast(m_Value(Mask)))))
+ return SDValue();
- if (VT.isVector() || !MaskVT.isFixedLengthVector() ||
- MaskVT.getVectorElementType() != MVT::i1)
- return SDValue();
+ EVT VT = N->getValueType(0);
+ EVT MaskVT = Mask.getValueType();
- SDLoc DL(N);
- // Sign extend to best fit ZeroOrNegativeOneBooleanContent.
- SDValue ExtMask =
- DAG.getNode(ISD::SIGN_EXTEND, DL, MaskVT.changeElementType(VT), Mask);
- SDValue NegPopCount = DAG.getNode(ISD::VECREDUCE_ADD, DL, VT, ExtMask);
- return DAG.getNegative(NegPopCount, DL, VT);
- }
+ if (VT.isVector() || !MaskVT.isFixedLengthVector() ||
+ MaskVT.getVectorElementType() != MVT::i1)
+ return SDValue();
- return SDValue();
+ EVT ReduceInVT =
+ EVT::getVectorVT(*DAG.getContext(), VT, MaskVT.getVectorElementCount());
+
+ SDLoc DL(N);
+ // Sign extend to best fit ZeroOrNegativeOneBooleanContent.
+ SDValue ExtMask = DAG.getNode(ISD::SIGN_EXTEND, DL, ReduceInVT, Mask);
+ SDValue NegPopCount = DAG.getNode(ISD::VECREDUCE_ADD, DL, VT, ExtMask);
+ return DAG.getNegative(NegPopCount, DL, VT);
}
SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,