[llvm] [AArch64] Use brk{a,b} for a lane mask from cttz.elts (PR #178674)
Graham Hunter via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 29 07:19:38 PST 2026
https://github.com/huntergr-arm created https://github.com/llvm/llvm-project/pull/178674
cttz.elts is usually lowered (for SVE) to a brkb followed by a cntp. If we then want a mask based on that (say, for early exit masking) then we would use a whilelo from 0 to the result of cntp. But that just gives us the same mask as the initial brkb, so we can just remove the cntp and the whilelo.
Brka matches the extra +1 in the pattern.
Note: The fixed-length codegen exhibits the bug @fhahn found in #178644. This will need to be fixed separately.
From e6c0f867a06b739f82a05bbce2917487e2bca0b3 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Thu, 29 Jan 2026 14:46:47 +0000
Subject: [PATCH 1/2] Initial tests
---
.../CodeGen/AArch64/sve-mask-partition.ll | 268 ++++++++++++++++++
1 file changed, 268 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/sve-mask-partition.ll
diff --git a/llvm/test/CodeGen/AArch64/sve-mask-partition.ll b/llvm/test/CodeGen/AArch64/sve-mask-partition.ll
new file mode 100644
index 0000000000000..7dd768f52e21e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-mask-partition.ll
@@ -0,0 +1,268 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64 -mattr=+sve < %s | FileCheck %s
+
+;; Scalable
+define <vscale x 16 x i1> @mask_exclude_active_nxv16(<vscale x 16 x i1> %mask.in) {
+; CHECK-LABEL: mask_exclude_active_nxv16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x8, p0, p0.b
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: ret
+ %tz.elts = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> %mask.in, i1 false)
+ %mask.out = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %tz.elts)
+ ret <vscale x 16 x i1> %mask.out
+}
+
+define <vscale x 8 x i1> @mask_exclude_active_nxv8(<vscale x 8 x i1> %mask.in) {
+; CHECK-LABEL: mask_exclude_active_nxv8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x8, p0, p0.h
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ret
+ %tz.elts = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> %mask.in, i1 false)
+ %mask.out = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %tz.elts)
+ ret <vscale x 8 x i1> %mask.out
+}
+
+define <vscale x 4 x i1> @mask_exclude_active_nxv4(<vscale x 4 x i1> %mask.in) {
+; CHECK-LABEL: mask_exclude_active_nxv4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x8, p0, p0.s
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ret
+ %tz.elts = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> %mask.in, i1 false)
+ %mask.out = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %tz.elts)
+ ret <vscale x 4 x i1> %mask.out
+}
+
+define <vscale x 2 x i1> @mask_exclude_active_nxv2(<vscale x 2 x i1> %mask.in) {
+; CHECK-LABEL: mask_exclude_active_nxv2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x8, p0, p0.d
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: ret
+ %tz.elts = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> %mask.in, i1 false)
+ %mask.out = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %tz.elts)
+ ret <vscale x 2 x i1> %mask.out
+}
+
+define <vscale x 16 x i1> @mask_include_active_nxv16(<vscale x 16 x i1> %mask.in) {
+; CHECK-LABEL: mask_include_active_nxv16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x8, p0, p0.b
+; CHECK-NEXT: add x8, x8, #1
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: ret
+ %tz.elts = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> %mask.in, i1 false)
+ %inc = add i64 %tz.elts, 1
+ %mask.out = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %inc)
+ ret <vscale x 16 x i1> %mask.out
+}
+
+define <vscale x 8 x i1> @mask_include_active_nxv8(<vscale x 8 x i1> %mask.in) {
+; CHECK-LABEL: mask_include_active_nxv8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x8, p0, p0.h
+; CHECK-NEXT: add x8, x8, #1
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ret
+ %tz.elts = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> %mask.in, i1 false)
+ %inc = add i64 %tz.elts, 1
+ %mask.out = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %inc)
+ ret <vscale x 8 x i1> %mask.out
+}
+
+define <vscale x 4 x i1> @mask_include_active_nxv4(<vscale x 4 x i1> %mask.in) {
+; CHECK-LABEL: mask_include_active_nxv4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x8, p0, p0.s
+; CHECK-NEXT: add x8, x8, #1
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ret
+ %tz.elts = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> %mask.in, i1 false)
+ %inc = add i64 %tz.elts, 1
+ %mask.out = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %inc)
+ ret <vscale x 4 x i1> %mask.out
+}
+
+define <vscale x 2 x i1> @mask_include_active_nxv2(<vscale x 2 x i1> %mask.in) {
+; CHECK-LABEL: mask_include_active_nxv2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x8, p0, p0.d
+; CHECK-NEXT: add x8, x8, #1
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: ret
+ %tz.elts = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> %mask.in, i1 false)
+ %inc = add i64 %tz.elts, 1
+ %mask.out = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %inc)
+ ret <vscale x 2 x i1> %mask.out
+}
+
+;; Fixed
+define <16 x i1> @mask_exclude_active_v16(<16 x i1> %mask.in) {
+; CHECK-LABEL: mask_exclude_active_v16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.16b, v0.16b, #7
+; CHECK-NEXT: ptrue p0.b, vl16
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x8, p0, p0.b
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
+ %tz.elts = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> %mask.in, i1 false)
+ %mask.out = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 0, i64 %tz.elts)
+ ret <16 x i1> %mask.out
+}
+
+define <8 x i1> @mask_exclude_active_v8(<8 x i1> %mask.in) {
+; CHECK-LABEL: mask_exclude_active_v8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.8b, v0.8b, #7
+; CHECK-NEXT: ptrue p0.b, vl8
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x8, p0, p0.b
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
+ %tz.elts = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> %mask.in, i1 false)
+ %mask.out = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 0, i64 %tz.elts)
+ ret <8 x i1> %mask.out
+}
+
+define <4 x i1> @mask_exclude_active_v4(<4 x i1> %mask.in) {
+; CHECK-LABEL: mask_exclude_active_v4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.4h, v0.4h, #15
+; CHECK-NEXT: ptrue p0.h, vl4
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x8, p0, p0.h
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
+ %tz.elts = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> %mask.in, i1 false)
+ %mask.out = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 %tz.elts)
+ ret <4 x i1> %mask.out
+}
+
+define <2 x i1> @mask_exclude_active_v2(<2 x i1> %mask.in) {
+; CHECK-LABEL: mask_exclude_active_v2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.2s, v0.2s, #31
+; CHECK-NEXT: ptrue p0.s, vl2
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x8, p0, p0.s
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
+ %tz.elts = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> %mask.in, i1 false)
+ %mask.out = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 0, i64 %tz.elts)
+ ret <2 x i1> %mask.out
+}
+
+define <16 x i1> @mask_include_active_v16(<16 x i1> %mask.in) {
+; CHECK-LABEL: mask_include_active_v16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.16b, v0.16b, #7
+; CHECK-NEXT: ptrue p0.b, vl16
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x8, p0, p0.b
+; CHECK-NEXT: add x8, x8, #1
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
+ %tz.elts = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> %mask.in, i1 false)
+ %inc = add i64 %tz.elts, 1
+ %mask.out = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 0, i64 %inc)
+ ret <16 x i1> %mask.out
+}
+
+define <8 x i1> @mask_include_active_v8(<8 x i1> %mask.in) {
+; CHECK-LABEL: mask_include_active_v8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.8b, v0.8b, #7
+; CHECK-NEXT: ptrue p0.b, vl8
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x8, p0, p0.b
+; CHECK-NEXT: add x8, x8, #1
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
+ %tz.elts = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> %mask.in, i1 false)
+ %inc = add i64 %tz.elts, 1
+ %mask.out = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 0, i64 %inc)
+ ret <8 x i1> %mask.out
+}
+
+define <4 x i1> @mask_include_active_v4(<4 x i1> %mask.in) {
+; CHECK-LABEL: mask_include_active_v4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.4h, v0.4h, #15
+; CHECK-NEXT: ptrue p0.h, vl4
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x8, p0, p0.h
+; CHECK-NEXT: add x8, x8, #1
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
+ %tz.elts = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> %mask.in, i1 false)
+ %inc = add i64 %tz.elts, 1
+ %mask.out = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 %inc)
+ ret <4 x i1> %mask.out
+}
+
+define <2 x i1> @mask_include_active_v2(<2 x i1> %mask.in) {
+; CHECK-LABEL: mask_include_active_v2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.2s, v0.2s, #31
+; CHECK-NEXT: ptrue p0.s, vl2
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x8, p0, p0.s
+; CHECK-NEXT: add x8, x8, #1
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
+ %tz.elts = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> %mask.in, i1 false)
+ %inc = add i64 %tz.elts, 1
+ %mask.out = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 0, i64 %inc)
+ ret <2 x i1> %mask.out
+}
From 5f096a08f9c391610b8a6161f14134e5bc0b5e6a Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Thu, 29 Jan 2026 14:48:00 +0000
Subject: [PATCH 2/2] [AArch64] Use brk{a,b} for a lane mask from cttz.elts
cttz.elts is usually lowered (for SVE) to a brkb followed by a cntp.
If we then want a mask based on that (say, for early exit masking)
then we would use a whilelo from 0 to the result of cntp. But that
just gives us the same mask as the initial brkb, so we can just
remove the cntp and the whilelo.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 40 +++++++++++++
.../CodeGen/AArch64/sve-mask-partition.ll | 56 +++----------------
2 files changed, 48 insertions(+), 48 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 6c0544005e1dd..7f54a24f4da05 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -6022,6 +6022,43 @@ static SDValue optimizeIncrementingWhile(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// Match active.lane.mask(cttz.elts(x)) -> brkb(x)
+// Match active.lane.mask(add(cttz.elts(x), 1)) -> brka(x)
+static SDValue optimizeBrk(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+
+ // TODO: Do we need to do anything for fixed types post legalization?
+ if (!VT.isScalableVT())
+ return SDValue();
+
+ SDValue Op = N->getOperand(1);
+
+ // Default to brkb, switch to brka if we find a +1.
+ unsigned BrkID = Intrinsic::aarch64_sve_brkb_z;
+ if (Op->getOpcode() == ISD::ADD && isOneOrOneSplat(Op.getOperand(1))) {
+ Op = Op.getOperand(0);
+ BrkID = Intrinsic::aarch64_sve_brka_z;
+ }
+
+ if (Op.getOpcode() == AArch64ISD::CTTZ_ELTS) {
+ SDValue Mask = Op->getOperand(0);
+ SDLoc DL(N);
+ SDValue PTrue = getPTrue(DAG, DL, VT, AArch64SVEPredPattern::all);
+
+ // brk{a,b} only support .b forms, so reinterpret to make sure all our
+ // p regs will match.
+ PTrue = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, PTrue);
+ SDValue MaskR =
+ DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Mask);
+ SDValue ID = DAG.getTargetConstant(BrkID, DL, MVT::i64);
+ SDValue Brk = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv16i1, ID,
+ PTrue, MaskR);
+ return DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Brk);
+ }
+
+ return SDValue();
+}
+
// Returns a safe bitcast between two scalable vector predicates, where
// any newly created lanes from a widening bitcast are defined as zero.
static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
@@ -19379,6 +19416,9 @@ performActiveLaneMaskCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
if (DCI.isBeforeLegalize())
return SDValue();
+ if (SDValue Brk = optimizeBrk(N, DCI.DAG))
+ return Brk;
+
if (SDValue While = optimizeIncrementingWhile(N, DCI.DAG, /*IsSigned=*/false,
/*IsEqual=*/false))
return While;
diff --git a/llvm/test/CodeGen/AArch64/sve-mask-partition.ll b/llvm/test/CodeGen/AArch64/sve-mask-partition.ll
index 7dd768f52e21e..760b46bf6ebf1 100644
--- a/llvm/test/CodeGen/AArch64/sve-mask-partition.ll
+++ b/llvm/test/CodeGen/AArch64/sve-mask-partition.ll
@@ -7,8 +7,6 @@ define <vscale x 16 x i1> @mask_exclude_active_nxv16(<vscale x 16 x i1> %mask.in
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.b
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
-; CHECK-NEXT: cntp x8, p0, p0.b
-; CHECK-NEXT: whilelo p0.b, xzr, x8
; CHECK-NEXT: ret
%tz.elts = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> %mask.in, i1 false)
%mask.out = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %tz.elts)
@@ -20,8 +18,6 @@ define <vscale x 8 x i1> @mask_exclude_active_nxv8(<vscale x 8 x i1> %mask.in) {
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.h
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
-; CHECK-NEXT: cntp x8, p0, p0.h
-; CHECK-NEXT: whilelo p0.h, xzr, x8
; CHECK-NEXT: ret
%tz.elts = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> %mask.in, i1 false)
%mask.out = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %tz.elts)
@@ -33,8 +29,6 @@ define <vscale x 4 x i1> @mask_exclude_active_nxv4(<vscale x 4 x i1> %mask.in) {
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.s
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
-; CHECK-NEXT: cntp x8, p0, p0.s
-; CHECK-NEXT: whilelo p0.s, xzr, x8
; CHECK-NEXT: ret
%tz.elts = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> %mask.in, i1 false)
%mask.out = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %tz.elts)
@@ -46,8 +40,6 @@ define <vscale x 2 x i1> @mask_exclude_active_nxv2(<vscale x 2 x i1> %mask.in) {
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
-; CHECK-NEXT: cntp x8, p0, p0.d
-; CHECK-NEXT: whilelo p0.d, xzr, x8
; CHECK-NEXT: ret
%tz.elts = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> %mask.in, i1 false)
%mask.out = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %tz.elts)
@@ -58,10 +50,7 @@ define <vscale x 16 x i1> @mask_include_active_nxv16(<vscale x 16 x i1> %mask.in
; CHECK-LABEL: mask_include_active_nxv16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.b
-; CHECK-NEXT: brkb p0.b, p1/z, p0.b
-; CHECK-NEXT: cntp x8, p0, p0.b
-; CHECK-NEXT: add x8, x8, #1
-; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: brka p0.b, p1/z, p0.b
; CHECK-NEXT: ret
%tz.elts = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> %mask.in, i1 false)
%inc = add i64 %tz.elts, 1
@@ -73,10 +62,7 @@ define <vscale x 8 x i1> @mask_include_active_nxv8(<vscale x 8 x i1> %mask.in) {
; CHECK-LABEL: mask_include_active_nxv8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.h
-; CHECK-NEXT: brkb p0.b, p1/z, p0.b
-; CHECK-NEXT: cntp x8, p0, p0.h
-; CHECK-NEXT: add x8, x8, #1
-; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: brka p0.b, p1/z, p0.b
; CHECK-NEXT: ret
%tz.elts = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> %mask.in, i1 false)
%inc = add i64 %tz.elts, 1
@@ -88,10 +74,7 @@ define <vscale x 4 x i1> @mask_include_active_nxv4(<vscale x 4 x i1> %mask.in) {
; CHECK-LABEL: mask_include_active_nxv4:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.s
-; CHECK-NEXT: brkb p0.b, p1/z, p0.b
-; CHECK-NEXT: cntp x8, p0, p0.s
-; CHECK-NEXT: add x8, x8, #1
-; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: brka p0.b, p1/z, p0.b
; CHECK-NEXT: ret
%tz.elts = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> %mask.in, i1 false)
%inc = add i64 %tz.elts, 1
@@ -103,10 +86,7 @@ define <vscale x 2 x i1> @mask_include_active_nxv2(<vscale x 2 x i1> %mask.in) {
; CHECK-LABEL: mask_include_active_nxv2:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: brkb p0.b, p1/z, p0.b
-; CHECK-NEXT: cntp x8, p0, p0.d
-; CHECK-NEXT: add x8, x8, #1
-; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: brka p0.b, p1/z, p0.b
; CHECK-NEXT: ret
%tz.elts = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> %mask.in, i1 false)
%inc = add i64 %tz.elts, 1
@@ -123,8 +103,6 @@ define <16 x i1> @mask_exclude_active_v16(<16 x i1> %mask.in) {
; CHECK-NEXT: ptrue p1.b
; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
-; CHECK-NEXT: cntp x8, p0, p0.b
-; CHECK-NEXT: whilelo p0.b, xzr, x8
; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
@@ -141,8 +119,6 @@ define <8 x i1> @mask_exclude_active_v8(<8 x i1> %mask.in) {
; CHECK-NEXT: ptrue p1.b
; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
-; CHECK-NEXT: cntp x8, p0, p0.b
-; CHECK-NEXT: whilelo p0.b, xzr, x8
; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
@@ -159,8 +135,6 @@ define <4 x i1> @mask_exclude_active_v4(<4 x i1> %mask.in) {
; CHECK-NEXT: ptrue p1.h
; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
-; CHECK-NEXT: cntp x8, p0, p0.h
-; CHECK-NEXT: whilelo p0.h, xzr, x8
; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
@@ -177,8 +151,6 @@ define <2 x i1> @mask_exclude_active_v2(<2 x i1> %mask.in) {
; CHECK-NEXT: ptrue p1.s
; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
-; CHECK-NEXT: cntp x8, p0, p0.s
-; CHECK-NEXT: whilelo p0.s, xzr, x8
; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
@@ -194,10 +166,7 @@ define <16 x i1> @mask_include_active_v16(<16 x i1> %mask.in) {
; CHECK-NEXT: ptrue p0.b, vl16
; CHECK-NEXT: ptrue p1.b
; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
-; CHECK-NEXT: brkb p0.b, p1/z, p0.b
-; CHECK-NEXT: cntp x8, p0, p0.b
-; CHECK-NEXT: add x8, x8, #1
-; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: brka p0.b, p1/z, p0.b
; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
@@ -214,10 +183,7 @@ define <8 x i1> @mask_include_active_v8(<8 x i1> %mask.in) {
; CHECK-NEXT: ptrue p0.b, vl8
; CHECK-NEXT: ptrue p1.b
; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
-; CHECK-NEXT: brkb p0.b, p1/z, p0.b
-; CHECK-NEXT: cntp x8, p0, p0.b
-; CHECK-NEXT: add x8, x8, #1
-; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: brka p0.b, p1/z, p0.b
; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
@@ -234,10 +200,7 @@ define <4 x i1> @mask_include_active_v4(<4 x i1> %mask.in) {
; CHECK-NEXT: ptrue p0.h, vl4
; CHECK-NEXT: ptrue p1.h
; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0
-; CHECK-NEXT: brkb p0.b, p1/z, p0.b
-; CHECK-NEXT: cntp x8, p0, p0.h
-; CHECK-NEXT: add x8, x8, #1
-; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: brka p0.b, p1/z, p0.b
; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
@@ -254,10 +217,7 @@ define <2 x i1> @mask_include_active_v2(<2 x i1> %mask.in) {
; CHECK-NEXT: ptrue p0.s, vl2
; CHECK-NEXT: ptrue p1.s
; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
-; CHECK-NEXT: brkb p0.b, p1/z, p0.b
-; CHECK-NEXT: cntp x8, p0, p0.s
-; CHECK-NEXT: add x8, x8, #1
-; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: brka p0.b, p1/z, p0.b
; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
More information about the llvm-commits
mailing list