[llvm] [AArch64] Fix active.lane.mask(0, cttz.elts(x)) -> 'brkb' transform (PR #180177)

Sander de Smalen via llvm-commits llvm-commits at lists.llvm.org
Tue Feb 10 09:30:11 PST 2026


https://github.com/sdesmalen-arm updated https://github.com/llvm/llvm-project/pull/180177

From dc75c215ebb0c84c2c2c704d468513492f852240 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Fri, 6 Feb 2026 10:22:44 +0000
Subject: [PATCH 1/4] Pre-commit test

---
 .../CodeGen/AArch64/sve-mask-partition.ll     | 52 +++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/llvm/test/CodeGen/AArch64/sve-mask-partition.ll b/llvm/test/CodeGen/AArch64/sve-mask-partition.ll
index 8b712bd7e42a7..e2e7d80545dc8 100644
--- a/llvm/test/CodeGen/AArch64/sve-mask-partition.ll
+++ b/llvm/test/CodeGen/AArch64/sve-mask-partition.ll
@@ -558,3 +558,55 @@ define <vscale x 16 x i1> @mask_exclude_active_nxv16_nonzero_lower_bound(<vscale
   %mask.out = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 1, i64 %tz.elts)
   ret <vscale x 16 x i1> %mask.out
 }
+
+define <vscale x 4 x i1> @mask_exclude_active_narrower_result_type(<vscale x 8 x i1> %mask.in) {
+; CHECK-LABEL: mask_exclude_active_narrower_result_type:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.h
+; CHECK-NEXT:    brkb p0.b, p1/z, p0.b
+; CHECK-NEXT:    ret
+  %tz.elts = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 8 x i1> %mask.in, i1 false)
+  %mask.out = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %tz.elts)
+  ret <vscale x 4 x i1> %mask.out
+}
+
+define <vscale x 16 x i1> @mask_exclude_active_wider_result_type(<vscale x 8 x i1> %mask.in) {
+; CHECK-LABEL: mask_exclude_active_wider_result_type:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.h
+; CHECK-NEXT:    brkb p0.b, p1/z, p0.b
+; CHECK-NEXT:    ret
+  %tz.elts = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 8 x i1> %mask.in, i1 false)
+  %mask.out = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %tz.elts)
+  ret <vscale x 16 x i1> %mask.out
+}
+
+define <4 x i1> @mask_exclude_active_narrower_result_type_fixed(<8 x i1> %mask.in) {
+; CHECK-LABEL: mask_exclude_active_narrower_result_type_fixed:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v0.8b, v0.8b, #7
+; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    cmpne p1.b, p0/z, z0.b, #0
+; CHECK-NEXT:    brkb p0.b, p0/z, p1.b
+; CHECK-NEXT:    mov z0.h, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %tz.elts = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<8 x i1> %mask.in, i1 false)
+  %mask.out = call <4 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %tz.elts)
+  ret <4 x i1> %mask.out
+}
+
+define <16 x i1> @mask_exclude_active_wider_result_type_fixed(<8 x i1> %mask.in) {
+; CHECK-LABEL: mask_exclude_active_wider_result_type_fixed:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v0.8b, v0.8b, #7
+; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    cmpne p1.b, p0/z, z0.b, #0
+; CHECK-NEXT:    brkb p0.b, p0/z, p1.b
+; CHECK-NEXT:    mov z0.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %tz.elts = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<8 x i1> %mask.in, i1 false)
+  %mask.out = call <16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %tz.elts)
+  ret <16 x i1> %mask.out
+}

From fb96582be5ac6268522bc211a1d6d7ff0bb7180c Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Fri, 6 Feb 2026 10:58:01 +0000
Subject: [PATCH 2/4] Don't optimize to 'brkb' if the result type and mask
 type don't match.

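The 'brk' result has the same element count as the incoming mask, so the
fold is only valid when the get.active.lane.mask result type matches the
cttz.elts operand type. A minimal IR sketch of a case that must not fold
to a bare 'brkb' (mirroring the pre-committed tests; the function name is
illustrative):

  ; The result is <vscale x 4 x i1> but the mask is <vscale x 8 x i1>, so
  ; a lone 'brkb' would yield a predicate of the wrong element count.
  define <vscale x 4 x i1> @narrower_result(<vscale x 8 x i1> %mask) {
    %n = call i64 @llvm.experimental.cttz.elts(<vscale x 8 x i1> %mask, i1 false)
    %out = call <vscale x 4 x i1> @llvm.get.active.lane.mask(i64 0, i64 %n)
    ret <vscale x 4 x i1> %out
  }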
---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 5 ++++-
 llvm/test/CodeGen/AArch64/sve-mask-partition.ll | 6 ++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 94216f6572f0a..20513411558e7 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -6049,11 +6049,14 @@ static SDValue optimizeBrk(SDNode *N, SelectionDAG &DAG) {
   // We're looking for an upper bound based on CTTZ_ELTS; this would be selected
   // as a cntp(brk(Pg, Mask)), but if we're just going to make a whilelo based
   // on that then we just need the brk.
-  if (Upper.getOpcode() != AArch64ISD::CTTZ_ELTS || !VT.isScalableVector())
+  if (Upper.getOpcode() != AArch64ISD::CTTZ_ELTS || !VT.isScalableVector() ||
+      Upper.getOperand(0).getValueType() != VT)
     return SDValue();
 
   SDValue Pg = Upper->getOperand(0);
   SDValue Mask = Upper->getOperand(1);
+  assert(Pg.getValueType() == Mask.getValueType() &&
+         "predicate types must match");
 
   // brk{a,b} only support .b forms, so cast to make sure all our p regs match.
   Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG);
diff --git a/llvm/test/CodeGen/AArch64/sve-mask-partition.ll b/llvm/test/CodeGen/AArch64/sve-mask-partition.ll
index e2e7d80545dc8..30c5220decda7 100644
--- a/llvm/test/CodeGen/AArch64/sve-mask-partition.ll
+++ b/llvm/test/CodeGen/AArch64/sve-mask-partition.ll
@@ -564,6 +564,8 @@ define <vscale x 4 x i1> @mask_exclude_active_narrower_result_type(<vscale x 8 x
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p1.h
 ; CHECK-NEXT:    brkb p0.b, p1/z, p0.b
+; CHECK-NEXT:    cntp x8, p0, p0.h
+; CHECK-NEXT:    whilelo p0.s, xzr, x8
 ; CHECK-NEXT:    ret
   %tz.elts = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 8 x i1> %mask.in, i1 false)
   %mask.out = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %tz.elts)
@@ -575,6 +577,8 @@ define <vscale x 16 x i1> @mask_exclude_active_wider_result_type(<vscale x 8 x i
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p1.h
 ; CHECK-NEXT:    brkb p0.b, p1/z, p0.b
+; CHECK-NEXT:    cntp x8, p0, p0.h
+; CHECK-NEXT:    whilelo p0.b, xzr, x8
 ; CHECK-NEXT:    ret
   %tz.elts = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 8 x i1> %mask.in, i1 false)
   %mask.out = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %tz.elts)
@@ -588,6 +592,8 @@ define <4 x i1> @mask_exclude_active_narrower_result_type_fixed(<8 x i1> %mask.i
 ; CHECK-NEXT:    ptrue p0.b, vl8
 ; CHECK-NEXT:    cmpne p1.b, p0/z, z0.b, #0
 ; CHECK-NEXT:    brkb p0.b, p0/z, p1.b
+; CHECK-NEXT:    cntp x8, p0, p0.b
+; CHECK-NEXT:    whilelo p0.h, xzr, x8
 ; CHECK-NEXT:    mov z0.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret

From 95ead3493f931ae6568812b7a4c81f6648f0de34 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Fri, 6 Feb 2026 13:48:04 +0000
Subject: [PATCH 3/4] Move assert to verifyTargetNode

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp     | 2 --
 llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp | 4 ++++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 20513411558e7..0929a37af6850 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -6055,8 +6055,6 @@ static SDValue optimizeBrk(SDNode *N, SelectionDAG &DAG) {
 
   SDValue Pg = Upper->getOperand(0);
   SDValue Mask = Upper->getOperand(1);
-  assert(Pg.getValueType() == Mask.getValueType() &&
-         "predicate types must match");
 
   // brk{a,b} only support .b forms, so cast to make sure all our p regs match.
   Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG);
diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index dbbb871bcec33..8653c72a5c96a 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -48,6 +48,10 @@ void AArch64SelectionDAGInfo::verifyTargetNode(const SelectionDAG &DAG,
 #ifndef NDEBUG
   // Some additional checks not yet implemented by verifyTargetNode.
   switch (N->getOpcode()) {
+  case AArch64ISD::CTTZ_ELTS:
+    assert(N->getOperand(0).getValueType() == N->getOperand(1).getValueType() &&
+           "Expected the mask and general-predicate have matching types");
+    break;
   case AArch64ISD::SADDWT:
   case AArch64ISD::SADDWB:
   case AArch64ISD::UADDWT:

From 5a3f41067778dbe343965ad076bc3495f5c3d84e Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Tue, 10 Feb 2026 17:28:26 +0000
Subject: [PATCH 4/4] Address comments

---
 .../Target/AArch64/AArch64SelectionDAGInfo.cpp   |  2 +-
 llvm/test/CodeGen/AArch64/sve-mask-partition.ll  | 16 ++++++++--------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index 8653c72a5c96a..3c208e01d7f63 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -50,7 +50,7 @@ void AArch64SelectionDAGInfo::verifyTargetNode(const SelectionDAG &DAG,
   switch (N->getOpcode()) {
   case AArch64ISD::CTTZ_ELTS:
     assert(N->getOperand(0).getValueType() == N->getOperand(1).getValueType() &&
-           "Expected the mask and general-predicate have matching types");
+           "Expected the general-predicate and mask to have matching types");
     break;
   case AArch64ISD::SADDWT:
   case AArch64ISD::SADDWB:
diff --git a/llvm/test/CodeGen/AArch64/sve-mask-partition.ll b/llvm/test/CodeGen/AArch64/sve-mask-partition.ll
index 30c5220decda7..e4bad94f08b45 100644
--- a/llvm/test/CodeGen/AArch64/sve-mask-partition.ll
+++ b/llvm/test/CodeGen/AArch64/sve-mask-partition.ll
@@ -567,8 +567,8 @@ define <vscale x 4 x i1> @mask_exclude_active_narrower_result_type(<vscale x 8 x
 ; CHECK-NEXT:    cntp x8, p0, p0.h
 ; CHECK-NEXT:    whilelo p0.s, xzr, x8
 ; CHECK-NEXT:    ret
-  %tz.elts = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 8 x i1> %mask.in, i1 false)
-  %mask.out = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %tz.elts)
+  %tz.elts = call i64 @llvm.experimental.cttz.elts(<vscale x 8 x i1> %mask.in, i1 false)
+  %mask.out = call <vscale x 4 x i1> @llvm.get.active.lane.mask(i64 0, i64 %tz.elts)
   ret <vscale x 4 x i1> %mask.out
 }
 
@@ -580,8 +580,8 @@ define <vscale x 16 x i1> @mask_exclude_active_wider_result_type(<vscale x 8 x i
 ; CHECK-NEXT:    cntp x8, p0, p0.h
 ; CHECK-NEXT:    whilelo p0.b, xzr, x8
 ; CHECK-NEXT:    ret
-  %tz.elts = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 8 x i1> %mask.in, i1 false)
-  %mask.out = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %tz.elts)
+  %tz.elts = call i64 @llvm.experimental.cttz.elts(<vscale x 8 x i1> %mask.in, i1 false)
+  %mask.out = call <vscale x 16 x i1> @llvm.get.active.lane.mask(i64 0, i64 %tz.elts)
   ret <vscale x 16 x i1> %mask.out
 }
 
@@ -597,8 +597,8 @@ define <4 x i1> @mask_exclude_active_narrower_result_type_fixed(<8 x i1> %mask.i
 ; CHECK-NEXT:    mov z0.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-  %tz.elts = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<8 x i1> %mask.in, i1 false)
-  %mask.out = call <4 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %tz.elts)
+  %tz.elts = call i64 @llvm.experimental.cttz.elts(<8 x i1> %mask.in, i1 false)
+  %mask.out = call <4 x i1> @llvm.get.active.lane.mask(i64 0, i64 %tz.elts)
   ret <4 x i1> %mask.out
 }
 
@@ -612,7 +612,7 @@ define <16 x i1> @mask_exclude_active_wider_result_type_fixed(<8 x i1> %mask.in)
 ; CHECK-NEXT:    mov z0.b, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-  %tz.elts = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<8 x i1> %mask.in, i1 false)
-  %mask.out = call <16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %tz.elts)
+  %tz.elts = call i64 @llvm.experimental.cttz.elts(<8 x i1> %mask.in, i1 false)
+  %mask.out = call <16 x i1> @llvm.get.active.lane.mask(i64 0, i64 %tz.elts)
   ret <16 x i1> %mask.out
 }
