[llvm] d9be51c - [AArch64] Improve code generation for experimental.cttz.elts (#91505)
via llvm-commits
llvm-commits at lists.llvm.org
Tue May 14 01:49:57 PDT 2024
Author: Hari Limaye
Date: 2024-05-14T09:49:53+01:00
New Revision: d9be51ce68b743bde4d73b6858c454e09df341c5
URL: https://github.com/llvm/llvm-project/commit/d9be51ce68b743bde4d73b6858c454e09df341c5
DIFF: https://github.com/llvm/llvm-project/commit/d9be51ce68b743bde4d73b6858c454e09df341c5.diff
LOG: [AArch64] Improve code generation for experimental.cttz.elts (#91505)
This patch extends support for lowering the experimental.cttz.elts
intrinsic to BRKB + CNTP instruction sequences, using this lowering for
all legal predicate types. An unused parameter is also removed from some
of the related regression tests.
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll
llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2aa328e0a1270..33cc8ffaf85d5 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1868,7 +1868,12 @@ bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
}
bool AArch64TargetLowering::shouldExpandCttzElements(EVT VT) const {
- return !Subtarget->hasSVEorSME() || VT != MVT::nxv16i1;
+ if (!Subtarget->hasSVEorSME())
+ return true;
+
+ // We can only use the BRKB + CNTP sequence with legal predicate types.
+ return VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 &&
+ VT != MVT::nxv2i1;
}
void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index d4405a230613c..bd5de628d8529 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2082,6 +2082,18 @@ let Predicates = [HasSVEorSME] in {
def : Pat<(i64 (AArch64CttzElts nxv16i1:$Op1)),
(CNTP_XPP_B (BRKB_PPzP (PTRUE_B 31), PPR:$Op1),
(BRKB_PPzP (PTRUE_B 31), PPR:$Op1))>;
+
+ def : Pat<(i64 (AArch64CttzElts nxv8i1:$Op1)),
+ (CNTP_XPP_H (BRKB_PPzP (PTRUE_H 31), PPR:$Op1),
+ (BRKB_PPzP (PTRUE_H 31), PPR:$Op1))>;
+
+ def : Pat<(i64 (AArch64CttzElts nxv4i1:$Op1)),
+ (CNTP_XPP_S (BRKB_PPzP (PTRUE_S 31), PPR:$Op1),
+ (BRKB_PPzP (PTRUE_S 31), PPR:$Op1))>;
+
+ def : Pat<(i64 (AArch64CttzElts nxv2i1:$Op1)),
+ (CNTP_XPP_D (BRKB_PPzP (PTRUE_D 31), PPR:$Op1),
+ (BRKB_PPzP (PTRUE_D 31), PPR:$Op1))>;
}
defm INCB_XPiI : sve_int_pred_pattern_a<0b000, "incb", add, int_aarch64_sve_cntb>;
@@ -2175,6 +2187,30 @@ let Predicates = [HasSVEorSME] in {
(INSERT_SUBREG (IMPLICIT_DEF), GPR32:$Op1, sub_32)),
sub_32)>;
+ def : Pat<(i64 (add GPR64:$Op1, (i64 (AArch64CttzElts nxv8i1:$Op2)))),
+ (INCP_XP_H (BRKB_PPzP (PTRUE_H 31), PPR:$Op2), GPR64:$Op1)>;
+
+ def : Pat<(i32 (add GPR32:$Op1, (trunc (i64 (AArch64CttzElts nxv8i1:$Op2))))),
+ (EXTRACT_SUBREG (INCP_XP_H (BRKB_PPzP (PTRUE_H 31), PPR:$Op2),
+ (INSERT_SUBREG (IMPLICIT_DEF), GPR32:$Op1, sub_32)),
+ sub_32)>;
+
+ def : Pat<(i64 (add GPR64:$Op1, (i64 (AArch64CttzElts nxv4i1:$Op2)))),
+ (INCP_XP_S (BRKB_PPzP (PTRUE_S 31), PPR:$Op2), GPR64:$Op1)>;
+
+ def : Pat<(i32 (add GPR32:$Op1, (trunc (i64 (AArch64CttzElts nxv4i1:$Op2))))),
+ (EXTRACT_SUBREG (INCP_XP_S (BRKB_PPzP (PTRUE_S 31), PPR:$Op2),
+ (INSERT_SUBREG (IMPLICIT_DEF), GPR32:$Op1, sub_32)),
+ sub_32)>;
+
+ def : Pat<(i64 (add GPR64:$Op1, (i64 (AArch64CttzElts nxv2i1:$Op2)))),
+ (INCP_XP_D (BRKB_PPzP (PTRUE_D 31), PPR:$Op2), GPR64:$Op1)>;
+
+ def : Pat<(i32 (add GPR32:$Op1, (trunc (i64 (AArch64CttzElts nxv2i1:$Op2))))),
+ (EXTRACT_SUBREG (INCP_XP_D (BRKB_PPzP (PTRUE_D 31), PPR:$Op2),
+ (INSERT_SUBREG (IMPLICIT_DEF), GPR32:$Op1, sub_32)),
+ sub_32)>;
+
defm INDEX_RR : sve_int_index_rr<"index", AArch64mul_p_oneuse>;
defm INDEX_IR : sve_int_index_ir<"index", AArch64mul_p, AArch64mul_p_oneuse>;
defm INDEX_RI : sve_int_index_ri<"index">;
diff --git a/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll b/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll
index 01dc086d93853..cc1532ee33dcf 100644
--- a/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll
@@ -3,14 +3,14 @@
define void @foo_no_vscale_range() {
; CHECK-LABEL: 'foo_no_vscale_range'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 true)
@@ -23,14 +23,14 @@ define void @foo_no_vscale_range() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res.i32.v32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v32i1(<32 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 false)
@@ -95,24 +95,24 @@ define void @foo_no_vscale_range() {
define void @foo_vscale_range_1_16() vscale_range(1,16) {
; CHECK-LABEL: 'foo_vscale_range_1_16'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
@@ -144,24 +144,24 @@ define void @foo_vscale_range_1_16() vscale_range(1,16) {
define void @foo_vscale_range_1_16384() vscale_range(1,16384) {
; CHECK-LABEL: 'foo_vscale_range_1_16384'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
diff --git a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
index 9bd2ed240810d..211237542a15b 100644
--- a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
+++ b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
@@ -4,25 +4,6 @@
; WITH VSCALE RANGE
-define i64 @ctz_nxv8i1(<vscale x 8 x i1> %a) #0 {
-; CHECK-LABEL: ctz_nxv8i1:
-; CHECK: // %bb.0:
-; CHECK-NEXT: index z0.h, #0, #-1
-; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: cnth x9
-; CHECK-NEXT: inch z0.h
-; CHECK-NEXT: and z0.d, z0.d, z1.d
-; CHECK-NEXT: and z0.h, z0.h, #0xff
-; CHECK-NEXT: umaxv h0, p0, z0.h
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: sub w8, w9, w8
-; CHECK-NEXT: and x0, x8, #0xff
-; CHECK-NEXT: ret
- %res = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> %a, i1 0)
- ret i64 %res
-}
-
define i32 @ctz_nxv32i1(<vscale x 32 x i1> %a) #0 {
; CHECK-LABEL: ctz_nxv32i1:
; CHECK: // %bb.0:
@@ -156,41 +137,166 @@ define i64 @vscale_4096_poison(<vscale x 16 x i8> %a) #1 {
ret i64 %res
}
-; NO VSCALE RANGE
+; EFFICIENT LOWERING USING BRKB
-define i32 @ctz_nxv8i1_no_range(<vscale x 8 x i1> %a) {
-; CHECK-LABEL: ctz_nxv8i1_no_range:
+define i32 @ctz_nxv2i1(<vscale x 2 x i1> %a) {
+; CHECK-LABEL: ctz_nxv2i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: index z0.s, #0, #-1
-; CHECK-NEXT: cntw x8
-; CHECK-NEXT: punpklo p1.h, p0.b
-; CHECK-NEXT: neg x8, x8
-; CHECK-NEXT: punpkhi p0.h, p0.b
-; CHECK-NEXT: cnth x9
-; CHECK-NEXT: mov z1.s, w8
-; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: mov z3.s, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: incw z0.s, all, mul #2
-; CHECK-NEXT: add z1.s, z0.s, z1.s
-; CHECK-NEXT: and z0.d, z0.d, z2.d
-; CHECK-NEXT: and z1.d, z1.d, z3.d
-; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: umaxv s0, p0, z0.s
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: sub w0, w9, w8
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x0, p0, p0.d
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> %a, i1 0)
+ ret i32 %res
+}
+
+define i32 @ctz_nxv2i1_poison(<vscale x 2 x i1> %a) {
+; CHECK-LABEL: ctz_nxv2i1_poison:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x0, p0, p0.d
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> %a, i1 1)
+ ret i32 %res
+}
+
+define i64 @add_i64_ctz_nxv2i1_poison(<vscale x 2 x i1> %a, i64 %b) {
+; CHECK-LABEL: add_i64_ctz_nxv2i1_poison:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: incp x0, p0.d
+; CHECK-NEXT: ret
+ %res = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> %a, i1 1)
+ %add = add i64 %res, %b
+ ret i64 %add
+}
+
+define i32 @add_i32_ctz_nxv2i1_poison(<vscale x 2 x i1> %a, i32 %b) {
+; CHECK-LABEL: add_i32_ctz_nxv2i1_poison:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: incp x0, p0.d
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> %a, i1 1)
+ %trunc = trunc i64 %res to i32
+ %add = add i32 %trunc, %b
+ ret i32 %add
+}
+
+define i32 @ctz_nxv4i1(<vscale x 4 x i1> %a) {
+; CHECK-LABEL: ctz_nxv4i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x0, p0, p0.s
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> %a, i1 0)
+ ret i32 %res
+}
+
+define i32 @ctz_nxv4i1_poison(<vscale x 4 x i1> %a) {
+; CHECK-LABEL: ctz_nxv4i1_poison:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x0, p0, p0.s
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> %a, i1 1)
+ ret i32 %res
+}
+
+define i64 @add_i64_ctz_nxv4i1_poison(<vscale x 4 x i1> %a, i64 %b) {
+; CHECK-LABEL: add_i64_ctz_nxv4i1_poison:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: incp x0, p0.s
+; CHECK-NEXT: ret
+ %res = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> %a, i1 1)
+ %add = add i64 %res, %b
+ ret i64 %add
+}
+
+define i32 @add_i32_ctz_nxv4i1_poison(<vscale x 4 x i1> %a, i32 %b) {
+; CHECK-LABEL: add_i32_ctz_nxv4i1_poison:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: incp x0, p0.s
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> %a, i1 1)
+ %trunc = trunc i64 %res to i32
+ %add = add i32 %trunc, %b
+ ret i32 %add
+}
+
+define i32 @ctz_nxv8i1(<vscale x 8 x i1> %a) {
+; CHECK-LABEL: ctz_nxv8i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x0, p0, p0.h
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
; CHECK-NEXT: ret
%res = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> %a, i1 0)
ret i32 %res
}
-; MATCH WITH BRKB + CNTP
+define i32 @ctz_nxv8i1_poison(<vscale x 8 x i1> %a) {
+; CHECK-LABEL: ctz_nxv8i1_poison:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x0, p0, p0.h
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> %a, i1 1)
+ ret i32 %res
+}
+
+define i64 @add_i64_ctz_nxv8i1_poison(<vscale x 8 x i1> %a, i64 %b) {
+; CHECK-LABEL: add_i64_ctz_nxv8i1_poison:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: incp x0, p0.h
+; CHECK-NEXT: ret
+ %res = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> %a, i1 1)
+ %add = add i64 %res, %b
+ ret i64 %add
+}
-define i32 @ctz_nxv16i1(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %a) {
+define i32 @add_i32_ctz_nxv8i1_poison(<vscale x 8 x i1> %a, i32 %b) {
+; CHECK-LABEL: add_i32_ctz_nxv8i1_poison:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: incp x0, p0.h
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> %a, i1 1)
+ %trunc = trunc i64 %res to i32
+ %add = add i32 %trunc, %b
+ ret i32 %add
+}
+
+define i32 @ctz_nxv16i1(<vscale x 16 x i1> %a) {
; CHECK-LABEL: ctz_nxv16i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.b
-; CHECK-NEXT: brkb p0.b, p0/z, p1.b
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
; CHECK-NEXT: cntp x0, p0, p0.b
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
; CHECK-NEXT: ret
@@ -198,11 +304,11 @@ define i32 @ctz_nxv16i1(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %a) {
ret i32 %res
}
-define i32 @ctz_nxv16i1_poison(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %a) {
+define i32 @ctz_nxv16i1_poison(<vscale x 16 x i1> %a) {
; CHECK-LABEL: ctz_nxv16i1_poison:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.b
-; CHECK-NEXT: brkb p0.b, p0/z, p1.b
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
; CHECK-NEXT: cntp x0, p0, p0.b
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
; CHECK-NEXT: ret
@@ -226,11 +332,11 @@ define i32 @ctz_and_nxv16i1(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vsca
ret i32 %res
}
-define i64 @add_i64_ctz_nxv16i1_poison(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %a, i64 %b) {
+define i64 @add_i64_ctz_nxv16i1_poison(<vscale x 16 x i1> %a, i64 %b) {
; CHECK-LABEL: add_i64_ctz_nxv16i1_poison:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.b
-; CHECK-NEXT: brkb p0.b, p0/z, p1.b
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
; CHECK-NEXT: incp x0, p0.b
; CHECK-NEXT: ret
%res = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> %a, i1 1)
@@ -238,12 +344,12 @@ define i64 @add_i64_ctz_nxv16i1_poison(<vscale x 16 x i1> %pg, <vscale x 16 x i1
ret i64 %add
}
-define i32 @add_i32_ctz_nxv16i1_poison(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %a, i32 %b) {
+define i32 @add_i32_ctz_nxv16i1_poison(<vscale x 16 x i1> %a, i32 %b) {
; CHECK-LABEL: add_i32_ctz_nxv16i1_poison:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: ptrue p1.b
; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT: brkb p0.b, p0/z, p1.b
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
; CHECK-NEXT: incp x0, p0.b
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
; CHECK-NEXT: ret
More information about the llvm-commits
mailing list