[llvm] [AArch64] Improve code generation for experimental.cttz.elts (PR #91505)
via llvm-commits
llvm-commits at lists.llvm.org
Wed May 8 10:14:20 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
Author: Hari Limaye (hazzlim)
<details>
<summary>Changes</summary>
This patch extends support for lowering the experimental.cttz.elts intrinsic to a BRKB + CNTP instruction sequence: the lowering, previously used only for nxv16i1, is now used for all legal predicate types (nxv16i1, nxv8i1, nxv4i1 and nxv2i1). An unused parameter is also removed from some of the related regression tests.
---
Full diff: https://github.com/llvm/llvm-project/pull/91505.diff
3 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+10-1)
- (modified) llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td (+36)
- (modified) llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll (+159-53)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c1ca78af5cda8..6aea50c5f35cb 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1852,7 +1852,16 @@ bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
}
bool AArch64TargetLowering::shouldExpandCttzElements(EVT VT) const {
- return !Subtarget->hasSVEorSME() || VT != MVT::nxv16i1;
+ // Only SVE and SME architectures support BRKB and CNTP instructions.
+ if (!Subtarget->hasSVEorSME())
+ return true;
+
+ // We can only use the BRKB + CNTP sequence with legal predicate types.
+ if (VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 &&
+ VT != MVT::nxv2i1)
+ return true;
+
+ return false;
}
void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 62e68de1359f7..9fe88b0d7a92d 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2075,6 +2075,18 @@ let Predicates = [HasSVEorSME] in {
def : Pat<(i64 (AArch64CttzElts nxv16i1:$Op1)),
(CNTP_XPP_B (BRKB_PPzP (PTRUE_B 31), PPR:$Op1),
(BRKB_PPzP (PTRUE_B 31), PPR:$Op1))>;
+
+ def : Pat<(i64 (AArch64CttzElts nxv8i1:$Op1)),
+ (CNTP_XPP_H (BRKB_PPzP (PTRUE_H 31), PPR:$Op1),
+ (BRKB_PPzP (PTRUE_H 31), PPR:$Op1))>;
+
+ def : Pat<(i64 (AArch64CttzElts nxv4i1:$Op1)),
+ (CNTP_XPP_S (BRKB_PPzP (PTRUE_S 31), PPR:$Op1),
+ (BRKB_PPzP (PTRUE_S 31), PPR:$Op1))>;
+
+ def : Pat<(i64 (AArch64CttzElts nxv2i1:$Op1)),
+ (CNTP_XPP_D (BRKB_PPzP (PTRUE_D 31), PPR:$Op1),
+ (BRKB_PPzP (PTRUE_D 31), PPR:$Op1))>;
}
defm INCB_XPiI : sve_int_pred_pattern_a<0b000, "incb", add, int_aarch64_sve_cntb>;
@@ -2168,6 +2180,30 @@ let Predicates = [HasSVEorSME] in {
(INSERT_SUBREG (IMPLICIT_DEF), GPR32:$Op1, sub_32)),
sub_32)>;
+ def : Pat<(i64 (add GPR64:$Op1, (i64 (AArch64CttzElts nxv8i1:$Op2)))),
+ (INCP_XP_H (BRKB_PPzP (PTRUE_H 31), PPR:$Op2), GPR64:$Op1)>;
+
+ def : Pat<(i32 (add GPR32:$Op1, (trunc (i64 (AArch64CttzElts nxv8i1:$Op2))))),
+ (EXTRACT_SUBREG (INCP_XP_H (BRKB_PPzP (PTRUE_H 31), PPR:$Op2),
+ (INSERT_SUBREG (IMPLICIT_DEF), GPR32:$Op1, sub_32)),
+ sub_32)>;
+
+ def : Pat<(i64 (add GPR64:$Op1, (i64 (AArch64CttzElts nxv4i1:$Op2)))),
+ (INCP_XP_S (BRKB_PPzP (PTRUE_S 31), PPR:$Op2), GPR64:$Op1)>;
+
+ def : Pat<(i32 (add GPR32:$Op1, (trunc (i64 (AArch64CttzElts nxv4i1:$Op2))))),
+ (EXTRACT_SUBREG (INCP_XP_S (BRKB_PPzP (PTRUE_S 31), PPR:$Op2),
+ (INSERT_SUBREG (IMPLICIT_DEF), GPR32:$Op1, sub_32)),
+ sub_32)>;
+
+ def : Pat<(i64 (add GPR64:$Op1, (i64 (AArch64CttzElts nxv2i1:$Op2)))),
+ (INCP_XP_D (BRKB_PPzP (PTRUE_D 31), PPR:$Op2), GPR64:$Op1)>;
+
+ def : Pat<(i32 (add GPR32:$Op1, (trunc (i64 (AArch64CttzElts nxv2i1:$Op2))))),
+ (EXTRACT_SUBREG (INCP_XP_D (BRKB_PPzP (PTRUE_D 31), PPR:$Op2),
+ (INSERT_SUBREG (IMPLICIT_DEF), GPR32:$Op1, sub_32)),
+ sub_32)>;
+
defm INDEX_RR : sve_int_index_rr<"index", AArch64mul_p_oneuse>;
defm INDEX_IR : sve_int_index_ir<"index", AArch64mul_p, AArch64mul_p_oneuse>;
defm INDEX_RI : sve_int_index_ri<"index">;
diff --git a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
index 9bd2ed240810d..fe7a2ba1240dc 100644
--- a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
+++ b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
@@ -4,25 +4,6 @@
; WITH VSCALE RANGE
-define i64 @ctz_nxv8i1(<vscale x 8 x i1> %a) #0 {
-; CHECK-LABEL: ctz_nxv8i1:
-; CHECK: // %bb.0:
-; CHECK-NEXT: index z0.h, #0, #-1
-; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: cnth x9
-; CHECK-NEXT: inch z0.h
-; CHECK-NEXT: and z0.d, z0.d, z1.d
-; CHECK-NEXT: and z0.h, z0.h, #0xff
-; CHECK-NEXT: umaxv h0, p0, z0.h
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: sub w8, w9, w8
-; CHECK-NEXT: and x0, x8, #0xff
-; CHECK-NEXT: ret
- %res = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> %a, i1 0)
- ret i64 %res
-}
-
define i32 @ctz_nxv32i1(<vscale x 32 x i1> %a) #0 {
; CHECK-LABEL: ctz_nxv32i1:
; CHECK: // %bb.0:
@@ -156,41 +137,166 @@ define i64 @vscale_4096_poison(<vscale x 16 x i8> %a) #1 {
ret i64 %res
}
-; NO VSCALE RANGE
+; MATCH WITH BRKB + CNTP
-define i32 @ctz_nxv8i1_no_range(<vscale x 8 x i1> %a) {
-; CHECK-LABEL: ctz_nxv8i1_no_range:
+define i32 @ctz_nxv2i1(<vscale x 2 x i1> %a) {
+; CHECK-LABEL: ctz_nxv2i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: index z0.s, #0, #-1
-; CHECK-NEXT: cntw x8
-; CHECK-NEXT: punpklo p1.h, p0.b
-; CHECK-NEXT: neg x8, x8
-; CHECK-NEXT: punpkhi p0.h, p0.b
-; CHECK-NEXT: cnth x9
-; CHECK-NEXT: mov z1.s, w8
-; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: mov z3.s, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: incw z0.s, all, mul #2
-; CHECK-NEXT: add z1.s, z0.s, z1.s
-; CHECK-NEXT: and z0.d, z0.d, z2.d
-; CHECK-NEXT: and z1.d, z1.d, z3.d
-; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: umaxv s0, p0, z0.s
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: sub w0, w9, w8
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x0, p0, p0.d
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> %a, i1 0)
+ ret i32 %res
+}
+
+define i32 @ctz_nxv2i1_poison(<vscale x 2 x i1> %a) {
+; CHECK-LABEL: ctz_nxv2i1_poison:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x0, p0, p0.d
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> %a, i1 1)
+ ret i32 %res
+}
+
+define i64 @add_i64_ctz_nxv2i1_poison(<vscale x 2 x i1> %a, i64 %b) {
+; CHECK-LABEL: add_i64_ctz_nxv2i1_poison:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: incp x0, p0.d
+; CHECK-NEXT: ret
+ %res = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> %a, i1 1)
+ %add = add i64 %res, %b
+ ret i64 %add
+}
+
+define i32 @add_i32_ctz_nxv2i1_poison(<vscale x 2 x i1> %a, i32 %b) {
+; CHECK-LABEL: add_i32_ctz_nxv2i1_poison:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: incp x0, p0.d
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> %a, i1 1)
+ %trunc = trunc i64 %res to i32
+ %add = add i32 %trunc, %b
+ ret i32 %add
+}
+
+define i32 @ctz_nxv4i1(<vscale x 4 x i1> %a) {
+; CHECK-LABEL: ctz_nxv4i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x0, p0, p0.s
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> %a, i1 0)
+ ret i32 %res
+}
+
+define i32 @ctz_nxv4i1_poison(<vscale x 4 x i1> %a) {
+; CHECK-LABEL: ctz_nxv4i1_poison:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x0, p0, p0.s
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> %a, i1 1)
+ ret i32 %res
+}
+
+define i64 @add_i64_ctz_nxv4i1_poison(<vscale x 4 x i1> %a, i64 %b) {
+; CHECK-LABEL: add_i64_ctz_nxv4i1_poison:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: incp x0, p0.s
+; CHECK-NEXT: ret
+ %res = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> %a, i1 1)
+ %add = add i64 %res, %b
+ ret i64 %add
+}
+
+define i32 @add_i32_ctz_nxv4i1_poison(<vscale x 4 x i1> %a, i32 %b) {
+; CHECK-LABEL: add_i32_ctz_nxv4i1_poison:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: incp x0, p0.s
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> %a, i1 1)
+ %trunc = trunc i64 %res to i32
+ %add = add i32 %trunc, %b
+ ret i32 %add
+}
+
+define i32 @ctz_nxv8i1(<vscale x 8 x i1> %a) {
+; CHECK-LABEL: ctz_nxv8i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x0, p0, p0.h
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
; CHECK-NEXT: ret
%res = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> %a, i1 0)
ret i32 %res
}
-; MATCH WITH BRKB + CNTP
+define i32 @ctz_nxv8i1_poison(<vscale x 8 x i1> %a) {
+; CHECK-LABEL: ctz_nxv8i1_poison:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x0, p0, p0.h
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> %a, i1 1)
+ ret i32 %res
+}
+
+define i64 @add_i64_ctz_nxv8i1_poison(<vscale x 8 x i1> %a, i64 %b) {
+; CHECK-LABEL: add_i64_ctz_nxv8i1_poison:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: incp x0, p0.h
+; CHECK-NEXT: ret
+ %res = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> %a, i1 1)
+ %add = add i64 %res, %b
+ ret i64 %add
+}
-define i32 @ctz_nxv16i1(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %a) {
+define i32 @add_i32_ctz_nxv8i1_poison(<vscale x 8 x i1> %a, i32 %b) {
+; CHECK-LABEL: add_i32_ctz_nxv8i1_poison:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: incp x0, p0.h
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> %a, i1 1)
+ %trunc = trunc i64 %res to i32
+ %add = add i32 %trunc, %b
+ ret i32 %add
+}
+
+define i32 @ctz_nxv16i1(<vscale x 16 x i1> %a) {
; CHECK-LABEL: ctz_nxv16i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.b
-; CHECK-NEXT: brkb p0.b, p0/z, p1.b
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
; CHECK-NEXT: cntp x0, p0, p0.b
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
; CHECK-NEXT: ret
@@ -198,11 +304,11 @@ define i32 @ctz_nxv16i1(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %a) {
ret i32 %res
}
-define i32 @ctz_nxv16i1_poison(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %a) {
+define i32 @ctz_nxv16i1_poison(<vscale x 16 x i1> %a) {
; CHECK-LABEL: ctz_nxv16i1_poison:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.b
-; CHECK-NEXT: brkb p0.b, p0/z, p1.b
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
; CHECK-NEXT: cntp x0, p0, p0.b
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
; CHECK-NEXT: ret
@@ -226,11 +332,11 @@ define i32 @ctz_and_nxv16i1(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vsca
ret i32 %res
}
-define i64 @add_i64_ctz_nxv16i1_poison(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %a, i64 %b) {
+define i64 @add_i64_ctz_nxv16i1_poison(<vscale x 16 x i1> %a, i64 %b) {
; CHECK-LABEL: add_i64_ctz_nxv16i1_poison:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.b
-; CHECK-NEXT: brkb p0.b, p0/z, p1.b
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
; CHECK-NEXT: incp x0, p0.b
; CHECK-NEXT: ret
%res = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> %a, i1 1)
@@ -238,12 +344,12 @@ define i64 @add_i64_ctz_nxv16i1_poison(<vscale x 16 x i1> %pg, <vscale x 16 x i1
ret i64 %add
}
-define i32 @add_i32_ctz_nxv16i1_poison(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %a, i32 %b) {
+define i32 @add_i32_ctz_nxv16i1_poison(<vscale x 16 x i1> %a, i32 %b) {
; CHECK-LABEL: add_i32_ctz_nxv16i1_poison:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: ptrue p1.b
; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT: brkb p0.b, p0/z, p1.b
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
; CHECK-NEXT: incp x0, p0.b
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
; CHECK-NEXT: ret
``````````
</details>
https://github.com/llvm/llvm-project/pull/91505
More information about the llvm-commits mailing list