[llvm] [AArch64] Improve code generation for experimental.cttz.elts (PR #91505)

via llvm-commits llvm-commits at lists.llvm.org
Wed May 8 10:14:20 PDT 2024


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-aarch64

Author: Hari Limaye (hazzlim)

<details>
<summary>Changes</summary>

This patch extends support for lowering the experimental.cttz.elts intrinsic to BRKB + CNTP instruction sequences, using this lowering for all legal predicate types. An unused parameter is also removed from some of the related regression tests.

---
Full diff: https://github.com/llvm/llvm-project/pull/91505.diff


3 Files Affected:

- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+10-1) 
- (modified) llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td (+36) 
- (modified) llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll (+159-53) 


``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c1ca78af5cda8..6aea50c5f35cb 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1852,7 +1852,16 @@ bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
 }
 
 bool AArch64TargetLowering::shouldExpandCttzElements(EVT VT) const {
-  return !Subtarget->hasSVEorSME() || VT != MVT::nxv16i1;
+  // Only SVE and SME architectures support BRKB and CNTP instructions.
+  if (!Subtarget->hasSVEorSME())
+    return true;
+
+  // We can only use the BRKB + CNTP sequence with legal predicate types.
+  if (VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 &&
+      VT != MVT::nxv2i1)
+    return true;
+
+  return false;
 }
 
 void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 62e68de1359f7..9fe88b0d7a92d 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2075,6 +2075,18 @@ let Predicates = [HasSVEorSME] in {
   def : Pat<(i64 (AArch64CttzElts nxv16i1:$Op1)),
             (CNTP_XPP_B (BRKB_PPzP (PTRUE_B 31), PPR:$Op1),
                         (BRKB_PPzP (PTRUE_B 31), PPR:$Op1))>;
+
+  def : Pat<(i64 (AArch64CttzElts nxv8i1:$Op1)),
+            (CNTP_XPP_H (BRKB_PPzP (PTRUE_H 31), PPR:$Op1),
+                        (BRKB_PPzP (PTRUE_H 31), PPR:$Op1))>;
+
+  def : Pat<(i64 (AArch64CttzElts nxv4i1:$Op1)),
+            (CNTP_XPP_S (BRKB_PPzP (PTRUE_S 31), PPR:$Op1),
+                        (BRKB_PPzP (PTRUE_S 31), PPR:$Op1))>;
+
+  def : Pat<(i64 (AArch64CttzElts nxv2i1:$Op1)),
+            (CNTP_XPP_D (BRKB_PPzP (PTRUE_D 31), PPR:$Op1),
+                        (BRKB_PPzP (PTRUE_D 31), PPR:$Op1))>;
 }
 
   defm INCB_XPiI : sve_int_pred_pattern_a<0b000, "incb", add, int_aarch64_sve_cntb>;
@@ -2168,6 +2180,30 @@ let Predicates = [HasSVEorSME] in {
                                        (INSERT_SUBREG (IMPLICIT_DEF), GPR32:$Op1, sub_32)),
                             sub_32)>;
 
+  def : Pat<(i64 (add GPR64:$Op1, (i64 (AArch64CttzElts nxv8i1:$Op2)))),
+            (INCP_XP_H (BRKB_PPzP (PTRUE_H 31), PPR:$Op2), GPR64:$Op1)>;
+
+  def : Pat<(i32 (add GPR32:$Op1, (trunc (i64 (AArch64CttzElts nxv8i1:$Op2))))),
+            (EXTRACT_SUBREG (INCP_XP_H (BRKB_PPzP (PTRUE_H 31), PPR:$Op2),
+                                       (INSERT_SUBREG (IMPLICIT_DEF), GPR32:$Op1, sub_32)),
+                            sub_32)>;
+
+  def : Pat<(i64 (add GPR64:$Op1, (i64 (AArch64CttzElts nxv4i1:$Op2)))),
+            (INCP_XP_S (BRKB_PPzP (PTRUE_S 31), PPR:$Op2), GPR64:$Op1)>;
+
+  def : Pat<(i32 (add GPR32:$Op1, (trunc (i64 (AArch64CttzElts nxv4i1:$Op2))))),
+            (EXTRACT_SUBREG (INCP_XP_S (BRKB_PPzP (PTRUE_S 31), PPR:$Op2),
+                                       (INSERT_SUBREG (IMPLICIT_DEF), GPR32:$Op1, sub_32)),
+                            sub_32)>;
+
+  def : Pat<(i64 (add GPR64:$Op1, (i64 (AArch64CttzElts nxv2i1:$Op2)))),
+            (INCP_XP_D (BRKB_PPzP (PTRUE_D 31), PPR:$Op2), GPR64:$Op1)>;
+
+  def : Pat<(i32 (add GPR32:$Op1, (trunc (i64 (AArch64CttzElts nxv2i1:$Op2))))),
+            (EXTRACT_SUBREG (INCP_XP_D (BRKB_PPzP (PTRUE_D 31), PPR:$Op2),
+                                       (INSERT_SUBREG (IMPLICIT_DEF), GPR32:$Op1, sub_32)),
+                            sub_32)>;
+
   defm INDEX_RR : sve_int_index_rr<"index", AArch64mul_p_oneuse>;
   defm INDEX_IR : sve_int_index_ir<"index", AArch64mul_p, AArch64mul_p_oneuse>;
   defm INDEX_RI : sve_int_index_ri<"index">;
diff --git a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
index 9bd2ed240810d..fe7a2ba1240dc 100644
--- a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
+++ b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
@@ -4,25 +4,6 @@
 
 ; WITH VSCALE RANGE
 
-define i64 @ctz_nxv8i1(<vscale x 8 x i1> %a) #0 {
-; CHECK-LABEL: ctz_nxv8i1:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    index z0.h, #0, #-1
-; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    cnth x9
-; CHECK-NEXT:    inch z0.h
-; CHECK-NEXT:    and z0.d, z0.d, z1.d
-; CHECK-NEXT:    and z0.h, z0.h, #0xff
-; CHECK-NEXT:    umaxv h0, p0, z0.h
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    sub w8, w9, w8
-; CHECK-NEXT:    and x0, x8, #0xff
-; CHECK-NEXT:    ret
-  %res = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> %a, i1 0)
-  ret i64 %res
-}
-
 define i32 @ctz_nxv32i1(<vscale x 32 x i1> %a) #0 {
 ; CHECK-LABEL: ctz_nxv32i1:
 ; CHECK:       // %bb.0:
@@ -156,41 +137,166 @@ define i64 @vscale_4096_poison(<vscale x 16 x i8> %a) #1 {
   ret i64 %res
 }
 
-; NO VSCALE RANGE
+; MATCH WITH BRKB + CNTP
 
-define i32 @ctz_nxv8i1_no_range(<vscale x 8 x i1> %a) {
-; CHECK-LABEL: ctz_nxv8i1_no_range:
+define i32 @ctz_nxv2i1(<vscale x 2 x i1> %a) {
+; CHECK-LABEL: ctz_nxv2i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    index z0.s, #0, #-1
-; CHECK-NEXT:    cntw x8
-; CHECK-NEXT:    punpklo p1.h, p0.b
-; CHECK-NEXT:    neg x8, x8
-; CHECK-NEXT:    punpkhi p0.h, p0.b
-; CHECK-NEXT:    cnth x9
-; CHECK-NEXT:    mov z1.s, w8
-; CHECK-NEXT:    mov z2.s, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    mov z3.s, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    incw z0.s, all, mul #2
-; CHECK-NEXT:    add z1.s, z0.s, z1.s
-; CHECK-NEXT:    and z0.d, z0.d, z2.d
-; CHECK-NEXT:    and z1.d, z1.d, z3.d
-; CHECK-NEXT:    umax z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT:    umaxv s0, p0, z0.s
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    sub w0, w9, w8
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    brkb p0.b, p1/z, p0.b
+; CHECK-NEXT:    cntp x0, p0, p0.d
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+  %res = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> %a, i1 0)
+  ret i32 %res
+}
+
+define i32 @ctz_nxv2i1_poison(<vscale x 2 x i1> %a) {
+; CHECK-LABEL: ctz_nxv2i1_poison:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    brkb p0.b, p1/z, p0.b
+; CHECK-NEXT:    cntp x0, p0, p0.d
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+  %res = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> %a, i1 1)
+  ret i32 %res
+}
+
+define i64 @add_i64_ctz_nxv2i1_poison(<vscale x 2 x i1> %a, i64 %b) {
+; CHECK-LABEL: add_i64_ctz_nxv2i1_poison:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    brkb p0.b, p1/z, p0.b
+; CHECK-NEXT:    incp x0, p0.d
+; CHECK-NEXT:    ret
+  %res = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> %a, i1 1)
+  %add = add i64 %res, %b
+  ret i64 %add
+}
+
+define i32 @add_i32_ctz_nxv2i1_poison(<vscale x 2 x i1> %a, i32 %b) {
+; CHECK-LABEL: add_i32_ctz_nxv2i1_poison:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    brkb p0.b, p1/z, p0.b
+; CHECK-NEXT:    incp x0, p0.d
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+  %res = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> %a, i1 1)
+  %trunc = trunc i64 %res to i32
+  %add = add i32 %trunc, %b
+  ret i32 %add
+}
+
+define i32 @ctz_nxv4i1(<vscale x 4 x i1> %a) {
+; CHECK-LABEL: ctz_nxv4i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    brkb p0.b, p1/z, p0.b
+; CHECK-NEXT:    cntp x0, p0, p0.s
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+  %res = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> %a, i1 0)
+  ret i32 %res
+}
+
+define i32 @ctz_nxv4i1_poison(<vscale x 4 x i1> %a) {
+; CHECK-LABEL: ctz_nxv4i1_poison:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    brkb p0.b, p1/z, p0.b
+; CHECK-NEXT:    cntp x0, p0, p0.s
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+  %res = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> %a, i1 1)
+  ret i32 %res
+}
+
+define i64 @add_i64_ctz_nxv4i1_poison(<vscale x 4 x i1> %a, i64 %b) {
+; CHECK-LABEL: add_i64_ctz_nxv4i1_poison:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    brkb p0.b, p1/z, p0.b
+; CHECK-NEXT:    incp x0, p0.s
+; CHECK-NEXT:    ret
+  %res = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> %a, i1 1)
+  %add = add i64 %res, %b
+  ret i64 %add
+}
+
+define i32 @add_i32_ctz_nxv4i1_poison(<vscale x 4 x i1> %a, i32 %b) {
+; CHECK-LABEL: add_i32_ctz_nxv4i1_poison:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    brkb p0.b, p1/z, p0.b
+; CHECK-NEXT:    incp x0, p0.s
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+  %res = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> %a, i1 1)
+  %trunc = trunc i64 %res to i32
+  %add = add i32 %trunc, %b
+  ret i32 %add
+}
+
+define i32 @ctz_nxv8i1(<vscale x 8 x i1> %a) {
+; CHECK-LABEL: ctz_nxv8i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.h
+; CHECK-NEXT:    brkb p0.b, p1/z, p0.b
+; CHECK-NEXT:    cntp x0, p0, p0.h
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
   %res = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> %a, i1 0)
   ret i32 %res
 }
 
-; MATCH WITH BRKB + CNTP
+define i32 @ctz_nxv8i1_poison(<vscale x 8 x i1> %a) {
+; CHECK-LABEL: ctz_nxv8i1_poison:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.h
+; CHECK-NEXT:    brkb p0.b, p1/z, p0.b
+; CHECK-NEXT:    cntp x0, p0, p0.h
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+  %res = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> %a, i1 1)
+  ret i32 %res
+}
+
+define i64 @add_i64_ctz_nxv8i1_poison(<vscale x 8 x i1> %a, i64 %b) {
+; CHECK-LABEL: add_i64_ctz_nxv8i1_poison:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.h
+; CHECK-NEXT:    brkb p0.b, p1/z, p0.b
+; CHECK-NEXT:    incp x0, p0.h
+; CHECK-NEXT:    ret
+  %res = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> %a, i1 1)
+  %add = add i64 %res, %b
+  ret i64 %add
+}
 
-define i32 @ctz_nxv16i1(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %a) {
+define i32 @add_i32_ctz_nxv8i1_poison(<vscale x 8 x i1> %a, i32 %b) {
+; CHECK-LABEL: add_i32_ctz_nxv8i1_poison:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.h
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    brkb p0.b, p1/z, p0.b
+; CHECK-NEXT:    incp x0, p0.h
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+  %res = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> %a, i1 1)
+  %trunc = trunc i64 %res to i32
+  %add = add i32 %trunc, %b
+  ret i32 %add
+}
+
+define i32 @ctz_nxv16i1(<vscale x 16 x i1> %a) {
 ; CHECK-LABEL: ctz_nxv16i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    brkb p0.b, p0/z, p1.b
+; CHECK-NEXT:    ptrue p1.b
+; CHECK-NEXT:    brkb p0.b, p1/z, p0.b
 ; CHECK-NEXT:    cntp x0, p0, p0.b
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
@@ -198,11 +304,11 @@ define i32 @ctz_nxv16i1(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %a) {
   ret i32 %res
 }
 
-define i32 @ctz_nxv16i1_poison(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %a) {
+define i32 @ctz_nxv16i1_poison(<vscale x 16 x i1> %a) {
 ; CHECK-LABEL: ctz_nxv16i1_poison:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    brkb p0.b, p0/z, p1.b
+; CHECK-NEXT:    ptrue p1.b
+; CHECK-NEXT:    brkb p0.b, p1/z, p0.b
 ; CHECK-NEXT:    cntp x0, p0, p0.b
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
@@ -226,11 +332,11 @@ define i32 @ctz_and_nxv16i1(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vsca
   ret i32 %res
 }
 
-define i64 @add_i64_ctz_nxv16i1_poison(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %a, i64 %b) {
+define i64 @add_i64_ctz_nxv16i1_poison(<vscale x 16 x i1> %a, i64 %b) {
 ; CHECK-LABEL: add_i64_ctz_nxv16i1_poison:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    brkb p0.b, p0/z, p1.b
+; CHECK-NEXT:    ptrue p1.b
+; CHECK-NEXT:    brkb p0.b, p1/z, p0.b
 ; CHECK-NEXT:    incp x0, p0.b
 ; CHECK-NEXT:    ret
   %res = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> %a, i1 1)
@@ -238,12 +344,12 @@ define i64 @add_i64_ctz_nxv16i1_poison(<vscale x 16 x i1> %pg, <vscale x 16 x i1
   ret i64 %add
 }
 
-define i32 @add_i32_ctz_nxv16i1_poison(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %a, i32 %b) {
+define i32 @add_i32_ctz_nxv16i1_poison(<vscale x 16 x i1> %a, i32 %b) {
 ; CHECK-LABEL: add_i32_ctz_nxv16i1_poison:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ptrue p1.b
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT:    brkb p0.b, p0/z, p1.b
+; CHECK-NEXT:    brkb p0.b, p1/z, p0.b
 ; CHECK-NEXT:    incp x0, p0.b
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret

``````````

</details>


https://github.com/llvm/llvm-project/pull/91505


More information about the llvm-commits mailing list