[llvm] 12a7413 - [AArch64][SVE] Combine cntp intrinsics with add/sub to produce incp/decp

Bradley Smith via llvm-commits llvm-commits at lists.llvm.org
Fri May 14 09:16:31 PDT 2021


Author: Bradley Smith
Date: 2021-05-14T17:16:06+01:00
New Revision: 12a74137b3c48bb7e47ce39a8bc76c97c3f115be

URL: https://github.com/llvm/llvm-project/commit/12a74137b3c48bb7e47ce39a8bc76c97c3f115be
DIFF: https://github.com/llvm/llvm-project/commit/12a74137b3c48bb7e47ce39a8bc76c97c3f115be.diff

LOG: [AArch64][SVE] Combine cntp intrinsics with add/sub to produce incp/decp

Depends on D101062

Differential Revision: https://reviews.llvm.org/D102077
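
For illustration, a minimal IR sketch of the fold this patch enables (the
function and value names below are made up for the example; the behaviour
matches the tests added by the patch):

  ; With an all-active governing predicate, a cntp whose only user is an
  ; add is now selected as a single incp (cntp feeding a sub becomes decp):
  ;
  ;   before:  ptrue p1.b             after:  incp x0, p0.b
  ;            cntp  x8, p1, p0.b
  ;            add   x0, x8, x0
  define i64 @example(i64 %x, <vscale x 16 x i1> %pg) {
    %allact = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
    %cnt = call i64 @llvm.aarch64.sve.cntp.nxv16i1(<vscale x 16 x i1> %allact, <vscale x 16 x i1> %pg)
    %res = add i64 %cnt, %x
    ret i64 %res
  }
  declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32)
  declare i64 @llvm.aarch64.sve.cntp.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>)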

Added: 
    llvm/test/CodeGen/AArch64/sve-cntp-combine.ll

Modified: 
    llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
    llvm/lib/Target/AArch64/SVEInstrFormats.td

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index dfe015ebf945..58317f977940 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1365,8 +1365,8 @@ let Predicates = [HasSVE] in {
   defm SQDECP_XP   : sve_int_count_r_x64<0b01010, "sqdecp", int_aarch64_sve_sqdecp_n64>;
   defm UQDECP_WP   : sve_int_count_r_u32<0b01100, "uqdecp", int_aarch64_sve_uqdecp_n32>;
   defm UQDECP_XP   : sve_int_count_r_x64<0b01110, "uqdecp", int_aarch64_sve_uqdecp_n64>;
-  defm INCP_XP     : sve_int_count_r_x64<0b10000, "incp">;
-  defm DECP_XP     : sve_int_count_r_x64<0b10100, "decp">;
+  defm INCP_XP     : sve_int_count_r_x64<0b10000, "incp", null_frag, add>;
+  defm DECP_XP     : sve_int_count_r_x64<0b10100, "decp", null_frag, sub>;
 
   defm SQINCP_ZP   : sve_int_count_v<0b00000, "sqincp", int_aarch64_sve_sqincp>;
   defm UQINCP_ZP   : sve_int_count_v<0b00100, "uqincp", int_aarch64_sve_uqincp>;

diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 210c2d254564..d0523c44cc49 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -263,6 +263,11 @@ def sve_incdec_imm : Operand<i32>, TImmLeaf<i32, [{
 def sve_cnt_mul_imm : ComplexPattern<i32, 1, "SelectCntImm<1, 16, 1, false>">;
 def sve_cnt_shl_imm : ComplexPattern<i32, 1, "SelectCntImm<1, 16, 1, true>">;
 
+def int_aarch64_sve_cntp_oneuse : PatFrag<(ops node:$pred, node:$src2),
+                                          (int_aarch64_sve_cntp node:$pred, node:$src2), [{
+  return N->hasOneUse();
+}]>;
+
 //===----------------------------------------------------------------------===//
 // SVE PTrue - These are used extensively throughout the pattern matching so
 //             it's important we define them first.
@@ -664,7 +669,8 @@ multiclass sve_int_count_r_u32<bits<5> opc, string asm,
 }
 
 multiclass sve_int_count_r_x64<bits<5> opc, string asm,
-                               SDPatternOperator op = null_frag> {
+                               SDPatternOperator op,
+                               SDPatternOperator combine_op = null_frag> {
   def _B : sve_int_count_r<0b00, opc, asm, GPR64z, PPR8, GPR64z>;
   def _H : sve_int_count_r<0b01, opc, asm, GPR64z, PPR16, GPR64z>;
   def _S : sve_int_count_r<0b10, opc, asm, GPR64z, PPR32, GPR64z>;
@@ -678,6 +684,16 @@ multiclass sve_int_count_r_x64<bits<5> opc, string asm,
             (!cast<Instruction>(NAME # _S) PPRAny:$Pg, $Rn)>;
   def : Pat<(i64 (op GPR64:$Rn, (nxv2i1 PPRAny:$Pg))),
             (!cast<Instruction>(NAME # _D) PPRAny:$Pg, $Rn)>;
+
+  // Combine cntp with combine_op
+  def : Pat<(i64 (combine_op GPR64:$Rn, (int_aarch64_sve_cntp_oneuse (nxv16i1 (SVEAllActive)), (nxv16i1 PPRAny:$pred)))),
+            (!cast<Instruction>(NAME # _B) PPRAny:$pred, $Rn)>;
+  def : Pat<(i64 (combine_op GPR64:$Rn, (int_aarch64_sve_cntp_oneuse (nxv8i1 (SVEAllActive)), (nxv8i1 PPRAny:$pred)))),
+            (!cast<Instruction>(NAME # _H) PPRAny:$pred, $Rn)>;
+  def : Pat<(i64 (combine_op GPR64:$Rn, (int_aarch64_sve_cntp_oneuse (nxv4i1 (SVEAllActive)), (nxv4i1 PPRAny:$pred)))),
+            (!cast<Instruction>(NAME # _S) PPRAny:$pred, $Rn)>;
+  def : Pat<(i64 (combine_op GPR64:$Rn, (int_aarch64_sve_cntp_oneuse (nxv2i1 (SVEAllActive)), (nxv2i1 PPRAny:$pred)))),
+            (!cast<Instruction>(NAME # _D) PPRAny:$pred, $Rn)>;
 }
 
 class sve_int_count_v<bits<2> sz8_64, bits<5> opc, string asm,

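The int_aarch64_sve_cntp_oneuse PatFrag above restricts the fold to cntp
results with a single user: if the count is needed elsewhere, the cntp must
be kept anyway, so folding a copy of it into an incp/decp would gain nothing.
A sketch of a case the patterns deliberately skip (names are illustrative):

  ; %cnt has two users (the add and the mul), so the cntp is kept and no
  ; incp is formed.
  define i64 @example_multiuse(i64 %x, <vscale x 2 x i1> %pg) {
    %allact = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
    %cnt = call i64 @llvm.aarch64.sve.cntp.nxv2i1(<vscale x 2 x i1> %allact, <vscale x 2 x i1> %pg)
    %add = add i64 %cnt, %x
    %mul = mul i64 %cnt, %x
    %res = add i64 %add, %mul
    ret i64 %res
  }
  declare <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32)
  declare i64 @llvm.aarch64.sve.cntp.nxv2i1(<vscale x 2 x i1>, <vscale x 2 x i1>)
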
diff --git a/llvm/test/CodeGen/AArch64/sve-cntp-combine.ll b/llvm/test/CodeGen/AArch64/sve-cntp-combine.ll
new file mode 100644
index 000000000000..3539264662eb
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-cntp-combine.ll
@@ -0,0 +1,169 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; INCP
+
+define i64 @cntp_add_nxv16i1(i64 %x, <vscale x 16 x i1> %pg) #0 {
+; CHECK-LABEL: cntp_add_nxv16i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    incp x0, p0.b
+; CHECK-NEXT:    ret
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %2 = tail call i64 @llvm.aarch64.sve.cntp.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %pg)
+  %add = add i64 %2, %x
+  ret i64 %add
+}
+
+define i64 @cntp_add_nxv8i1(i64 %x, <vscale x 8 x i1> %pg) #0 {
+; CHECK-LABEL: cntp_add_nxv8i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    incp x0, p0.h
+; CHECK-NEXT:    ret
+  %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %2 = tail call i64 @llvm.aarch64.sve.cntp.nxv8i1(<vscale x 8 x i1> %1, <vscale x 8 x i1> %pg)
+  %add = add i64 %2, %x
+  ret i64 %add
+}
+
+define i64 @cntp_add_nxv4i1(i64 %x, <vscale x 4 x i1> %pg) #0 {
+; CHECK-LABEL: cntp_add_nxv4i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    incp x0, p0.s
+; CHECK-NEXT:    ret
+  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %2 = tail call i64 @llvm.aarch64.sve.cntp.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %pg)
+  %add = add i64 %2, %x
+  ret i64 %add
+}
+
+define i64 @cntp_add_nxv2i1(i64 %x, <vscale x 2 x i1> %pg) #0 {
+; CHECK-LABEL: cntp_add_nxv2i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    incp x0, p0.d
+; CHECK-NEXT:    ret
+  %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %2 = tail call i64 @llvm.aarch64.sve.cntp.nxv2i1(<vscale x 2 x i1> %1, <vscale x 2 x i1> %pg)
+  %add = add i64 %2, %x
+  ret i64 %add
+}
+
+define i64 @cntp_add_all_active_nxv8i1(i64 %x, <vscale x 8 x i1> %pg) #0 {
+; CHECK-LABEL: cntp_add_all_active_nxv8i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    incp x0, p0.h
+; CHECK-NEXT:    ret
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %2 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %1)
+  %3 = tail call i64 @llvm.aarch64.sve.cntp.nxv8i1(<vscale x 8 x i1> %2, <vscale x 8 x i1> %pg)
+  %add = add i64 %3, %x
+  ret i64 %add
+}
+
+define i64 @cntp_add_nxv2i1_multiuse(i64 %x, <vscale x 2 x i1> %pg) #0 {
+; CHECK-LABEL: cntp_add_nxv2i1_multiuse:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    cntp x8, p1, p0.d
+; CHECK-NEXT:    add x9, x8, x0
+; CHECK-NEXT:    madd x0, x8, x0, x9
+; CHECK-NEXT:    ret
+  %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %2 = tail call i64 @llvm.aarch64.sve.cntp.nxv2i1(<vscale x 2 x i1> %1, <vscale x 2 x i1> %pg)
+  %add = add i64 %2, %x
+  %mul = mul i64 %2, %x
+  %res = add i64 %add, %mul
+  ret i64 %res
+}
+
+; DECP
+
+define i64 @cntp_sub_nxv16i1(i64 %x, <vscale x 16 x i1> %pg) #0 {
+; CHECK-LABEL: cntp_sub_nxv16i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    decp x0, p0.b
+; CHECK-NEXT:    ret
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %2 = tail call i64 @llvm.aarch64.sve.cntp.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %pg)
+  %sub = sub i64 %x, %2
+  ret i64 %sub
+}
+
+define i64 @cntp_sub_nxv8i1(i64 %x, <vscale x 8 x i1> %pg) #0 {
+; CHECK-LABEL: cntp_sub_nxv8i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    decp x0, p0.h
+; CHECK-NEXT:    ret
+  %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %2 = tail call i64 @llvm.aarch64.sve.cntp.nxv8i1(<vscale x 8 x i1> %1, <vscale x 8 x i1> %pg)
+  %sub = sub i64 %x, %2
+  ret i64 %sub
+}
+
+define i64 @cntp_sub_nxv4i1(i64 %x, <vscale x 4 x i1> %pg) #0 {
+; CHECK-LABEL: cntp_sub_nxv4i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    decp x0, p0.s
+; CHECK-NEXT:    ret
+  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %2 = tail call i64 @llvm.aarch64.sve.cntp.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %pg)
+  %sub = sub i64 %x, %2
+  ret i64 %sub
+}
+
+define i64 @cntp_sub_nxv2i1(i64 %x, <vscale x 2 x i1> %pg) #0 {
+; CHECK-LABEL: cntp_sub_nxv2i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    decp x0, p0.d
+; CHECK-NEXT:    ret
+  %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %2 = tail call i64 @llvm.aarch64.sve.cntp.nxv2i1(<vscale x 2 x i1> %1, <vscale x 2 x i1> %pg)
+  %sub = sub i64 %x, %2
+  ret i64 %sub
+}
+
+define i64 @cntp_sub_all_active_nxv8i1(i64 %x, <vscale x 8 x i1> %pg) #0 {
+; CHECK-LABEL: cntp_sub_all_active_nxv8i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    decp x0, p0.h
+; CHECK-NEXT:    ret
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %2 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %1)
+  %3 = tail call i64 @llvm.aarch64.sve.cntp.nxv8i1(<vscale x 8 x i1> %2, <vscale x 8 x i1> %pg)
+  %sub = sub i64 %x, %3
+  ret i64 %sub
+}
+
+define i64 @cntp_sub_nxv2i1_multiuse(i64 %x, <vscale x 2 x i1> %pg) #0 {
+; CHECK-LABEL: cntp_sub_nxv2i1_multiuse:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    cntp x8, p1, p0.d
+; CHECK-NEXT:    sub x9, x8, x0
+; CHECK-NEXT:    madd x0, x8, x0, x9
+; CHECK-NEXT:    ret
+  %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %2 = tail call i64 @llvm.aarch64.sve.cntp.nxv2i1(<vscale x 2 x i1> %1, <vscale x 2 x i1> %pg)
+  %sub = sub i64 %2, %x
+  %mul = mul i64 %2, %x
+  %res = add i64 %sub, %mul
+  ret i64 %res
+}
+
+
+declare <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1>)
+declare <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1>)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1>)
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32)
+declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32)
+
+declare i64 @llvm.aarch64.sve.cntp.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
+declare i64 @llvm.aarch64.sve.cntp.nxv8i1(<vscale x 8 x i1>, <vscale x 8 x i1>)
+declare i64 @llvm.aarch64.sve.cntp.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>)
+declare i64 @llvm.aarch64.sve.cntp.nxv2i1(<vscale x 2 x i1>, <vscale x 2 x i1>)
+
+attributes #0 = { "target-features"="+sve" }
