[llvm] bcda4c4 - [SVE] By using SEL when orring predicates we forgo the need for a PTRUE.

Mon Jan 31 11:41:31 PST 2022

Author: Paul Walker
Date: 2022-01-31T19:39:23Z
New Revision: bcda4c48c811d44e2f5e6061e09d06d9e46a2c68

URL: https://github.com/llvm/llvm-project/commit/bcda4c48c811d44e2f5e6061e09d06d9e46a2c68
DIFF: https://github.com/llvm/llvm-project/commit/bcda4c48c811d44e2f5e6061e09d06d9e46a2c68.diff

LOG: [SVE] By using SEL when orring predicates we forgo the need for a PTRUE.

Differential Revision: https://reviews.llvm.org/D118463

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
    llvm/lib/Target/AArch64/SVEInstrFormats.td
    llvm/test/CodeGen/AArch64/sve-int-log.ll
    llvm/test/CodeGen/AArch64/sve-smulo-sdnode.ll
    llvm/test/CodeGen/AArch64/sve-split-int-pred-reduce.ll
    llvm/test/CodeGen/AArch64/sve-umulo-sdnode.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index ea0a88e5c591..1d162610de9c 100644

--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -740,14 +740,14 @@ let Predicates = [HasSVEorStreamingSVE] in {
   defm PFIRST  : sve_int_pfirst<0b00000, "pfirst", int_aarch64_sve_pfirst>;
   defm PNEXT   : sve_int_pnext<0b00110, "pnext", int_aarch64_sve_pnext>;
 
-  defm AND_PPzPP   : sve_int_pred_log_and<0b0000, "and", int_aarch64_sve_and_z, and>;
-  defm BIC_PPzPP   : sve_int_pred_log_and<0b0001, "bic", int_aarch64_sve_bic_z, AArch64bic>;
+  defm AND_PPzPP   : sve_int_pred_log_v2<0b0000, "and", int_aarch64_sve_and_z, and>;
+  defm BIC_PPzPP   : sve_int_pred_log_v2<0b0001, "bic", int_aarch64_sve_bic_z, AArch64bic>;
   defm EOR_PPzPP   : sve_int_pred_log<0b0010, "eor", int_aarch64_sve_eor_z, xor>;
-  defm SEL_PPPP    : sve_int_pred_log<0b0011, "sel", vselect>;
+  defm SEL_PPPP    : sve_int_pred_log_v2<0b0011, "sel", vselect, or>;
   defm ANDS_PPzPP  : sve_int_pred_log<0b0100, "ands", null_frag>;
   defm BICS_PPzPP  : sve_int_pred_log<0b0101, "bics", null_frag>;
   defm EORS_PPzPP  : sve_int_pred_log<0b0110, "eors", null_frag>;
-  defm ORR_PPzPP   : sve_int_pred_log<0b1000, "orr", int_aarch64_sve_orr_z, or>;
+  defm ORR_PPzPP   : sve_int_pred_log<0b1000, "orr", int_aarch64_sve_orr_z>;
   defm ORN_PPzPP   : sve_int_pred_log<0b1001, "orn", int_aarch64_sve_orn_z>;
   defm NOR_PPzPP   : sve_int_pred_log<0b1010, "nor", int_aarch64_sve_nor_z>;
   defm NAND_PPzPP  : sve_int_pred_log<0b1011, "nand", int_aarch64_sve_nand_z>;

diff  --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 9febe7add771..9d4bdbe5d053 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -1641,8 +1641,10 @@ multiclass sve_int_pred_log<bits<4> opc, string asm, SDPatternOperator op,
                                !cast<Instruction>(NAME), PTRUE_D>;
 }
 
-multiclass sve_int_pred_log_and<bits<4> opc, string asm, SDPatternOperator op,
-                                SDPatternOperator op_nopred> :
+// An instance of sve_int_pred_log_and but uses op_nopred's first operand as the
+// general predicate.
+multiclass sve_int_pred_log_v2<bits<4> opc, string asm, SDPatternOperator op,
+                               SDPatternOperator op_nopred> :
   sve_int_pred_log<opc, asm, op> {
   def : Pat<(nxv16i1 (op_nopred nxv16i1:$Op1, nxv16i1:$Op2)),
             (!cast<Instruction>(NAME) $Op1, $Op1, $Op2)>;

diff  --git a/llvm/test/CodeGen/AArch64/sve-int-log.ll b/llvm/test/CodeGen/AArch64/sve-int-log.ll
index c445c98c6106..2c5ebed00b06 100644
--- a/llvm/test/CodeGen/AArch64/sve-int-log.ll
+++ b/llvm/test/CodeGen/AArch64/sve-int-log.ll
@@ -217,8 +217,7 @@ define <vscale x 16 x i8> @or_b_zero(<vscale x 16 x i8> %a) {
 define <vscale x 2 x i1> @or_pred_d(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b) {
 ; CHECK-LABEL: or_pred_d:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p2.d
-; CHECK-NEXT:    orr p0.b, p2/z, p0.b, p1.b
+; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
 ; CHECK-NEXT:    ret
   %res = or <vscale x 2 x i1> %a, %b
   ret <vscale x 2 x i1> %res
@@ -227,8 +226,7 @@ define <vscale x 2 x i1> @or_pred_d(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b)
 define <vscale x 4 x i1> @or_pred_s(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b) {
 ; CHECK-LABEL: or_pred_s:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p2.s
-; CHECK-NEXT:    orr p0.b, p2/z, p0.b, p1.b
+; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
 ; CHECK-NEXT:    ret
   %res = or <vscale x 4 x i1> %a, %b
   ret <vscale x 4 x i1> %res
@@ -237,8 +235,7 @@ define <vscale x 4 x i1> @or_pred_s(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b)
 define <vscale x 8 x i1> @or_pred_h(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b) {
 ; CHECK-LABEL: or_pred_h:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p2.h
-; CHECK-NEXT:    orr p0.b, p2/z, p0.b, p1.b
+; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
 ; CHECK-NEXT:    ret
   %res = or <vscale x 8 x i1> %a, %b
   ret <vscale x 8 x i1> %res
@@ -247,8 +244,7 @@ define <vscale x 8 x i1> @or_pred_h(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b)
 define <vscale x 16 x i1> @or_pred_b(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b) {
 ; CHECK-LABEL: or_pred_b:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p2.b
-; CHECK-NEXT:    orr p0.b, p2/z, p0.b, p1.b
+; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
 ; CHECK-NEXT:    ret
   %res = or <vscale x 16 x i1> %a, %b
   ret <vscale x 16 x i1> %res

diff  --git a/llvm/test/CodeGen/AArch64/sve-smulo-sdnode.ll b/llvm/test/CodeGen/AArch64/sve-smulo-sdnode.ll
index 82a3059251f9..f89ec1d5b91f 100644
--- a/llvm/test/CodeGen/AArch64/sve-smulo-sdnode.ll
+++ b/llvm/test/CodeGen/AArch64/sve-smulo-sdnode.ll
@@ -16,8 +16,8 @@ define <vscale x 2 x i8> @smulo_nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i8> %
 ; CHECK-NEXT:    movprfx z3, z0
 ; CHECK-NEXT:    sxtb z3.d, p0/m, z0.d
 ; CHECK-NEXT:    cmpne p1.d, p0/z, z2.d, z1.d
-; CHECK-NEXT:    cmpne p2.d, p0/z, z3.d, z0.d
-; CHECK-NEXT:    orr p0.b, p0/z, p2.b, p1.b
+; CHECK-NEXT:    cmpne p0.d, p0/z, z3.d, z0.d
+; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
 ; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 2 x i8>, <vscale x 2 x i1> } @llvm.smul.with.overflow.nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y)
@@ -42,8 +42,8 @@ define <vscale x 4 x i8> @smulo_nxv4i8(<vscale x 4 x i8> %x, <vscale x 4 x i8> %
 ; CHECK-NEXT:    movprfx z3, z0
 ; CHECK-NEXT:    sxtb z3.s, p0/m, z0.s
 ; CHECK-NEXT:    cmpne p1.s, p0/z, z2.s, z1.s
-; CHECK-NEXT:    cmpne p2.s, p0/z, z3.s, z0.s
-; CHECK-NEXT:    orr p0.b, p0/z, p2.b, p1.b
+; CHECK-NEXT:    cmpne p0.s, p0/z, z3.s, z0.s
+; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
 ; CHECK-NEXT:    mov z0.s, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 4 x i8>, <vscale x 4 x i1> } @llvm.smul.with.overflow.nxv4i8(<vscale x 4 x i8> %x, <vscale x 4 x i8> %y)
@@ -68,8 +68,8 @@ define <vscale x 8 x i8> @smulo_nxv8i8(<vscale x 8 x i8> %x, <vscale x 8 x i8> %
 ; CHECK-NEXT:    movprfx z3, z0
 ; CHECK-NEXT:    sxtb z3.h, p0/m, z0.h
 ; CHECK-NEXT:    cmpne p1.h, p0/z, z2.h, z1.h
-; CHECK-NEXT:    cmpne p2.h, p0/z, z3.h, z0.h
-; CHECK-NEXT:    orr p0.b, p0/z, p2.b, p1.b
+; CHECK-NEXT:    cmpne p0.h, p0/z, z3.h, z0.h
+; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
 ; CHECK-NEXT:    mov z0.h, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 8 x i8>, <vscale x 8 x i1> } @llvm.smul.with.overflow.nxv8i8(<vscale x 8 x i8> %x, <vscale x 8 x i8> %y)
@@ -182,8 +182,8 @@ define <vscale x 2 x i16> @smulo_nxv2i16(<vscale x 2 x i16> %x, <vscale x 2 x i1
 ; CHECK-NEXT:    movprfx z3, z0
 ; CHECK-NEXT:    sxth z3.d, p0/m, z0.d
 ; CHECK-NEXT:    cmpne p1.d, p0/z, z2.d, z1.d
-; CHECK-NEXT:    cmpne p2.d, p0/z, z3.d, z0.d
-; CHECK-NEXT:    orr p0.b, p0/z, p2.b, p1.b
+; CHECK-NEXT:    cmpne p0.d, p0/z, z3.d, z0.d
+; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
 ; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 2 x i16>, <vscale x 2 x i1> } @llvm.smul.with.overflow.nxv2i16(<vscale x 2 x i16> %x, <vscale x 2 x i16> %y)
@@ -208,8 +208,8 @@ define <vscale x 4 x i16> @smulo_nxv4i16(<vscale x 4 x i16> %x, <vscale x 4 x i1
 ; CHECK-NEXT:    movprfx z3, z0
 ; CHECK-NEXT:    sxth z3.s, p0/m, z0.s
 ; CHECK-NEXT:    cmpne p1.s, p0/z, z2.s, z1.s
-; CHECK-NEXT:    cmpne p2.s, p0/z, z3.s, z0.s
-; CHECK-NEXT:    orr p0.b, p0/z, p2.b, p1.b
+; CHECK-NEXT:    cmpne p0.s, p0/z, z3.s, z0.s
+; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
 ; CHECK-NEXT:    mov z0.s, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 4 x i16>, <vscale x 4 x i1> } @llvm.smul.with.overflow.nxv4i16(<vscale x 4 x i16> %x, <vscale x 4 x i16> %y)
@@ -322,8 +322,8 @@ define <vscale x 2 x i32> @smulo_nxv2i32(<vscale x 2 x i32> %x, <vscale x 2 x i3
 ; CHECK-NEXT:    movprfx z3, z0
 ; CHECK-NEXT:    sxtw z3.d, p0/m, z0.d
 ; CHECK-NEXT:    cmpne p1.d, p0/z, z2.d, z1.d
-; CHECK-NEXT:    cmpne p2.d, p0/z, z3.d, z0.d
-; CHECK-NEXT:    orr p0.b, p0/z, p2.b, p1.b
+; CHECK-NEXT:    cmpne p0.d, p0/z, z3.d, z0.d
+; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
 ; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 2 x i32>, <vscale x 2 x i1> } @llvm.smul.with.overflow.nxv2i32(<vscale x 2 x i32> %x, <vscale x 2 x i32> %y)

diff  --git a/llvm/test/CodeGen/AArch64/sve-split-int-pred-reduce.ll b/llvm/test/CodeGen/AArch64/sve-split-int-pred-reduce.ll
index d24c540a1802..e0d235dfd5b4 100644
--- a/llvm/test/CodeGen/AArch64/sve-split-int-pred-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-split-int-pred-reduce.ll
@@ -44,8 +44,7 @@ define i1 @andv_nxv64i1(<vscale x 64 x i1> %a) {
 define i1 @orv_nxv32i1(<vscale x 32 x i1> %a) {
 ; CHECK-LABEL: orv_nxv32i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p2.b
-; CHECK-NEXT:    orr p0.b, p2/z, p0.b, p1.b
+; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
 ; CHECK-NEXT:    ptest p0, p0.b
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
@@ -87,8 +86,7 @@ define i1 @smaxv_nxv32i1(<vscale x 32 x i1> %a) {
 define i1 @sminv_nxv32i1(<vscale x 32 x i1> %a) {
 ; CHECK-LABEL: sminv_nxv32i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p2.b
-; CHECK-NEXT:    orr p0.b, p2/z, p0.b, p1.b
+; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
 ; CHECK-NEXT:    ptest p0, p0.b
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
@@ -101,8 +99,7 @@ define i1 @sminv_nxv32i1(<vscale x 32 x i1> %a) {
 define i1 @umaxv_nxv32i1(<vscale x 32 x i1> %a) {
 ; CHECK-LABEL: umaxv_nxv32i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p2.b
-; CHECK-NEXT:    orr p0.b, p2/z, p0.b, p1.b
+; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
 ; CHECK-NEXT:    ptest p0, p0.b
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret

diff  --git a/llvm/test/CodeGen/AArch64/sve-umulo-sdnode.ll b/llvm/test/CodeGen/AArch64/sve-umulo-sdnode.ll
index 31b9b2a9929e..377031ba6b20 100644
--- a/llvm/test/CodeGen/AArch64/sve-umulo-sdnode.ll
+++ b/llvm/test/CodeGen/AArch64/sve-umulo-sdnode.ll
@@ -14,8 +14,8 @@ define <vscale x 2 x i8> @umulo_nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i8> %
 ; CHECK-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    lsr z1.d, z2.d, #8
 ; CHECK-NEXT:    cmpne p1.d, p0/z, z0.d, #0
-; CHECK-NEXT:    cmpne p2.d, p0/z, z1.d, #0
-; CHECK-NEXT:    orr p0.b, p0/z, p2.b, p1.b
+; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
+; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
 ; CHECK-NEXT:    mov z2.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    mov z0.d, z2.d
 ; CHECK-NEXT:    ret
@@ -39,8 +39,8 @@ define <vscale x 4 x i8> @umulo_nxv4i8(<vscale x 4 x i8> %x, <vscale x 4 x i8> %
 ; CHECK-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    lsr z1.s, z2.s, #8
 ; CHECK-NEXT:    cmpne p1.s, p0/z, z0.s, #0
-; CHECK-NEXT:    cmpne p2.s, p0/z, z1.s, #0
-; CHECK-NEXT:    orr p0.b, p0/z, p2.b, p1.b
+; CHECK-NEXT:    cmpne p0.s, p0/z, z1.s, #0
+; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
 ; CHECK-NEXT:    mov z2.s, p0/m, #0 // =0x0
 ; CHECK-NEXT:    mov z0.d, z2.d
 ; CHECK-NEXT:    ret
@@ -64,8 +64,8 @@ define <vscale x 8 x i8> @umulo_nxv8i8(<vscale x 8 x i8> %x, <vscale x 8 x i8> %
 ; CHECK-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    lsr z1.h, z2.h, #8
 ; CHECK-NEXT:    cmpne p1.h, p0/z, z0.h, #0
-; CHECK-NEXT:    cmpne p2.h, p0/z, z1.h, #0
-; CHECK-NEXT:    orr p0.b, p0/z, p2.b, p1.b
+; CHECK-NEXT:    cmpne p0.h, p0/z, z1.h, #0
+; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
 ; CHECK-NEXT:    mov z2.h, p0/m, #0 // =0x0
 ; CHECK-NEXT:    mov z0.d, z2.d
 ; CHECK-NEXT:    ret
@@ -169,8 +169,8 @@ define <vscale x 2 x i16> @umulo_nxv2i16(<vscale x 2 x i16> %x, <vscale x 2 x i1
 ; CHECK-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    lsr z1.d, z2.d, #16
 ; CHECK-NEXT:    cmpne p1.d, p0/z, z0.d, #0
-; CHECK-NEXT:    cmpne p2.d, p0/z, z1.d, #0
-; CHECK-NEXT:    orr p0.b, p0/z, p2.b, p1.b
+; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
+; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
 ; CHECK-NEXT:    mov z2.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    mov z0.d, z2.d
 ; CHECK-NEXT:    ret
@@ -194,8 +194,8 @@ define <vscale x 4 x i16> @umulo_nxv4i16(<vscale x 4 x i16> %x, <vscale x 4 x i1
 ; CHECK-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    lsr z1.s, z2.s, #16
 ; CHECK-NEXT:    cmpne p1.s, p0/z, z0.s, #0
-; CHECK-NEXT:    cmpne p2.s, p0/z, z1.s, #0
-; CHECK-NEXT:    orr p0.b, p0/z, p2.b, p1.b
+; CHECK-NEXT:    cmpne p0.s, p0/z, z1.s, #0
+; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
 ; CHECK-NEXT:    mov z2.s, p0/m, #0 // =0x0
 ; CHECK-NEXT:    mov z0.d, z2.d
 ; CHECK-NEXT:    ret
@@ -299,8 +299,8 @@ define <vscale x 2 x i32> @umulo_nxv2i32(<vscale x 2 x i32> %x, <vscale x 2 x i3
 ; CHECK-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    lsr z1.d, z2.d, #32
 ; CHECK-NEXT:    cmpne p1.d, p0/z, z0.d, #0
-; CHECK-NEXT:    cmpne p2.d, p0/z, z1.d, #0
-; CHECK-NEXT:    orr p0.b, p0/z, p2.b, p1.b
+; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
+; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
 ; CHECK-NEXT:    mov z2.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    mov z0.d, z2.d
 ; CHECK-NEXT:    ret