[llvm] 437a516 - [SME2/SVE2p1] Change psel intrinsic such that the result/first operand are not overloaded.

Sander de Smalen via llvm-commits llvm-commits at lists.llvm.org
Mon May 22 07:13:48 PDT 2023


Author: Sander de Smalen
Date: 2023-05-22T13:52:06Z
New Revision: 437a516da8055c28c89dead53c012f71a4331eb9

URL: https://github.com/llvm/llvm-project/commit/437a516da8055c28c89dead53c012f71a4331eb9
DIFF: https://github.com/llvm/llvm-project/commit/437a516da8055c28c89dead53c012f71a4331eb9.diff

LOG: [SME2/SVE2p1] Change psel intrinsic such that the result/first operand are not overloaded.

All the bits of the first operand are copied to the destination register,
if the tested bit (in the second source operand) is active. This means we
copy over all vscale x 16 x i1's of the first operand. There is no need to
overload that type.

Reviewed By: david-arm

Differential Revision: https://reviews.llvm.org/D150958

Added: 
    

Modified: 
    llvm/include/llvm/IR/IntrinsicsAArch64.td
    llvm/lib/Target/AArch64/SMEInstrFormats.td
    llvm/test/CodeGen/AArch64/sve2-intrinsics-psel.ll

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 2c3082c26049..d66505f1fa9d 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2792,9 +2792,9 @@ let TargetPrefix = "aarch64" in {
   //
 
   def int_aarch64_sve_psel
-      : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
-                              [LLVMMatchType<0>,
-                               LLVMMatchType<0>, llvm_i32_ty],
+      : DefaultAttrsIntrinsic<[llvm_nxv16i1_ty],
+                              [llvm_nxv16i1_ty,
+                               llvm_anyvector_ty, llvm_i32_ty],
                               [IntrNoMem]>;
 
   //

diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index ee311ccdf322..6e3aadd5dd8c 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -1309,30 +1309,30 @@ multiclass sve2_int_perm_sel_p<string asm, SDPatternOperator op> {
                   (!cast<Instruction>(NAME # _D) PNRAny:$Pd,
                       PNRAny:$Pn, PPR64:$Pm, MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_1:$imm), 0>;
 
-  def : Pat<(nxv16i1 (op (nxv16i1 PPRAny:$Pn), (nxv16i1 PPRAny:$Pm),
+  def : Pat<(nxv16i1 (op (nxv16i1 PPRAny:$Pn), (nxv16i1 PPR8:$Pm),
              MatrixIndexGPR32Op12_15:$idx)),
             (!cast<Instruction>(NAME # _B) $Pn, $Pm, $idx, 0)>;
-  def : Pat<(nxv8i1 (op (nxv8i1 PPRAny:$Pn), (nxv8i1 PPRAny:$Pm),
+  def : Pat<(nxv16i1 (op (nxv16i1 PPRAny:$Pn), (nxv8i1 PPR16:$Pm),
              MatrixIndexGPR32Op12_15:$idx)),
             (!cast<Instruction>(NAME # _H) $Pn, $Pm, $idx, 0)>;
-  def : Pat<(nxv4i1 (op (nxv4i1 PPRAny:$Pn), (nxv4i1 PPRAny:$Pm),
+  def : Pat<(nxv16i1 (op (nxv16i1 PPRAny:$Pn), (nxv4i1 PPR32:$Pm),
              MatrixIndexGPR32Op12_15:$idx)),
             (!cast<Instruction>(NAME # _S) $Pn, $Pm, $idx, 0)>;
-  def : Pat<(nxv2i1 (op (nxv2i1 PPRAny:$Pn), (nxv2i1 PPRAny:$Pm),
+  def : Pat<(nxv16i1 (op (nxv16i1 PPRAny:$Pn), (nxv2i1 PPR64:$Pm),
              MatrixIndexGPR32Op12_15:$idx)),
             (!cast<Instruction>(NAME # _D) $Pn, $Pm, $idx, 0)>;
 
   let AddedComplexity = 1 in {
-    def : Pat<(nxv16i1 (op (nxv16i1 PPRAny:$Pn), (nxv16i1 PPRAny:$Pm),
+    def : Pat<(nxv16i1 (op (nxv16i1 PPRAny:$Pn), (nxv16i1 PPR8:$Pm),
                (i32 (tileslice8 MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_15:$imm)))),
               (!cast<Instruction>(NAME # _B) $Pn, $Pm, $idx, $imm)>;
-    def : Pat<(nxv8i1 (op (nxv8i1 PPRAny:$Pn), (nxv8i1 PPRAny:$Pm),
+    def : Pat<(nxv16i1 (op (nxv16i1 PPRAny:$Pn), (nxv8i1 PPR16:$Pm),
                (i32 (tileslice16 MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_7:$imm)))),
               (!cast<Instruction>(NAME # _H) $Pn, $Pm, $idx, $imm)>;
-    def : Pat<(nxv4i1 (op (nxv4i1 PPRAny:$Pn), (nxv4i1 PPRAny:$Pm),
+    def : Pat<(nxv16i1 (op (nxv16i1 PPRAny:$Pn), (nxv4i1 PPR32:$Pm),
                (i32 (tileslice32 MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_3:$imm)))),
               (!cast<Instruction>(NAME # _S) $Pn, $Pm, $idx, $imm)>;
-    def : Pat<(nxv2i1 (op (nxv2i1 PPRAny:$Pn), (nxv2i1 PPRAny:$Pm),
+    def : Pat<(nxv16i1 (op (nxv16i1 PPRAny:$Pn), (nxv2i1 PPR64:$Pm),
                (i32 (tileslice64 MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_1:$imm)))),
               (!cast<Instruction>(NAME # _D) $Pn, $Pm, $idx, $imm)>;
   }

diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-psel.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-psel.ll
index 0588b45cd4d5..4ce5d61ea77d 100644
--- a/llvm/test/CodeGen/AArch64/sve2-intrinsics-psel.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-psel.ll
@@ -22,70 +22,70 @@ define <vscale x 16 x i1> @psel_b_imm(<vscale x 16 x i1> %p1, <vscale x 16 x i1>
   ret <vscale x 16 x i1> %res
 }
 
-define <vscale x 8 x i1> @psel_h(<vscale x 8 x i1> %p1, <vscale x 8 x i1> %p2, i32 %idx) {
+define <vscale x 16 x i1> @psel_h(<vscale x 16 x i1> %p1, <vscale x 8 x i1> %p2, i32 %idx) {
 ; CHECK-LABEL: psel_h:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    psel p0, p0, p1.h[w12, 0]
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x i1> @llvm.aarch64.sve.psel.nxv8i1(<vscale x 8 x i1> %p1, <vscale x 8 x i1> %p2, i32 %idx)
-  ret <vscale x 8 x i1> %res
+  %res = call <vscale x 16 x i1> @llvm.aarch64.sve.psel.nxv8i1(<vscale x 16 x i1> %p1, <vscale x 8 x i1> %p2, i32 %idx)
+  ret <vscale x 16 x i1> %res
 }
 
-define <vscale x 8 x i1> @psel_h_imm(<vscale x 8 x i1> %p1, <vscale x 8 x i1> %p2, i32 %idx) {
+define <vscale x 16 x i1> @psel_h_imm(<vscale x 16 x i1> %p1, <vscale x 8 x i1> %p2, i32 %idx) {
 ; CHECK-LABEL: psel_h_imm:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    psel p0, p0, p1.h[w12, 7]
 ; CHECK-NEXT:    ret
   %add = add i32 %idx, 7
-  %res = call <vscale x 8 x i1> @llvm.aarch64.sve.psel.nxv8i1(<vscale x 8 x i1> %p1, <vscale x 8 x i1> %p2, i32 %add)
-  ret <vscale x 8 x i1> %res
+  %res = call <vscale x 16 x i1> @llvm.aarch64.sve.psel.nxv8i1(<vscale x 16 x i1> %p1, <vscale x 8 x i1> %p2, i32 %add)
+  ret <vscale x 16 x i1> %res
 }
 
-define <vscale x 4 x i1> @psel_s(<vscale x 4 x i1> %p1, <vscale x 4 x i1> %p2, i32 %idx) {
+define <vscale x 16 x i1> @psel_s(<vscale x 16 x i1> %p1, <vscale x 4 x i1> %p2, i32 %idx) {
 ; CHECK-LABEL: psel_s:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    psel p0, p0, p1.s[w12, 0]
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x i1> @llvm.aarch64.sve.psel.nxv4i1(<vscale x 4 x i1> %p1, <vscale x 4 x i1> %p2, i32 %idx)
-  ret <vscale x 4 x i1> %res
+  %res = call <vscale x 16 x i1> @llvm.aarch64.sve.psel.nxv4i1(<vscale x 16 x i1> %p1, <vscale x 4 x i1> %p2, i32 %idx)
+  ret <vscale x 16 x i1> %res
 }
 
-define <vscale x 4 x i1> @psel_s_imm(<vscale x 4 x i1> %p1, <vscale x 4 x i1> %p2, i32 %idx) {
+define <vscale x 16 x i1> @psel_s_imm(<vscale x 16 x i1> %p1, <vscale x 4 x i1> %p2, i32 %idx) {
 ; CHECK-LABEL: psel_s_imm:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    psel p0, p0, p1.s[w12, 3]
 ; CHECK-NEXT:    ret
   %add = add i32 %idx, 3
-  %res = call <vscale x 4 x i1> @llvm.aarch64.sve.psel.nxv4i1(<vscale x 4 x i1> %p1, <vscale x 4 x i1> %p2, i32 %add)
-  ret <vscale x 4 x i1> %res
+  %res = call <vscale x 16 x i1> @llvm.aarch64.sve.psel.nxv4i1(<vscale x 16 x i1> %p1, <vscale x 4 x i1> %p2, i32 %add)
+  ret <vscale x 16 x i1> %res
 }
 
-define <vscale x 2 x i1> @psel_d(<vscale x 2 x i1> %p1, <vscale x 2 x i1> %p2, i32 %idx) {
+define <vscale x 16 x i1> @psel_d(<vscale x 16 x i1> %p1, <vscale x 2 x i1> %p2, i32 %idx) {
 ; CHECK-LABEL: psel_d:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    psel p0, p0, p1.d[w12, 0]
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x i1> @llvm.aarch64.sve.psel.nxv2i1(<vscale x 2 x i1> %p1, <vscale x 2 x i1> %p2, i32 %idx)
-  ret <vscale x 2 x i1> %res
+  %res = call <vscale x 16 x i1> @llvm.aarch64.sve.psel.nxv2i1(<vscale x 16 x i1> %p1, <vscale x 2 x i1> %p2, i32 %idx)
+  ret <vscale x 16 x i1> %res
 }
 
-define <vscale x 2 x i1> @psel_d_imm(<vscale x 2 x i1> %p1, <vscale x 2 x i1> %p2, i32 %idx) {
+define <vscale x 16 x i1> @psel_d_imm(<vscale x 16 x i1> %p1, <vscale x 2 x i1> %p2, i32 %idx) {
 ; CHECK-LABEL: psel_d_imm:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    psel p0, p0, p1.d[w12, 1]
 ; CHECK-NEXT:    ret
   %add = add i32 %idx, 1
-  %res = call <vscale x 2 x i1> @llvm.aarch64.sve.psel.nxv2i1(<vscale x 2 x i1> %p1, <vscale x 2 x i1> %p2, i32 %add)
-  ret <vscale x 2 x i1> %res
+  %res = call <vscale x 16 x i1> @llvm.aarch64.sve.psel.nxv2i1(<vscale x 16 x i1> %p1, <vscale x 2 x i1> %p2, i32 %add)
+  ret <vscale x 16 x i1> %res
 }
 
 declare <vscale x 16 x i1> @llvm.aarch64.sve.psel.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, i32)
-declare <vscale x 8 x i1>  @llvm.aarch64.sve.psel.nxv8i1(<vscale x 8 x i1>, <vscale x 8 x i1>, i32)
-declare <vscale x 4 x i1>  @llvm.aarch64.sve.psel.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>, i32)
-declare <vscale x 2 x i1>  @llvm.aarch64.sve.psel.nxv2i1(<vscale x 2 x i1>, <vscale x 2 x i1>, i32)
+declare <vscale x 16 x i1>  @llvm.aarch64.sve.psel.nxv8i1(<vscale x 16 x i1>, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i1>  @llvm.aarch64.sve.psel.nxv4i1(<vscale x 16 x i1>, <vscale x 4 x i1>, i32)
+declare <vscale x 16 x i1>  @llvm.aarch64.sve.psel.nxv2i1(<vscale x 16 x i1>, <vscale x 2 x i1>, i32)


        


More information about the llvm-commits mailing list