[llvm] 6333779 - [AArch64][SME2] Add multi-vector uunpk and sunpk intrinsics
David Sherwood via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 28 06:02:07 PST 2023
Author: David Sherwood
Date: 2023-02-28T14:01:38Z
New Revision: 6333779ac082498849e7a25eba5cdd2fd6b5a1fb
URL: https://github.com/llvm/llvm-project/commit/6333779ac082498849e7a25eba5cdd2fd6b5a1fb
DIFF: https://github.com/llvm/llvm-project/commit/6333779ac082498849e7a25eba5cdd2fd6b5a1fb.diff
LOG: [AArch64][SME2] Add multi-vector uunpk and sunpk intrinsics
This patch adds the LLVM IR intrinsics for the following:
* uunpk (2 and 4 vectors)
* sunpk (2 and 4 vectors)
I have named the tests sve2p1-intrinsics-* because although
the instructions are added as part of the SME2 feature they
only operate on SVE vectors.
NOTE: These intrinsics are still in development and are subject to future changes.
Differential Revision: https://reviews.llvm.org/D142964
Added:
llvm/test/CodeGen/AArch64/sve2p1-intrinsics-unpk.ll
Modified:
llvm/include/llvm/IR/IntrinsicsAArch64.td
llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
Removed:
################################################################################
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 418ef069d31ac..eedc4923b3826 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2987,6 +2987,15 @@ let TargetPrefix = "aarch64" in {
[LLVMMatchType<0>, LLVMMatchType<0>,
LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+ class SME2_VG2_Unpk_Intrinsic
+ : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
+ [LLVMSubdivide2VectorType<0>], [IntrNoMem]>;
+
+ class SME2_VG4_Unpk_Intrinsic
+ : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
+ LLVMMatchType<0>, LLVMMatchType<0>],
+ [LLVMSubdivide2VectorType<0>, LLVMSubdivide2VectorType<0>],
+ [IntrNoMem]>;
//
// Multi-vector fused multiply-add/subtract
@@ -3320,4 +3329,12 @@ let TargetPrefix = "aarch64" in {
def int_aarch64_sve_sdot_lane_x2 : SVE2_3VectorArgIndexed_Long_Intrinsic;
def int_aarch64_sve_udot_lane_x2 : SVE2_3VectorArgIndexed_Long_Intrinsic;
def int_aarch64_sve_fdot_lane_x2 : SVE2_3VectorArgIndexed_Long_Intrinsic;
+
+ //
+ // Signed/unsigned multi-vector unpacks
+ //
+ def int_aarch64_sve_sunpk_x2 : SME2_VG2_Unpk_Intrinsic;
+ def int_aarch64_sve_uunpk_x2 : SME2_VG2_Unpk_Intrinsic;
+ def int_aarch64_sve_sunpk_x4 : SME2_VG4_Unpk_Intrinsic;
+ def int_aarch64_sve_uunpk_x4 : SME2_VG4_Unpk_Intrinsic;
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 87a3d0a2eadc8..7fed41882f15e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -5468,6 +5468,34 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
case Intrinsic::aarch64_sve_frintp_x4:
SelectFrintFromVT(Node, 4, AArch64::FRINTP_4Z4Z_S);
return;
+ case Intrinsic::aarch64_sve_sunpk_x2:
+ if (auto Op = SelectOpcodeFromVT<SelectTypeKind::Int>(
+ Node->getValueType(0),
+ {0, AArch64::SUNPK_VG2_2ZZ_H, AArch64::SUNPK_VG2_2ZZ_S,
+ AArch64::SUNPK_VG2_2ZZ_D}))
+ SelectUnaryMultiIntrinsic(Node, 2, /*IsTupleInput=*/false, Op);
+ return;
+ case Intrinsic::aarch64_sve_uunpk_x2:
+ if (auto Op = SelectOpcodeFromVT<SelectTypeKind::Int>(
+ Node->getValueType(0),
+ {0, AArch64::UUNPK_VG2_2ZZ_H, AArch64::UUNPK_VG2_2ZZ_S,
+ AArch64::UUNPK_VG2_2ZZ_D}))
+ SelectUnaryMultiIntrinsic(Node, 2, /*IsTupleInput=*/false, Op);
+ return;
+ case Intrinsic::aarch64_sve_sunpk_x4:
+ if (auto Op = SelectOpcodeFromVT<SelectTypeKind::Int>(
+ Node->getValueType(0),
+ {0, AArch64::SUNPK_VG4_4Z2Z_H, AArch64::SUNPK_VG4_4Z2Z_S,
+ AArch64::SUNPK_VG4_4Z2Z_D}))
+ SelectUnaryMultiIntrinsic(Node, 4, /*IsTupleInput=*/true, Op);
+ return;
+ case Intrinsic::aarch64_sve_uunpk_x4:
+ if (auto Op = SelectOpcodeFromVT<SelectTypeKind::Int>(
+ Node->getValueType(0),
+ {0, AArch64::UUNPK_VG4_4Z2Z_H, AArch64::UUNPK_VG4_4Z2Z_S,
+ AArch64::UUNPK_VG4_4Z2Z_D}))
+ SelectUnaryMultiIntrinsic(Node, 4, /*IsTupleInput=*/true, Op);
+ return;
}
break;
}
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-unpk.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-unpk.ll
new file mode 100644
index 0000000000000..8334bbe0cff30
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-unpk.ll
@@ -0,0 +1,146 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s
+
+
+; == 2 vectors ==
+
+define { <vscale x 8 x i16>, <vscale x 8 x i16> } @test_unpk_s16_x2(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %a) {
+; CHECK-LABEL: test_unpk_s16_x2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sunpk { z0.h, z1.h }, z1.b
+; CHECK-NEXT: ret
+ %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.sunpk.x2.nxv8i16(<vscale x 16 x i8> %a)
+ ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res
+}
+
+define { <vscale x 4 x i32>, <vscale x 4 x i32> } @test_unpk_s32_x2(<vscale x 8 x i16> %unused, <vscale x 8 x i16> %a) {
+; CHECK-LABEL: test_unpk_s32_x2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sunpk { z0.s, z1.s }, z1.h
+; CHECK-NEXT: ret
+ %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.sunpk.x2.nxv4i32(<vscale x 8 x i16> %a)
+ ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res
+}
+
+define { <vscale x 2 x i64>, <vscale x 2 x i64> } @test_unpk_s64_x2(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %a) {
+; CHECK-LABEL: test_unpk_s64_x2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sunpk { z0.d, z1.d }, z1.s
+; CHECK-NEXT: ret
+ %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.sunpk.x2.nxv2i64(<vscale x 4 x i32> %a)
+ ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res
+}
+
+define { <vscale x 8 x i16>, <vscale x 8 x i16> } @test_unpk_u16_x2(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %a) {
+; CHECK-LABEL: test_unpk_u16_x2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpk { z0.h, z1.h }, z1.b
+; CHECK-NEXT: ret
+ %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.uunpk.x2.nxv8i16(<vscale x 16 x i8> %a)
+ ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res
+}
+
+define { <vscale x 4 x i32>, <vscale x 4 x i32> } @test_unpk_u32_x2(<vscale x 8 x i16> %unused, <vscale x 8 x i16> %a) {
+; CHECK-LABEL: test_unpk_u32_x2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpk { z0.s, z1.s }, z1.h
+; CHECK-NEXT: ret
+ %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.uunpk.x2.nxv4i32(<vscale x 8 x i16> %a)
+ ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res
+}
+
+define { <vscale x 2 x i64>, <vscale x 2 x i64> } @test_unpk_u64_x2(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %a) {
+; CHECK-LABEL: test_unpk_u64_x2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpk { z0.d, z1.d }, z1.s
+; CHECK-NEXT: ret
+ %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.uunpk.x2.nxv2i64(<vscale x 4 x i32> %a)
+ ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res
+}
+
+
+; == 4 vectors ==
+
+define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @test_unpk_s16_x4(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: test_unpk_s16_x4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z3.d, z2.d
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: sunpk { z0.h - z3.h }, { z2.b, z3.b }
+; CHECK-NEXT: ret
+ %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.sunpk.x4.nxv8i16(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
+ ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
+}
+
+define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @test_unpk_s32_x4(<vscale x 8 x i16> %unused, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: test_unpk_s32_x4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z3.d, z2.d
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: sunpk { z0.s - z3.s }, { z2.h, z3.h }
+; CHECK-NEXT: ret
+ %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.sunpk.x4.nxv4i32(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
+ ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
+}
+
+define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @test_unpk_s64_x4(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_unpk_s64_x4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z3.d, z2.d
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: sunpk { z0.d - z3.d }, { z2.s, z3.s }
+; CHECK-NEXT: ret
+ %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.sunpk.x4.nxv2i64(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
+ ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
+}
+
+define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @test_unpk_u16_x4(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: test_unpk_u16_x4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z3.d, z2.d
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: uunpk { z0.h - z3.h }, { z2.b, z3.b }
+; CHECK-NEXT: ret
+ %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.uunpk.x4.nxv8i16(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
+ ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
+}
+
+define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @test_unpk_u32_x4(<vscale x 8 x i16> %unused, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: test_unpk_u32_x4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z3.d, z2.d
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: uunpk { z0.s - z3.s }, { z2.h, z3.h }
+; CHECK-NEXT: ret
+ %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.uunpk.x4.nxv4i32(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
+ ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
+}
+
+define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @test_unpk_u64_x4(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_unpk_u64_x4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z3.d, z2.d
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: uunpk { z0.d - z3.d }, { z2.s, z3.s }
+; CHECK-NEXT: ret
+ %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.uunpk.x4.nxv2i64(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
+ ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
+}
+
+
+
+; == 2 vectors ==
+declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.sunpk.x2.nxv8i16(<vscale x 16 x i8>)
+declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.sunpk.x2.nxv4i32(<vscale x 8 x i16>)
+declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.sunpk.x2.nxv2i64(<vscale x 4 x i32>)
+declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.uunpk.x2.nxv8i16(<vscale x 16 x i8>)
+declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.uunpk.x2.nxv4i32(<vscale x 8 x i16>)
+declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.uunpk.x2.nxv2i64(<vscale x 4 x i32>)
+
+; == 4 vectors ==
+declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.sunpk.x4.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>)
+declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.sunpk.x4.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>)
+declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.sunpk.x4.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.uunpk.x4.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>)
+declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.uunpk.x4.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>)
+declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.uunpk.x4.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>)
More information about the llvm-commits
mailing list