[llvm] 72a08c0 - [VP] Vector predicated vector splice intrinsic

Wed Sep 29 01:44:28 PDT 2021

Author: Simon Moll
Date: 2021-09-29T10:43:36+02:00
New Revision: 72a08c0b940450424b4c5d5babfb374df14ef471

URL: https://github.com/llvm/llvm-project/commit/72a08c0b940450424b4c5d5babfb374df14ef471
DIFF: https://github.com/llvm/llvm-project/commit/72a08c0b940450424b4c5d5babfb374df14ef471.diff

LOG: [VP] Vector predicated vector splice intrinsic

This patch introduces the vector-predicated version of the
experimental_vector_splice intrinsic [1] at the IR level. It considers
the active vector length for both vectors and and uses a vector mask to
disable certain lanes in the result.

[1] https://reviews.llvm.org/D94708

Change originally authored by Vineet Kumar <vineet.kumar at bsc.es>

Reviewed By: simoll

Differential Revision: https://reviews.llvm.org/D103898

Added: 
    

Modified: 
    llvm/docs/LangRef.rst
    llvm/include/llvm/IR/Intrinsics.td
    llvm/include/llvm/IR/VPIntrinsics.def
    llvm/test/Verifier/vp-intrinsics.ll
    llvm/unittests/IR/VPIntrinsicTest.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index a40a1618efe28..51558ddb7ff85 100644

--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -19541,6 +19541,64 @@ Examples:
       %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %3, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
 
 
+.. _int_experimental_vp_splice:
+
+'``llvm.experimental.vp.splice``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <2 x double> @llvm.experimental.vp.splice.v2f64(<2 x double> %vec1, <2 x double> %vec2, i32 %imm, <2 x i1> %mask, i32 %evl1, i32 %evl2)
+      declare <vscale x 4 x i32> @llvm.experimental.vp.splice.nxv4i32(<vscale x 4 x i32> %vec1, <vscale x 4 x i32> %vec2, i32 %imm, <2 x i1> %mask i32 %evl1, i32 %evl2)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vp.splice.*``' intrinsic is the vector length
+predicated version of the '``llvm.experimental.vector.splice.*``' intrinsic.
+
+Arguments:
+""""""""""
+
+The result and the first two arguments ``vec1`` and ``vec2`` are vectors with
+the same type.  The third argument ``imm`` is an immediate signed integer that
+indicates the offset index.  The fourth argument ``mask`` is a vector mask and
+has the same number of elements as the result.  The last two arguments ``evl1``
+and ``evl2`` are unsigned integers indicating the explicit vector lengths of
+``vec1`` and ``vec2`` respectively.  ``imm``, ``evl1`` and ``evl2`` should
+respect the following constraints: ``-evl1 <= imm < evl1``, ``0 <= evl1 <= VL``
+and ``0 <= evl2 <= VL``, where ``VL`` is the runtime vector factor. If these
+constraints are not satisfied the intrinsic has undefined behaviour.
+
+Semantics:
+""""""""""
+
+Effectively, this intrinsic concatenates ``vec1[0..evl1-1]`` and
+``vec2[0..evl2-1]`` and creates the result vector by selecting the elements in a
+window of size ``evl2``, starting at index ``imm`` (for a positive immediate) of
+the concatenated vector. Elements in the result vector beyond ``evl2`` are
+``undef``.  If ``imm`` is negative the starting index is ``evl1 + imm``.  The result
+vector of active vector length ``evl2`` contains ``evl1 - imm`` (``-imm`` for
+negative ``imm``) elements from indices ``[imm..evl1 - 1]``
+(``[evl1 + imm..evl1 -1]`` for negative ``imm``) of ``vec1`` followed by the
+first ``evl2 - (evl1 - imm)`` (``evl2 + imm`` for negative ``imm``) elements of
+``vec2``. If ``evl1 - imm`` (``-imm``) >= ``evl2``, only the first ``evl2``
+elements are considered and the remaining are ``undef``.  The lanes in the result
+vector disabled by ``mask`` are ``undef``.
+
+Examples:
+"""""""""
+
+.. code-block:: text
+
+ llvm.experimental.vp.splice(<A,B,C,D>, <E,F,G,H>, 1, 2, 3)  ==> <B, E, F, undef> ; index
+ llvm.experimental.vp.splice(<A,B,C,D>, <E,F,G,H>, -2, 3, 2) ==> <B, C, undef, undef> ; trailing elements
+
+
 .. _int_mload_mstore:
 
 Masked Vector Load and Store Intrinsics

diff  --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 5032c7c374014..9c51a2f2b7ea3 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1580,6 +1580,15 @@ def int_get_active_lane_mask:
             [llvm_anyint_ty, LLVMMatchType<1>],
             [IntrNoMem, IntrNoSync, IntrWillReturn]>;
 
+def int_experimental_vp_splice:
+  DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+            [LLVMMatchType<0>,
+             LLVMMatchType<0>,
+             llvm_i32_ty,
+             LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+             llvm_i32_ty, llvm_i32_ty],
+            [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+
 //===-------------------------- Masked Intrinsics -------------------------===//
 //
 def int_masked_load:

diff  --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def
index 5248c0993dd0e..361d6357b303b 100644
--- a/llvm/include/llvm/IR/VPIntrinsics.def
+++ b/llvm/include/llvm/IR/VPIntrinsics.def
@@ -356,6 +356,10 @@ BEGIN_REGISTER_VP_INTRINSIC(vp_select, 0, 3)
 // END_REGISTER_CASES(vp_select, VP_SELECT)
 END_REGISTER_VP_INTRINSIC(vp_select)
 
+BEGIN_REGISTER_VP(experimental_vp_splice, 3, 5,
+                  EXPERIMENTAL_VP_SPLICE, -1)
+END_REGISTER_VP(experimental_vp_splice, EXPERIMENTAL_VP_SPLICE)
+
 ///// } Shuffles
 
 #undef BEGIN_REGISTER_VP

diff  --git a/llvm/test/Verifier/vp-intrinsics.ll b/llvm/test/Verifier/vp-intrinsics.ll
index c77ba08321f94..cd43da0d182bf 100644
--- a/llvm/test/Verifier/vp-intrinsics.ll
+++ b/llvm/test/Verifier/vp-intrinsics.ll
@@ -47,6 +47,16 @@ define void @test_vp_reduction(i32 %x, <8 x i32> %vi, <8 x float> %vf, float %f,
   ret void
 }
 
+define void @test_vp_splice0(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %l0, i32 %l1) {
+  %r0 = call <8 x i32> @llvm.experimental.vp.splice.v8i32(<8 x i32> %i0, <8 x i32> %i1, i32 2, <8 x i1> %m, i32 %l0, i32 %l1)
+  ret void
+}
+
+define void @test_vp_splice1(<vscale x 8 x i32> %i0, <vscale x 8 x i32> %i1, <vscale x 8 x i1> %m, i32 %l0, i32 %l1) {
+  %r0 = call <vscale x 8 x i32> @llvm.experimental.vp.splice.nxv8i32(<vscale x 8 x i32> %i0, <vscale x 8 x i32> %i1, i32 -1, <vscale x 8 x i1> %m, i32 %l0, i32 %l1)
+  ret void
+}
+
 ; integer arith
 declare <8 x i32> @llvm.vp.add.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
 declare <8 x i32> @llvm.vp.sub.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
@@ -82,3 +92,6 @@ declare float @llvm.vp.reduce.fmin.v8f32(float, <8 x float>, <8 x i1>, i32)
 declare float @llvm.vp.reduce.fmax.v8f32(float, <8 x float>, <8 x i1>, i32)
 declare float @llvm.vp.reduce.fadd.v8f32(float, <8 x float>, <8 x i1>, i32)
 declare float @llvm.vp.reduce.fmul.v8f32(float, <8 x float>, <8 x i1>, i32)
+; shuffles
+declare <8 x i32> @llvm.experimental.vp.splice.v8i32(<8 x i32>, <8 x i32>, i32, <8 x i1>, i32, i32)
+declare <vscale x 8 x i32> @llvm.experimental.vp.splice.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, i32, <vscale x 8 x i1>, i32, i32)

diff  --git a/llvm/unittests/IR/VPIntrinsicTest.cpp b/llvm/unittests/IR/VPIntrinsicTest.cpp
index 32a0726b5caf9..4a0a1f330e661 100644
--- a/llvm/unittests/IR/VPIntrinsicTest.cpp
+++ b/llvm/unittests/IR/VPIntrinsicTest.cpp
@@ -70,6 +70,9 @@ class VPIntrinsicTest : public testing::Test {
 
     Str << " declare <8 x i32> @llvm.vp.select.v8i32(<8 x i1>, <8 x i32>, <8 x "
            "i32>, i32)";
+    Str << " declare <8 x i32> @llvm.experimental.vp.splice.v8i32(<8 x "
+           "i32>, <8 x i32>, i32, <8 x i1>, i32, i32) ";
+
     return parseAssemblyString(Str.str(), Err, C);
   }
 };