[llvm] 74d45b8 - [VP] Binary floating-point intrinsics.
Simon Moll via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 14 00:45:51 PDT 2021
Author: Simon Moll
Date: 2021-06-14T08:51:41+02:00
New Revision: 74d45b884cfb13d8530a64ba538817bdab165c12
URL: https://github.com/llvm/llvm-project/commit/74d45b884cfb13d8530a64ba538817bdab165c12
DIFF: https://github.com/llvm/llvm-project/commit/74d45b884cfb13d8530a64ba538817bdab165c12.diff
LOG: [VP] Binary floating-point intrinsics.
This patch implements vector-predicated intrinsics at the IR level for fadd,
fsub, fmul, fdiv and frem. These operate in the default floating-point
environment. We will use constrained fp operand bundles for constrained
vector-predicated fp math (D93455).
Reviewed By: craig.topper
Differential Revision: https://reviews.llvm.org/D93470
Added:
Modified:
llvm/docs/LangRef.rst
llvm/include/llvm/IR/Intrinsics.td
llvm/include/llvm/IR/VPIntrinsics.def
llvm/test/Verifier/vp-intrinsics.ll
llvm/unittests/IR/VPIntrinsicTest.cpp
Removed:
################################################################################
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index a05087a4f67c..f6a01040aee6 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -18227,6 +18227,252 @@ Examples:
%also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef
+.. _int_vp_fadd:
+
+'``llvm.vp.fadd.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+ declare <16 x float> @llvm.vp.fadd.v16f32 (<16 x float> <left_op>, <16 x float> <right_op>, <16 x i1> <mask>, i32 <vector_length>)
+ declare <vscale x 4 x float> @llvm.vp.fadd.nxv4f32 (<vscale x 4 x float> <left_op>, <vscale x 4 x float> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>)
+ declare <256 x double> @llvm.vp.fadd.v256f64 (<256 x double> <left_op>, <256 x double> <right_op>, <256 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated floating-point addition of two vectors of floating-point values.
+
+
+Arguments:
+""""""""""
+
+The first two operands and the result have the same vector of floating-point type. The
+third operand is the vector mask and has the same number of elements as the
+result vector type. The fourth operand is the explicit vector length of the
+operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.fadd``' intrinsic performs floating-point addition (:ref:`fadd <i_fadd>`)
+of the first and second vector operand on each enabled lane. The result on
+disabled lanes is undefined. The operation is performed in the default
+floating-point environment.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+ %r = call <4 x float> @llvm.vp.fadd.v4f32(<4 x float> %a, <4 x float> %b, <4 x i1> %mask, i32 %evl)
+ ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r
+
+ %t = fadd <4 x float> %a, %b
+ %also.r = select <4 x i1> %mask, <4 x float> %t, <4 x float> undef
+
+
+.. _int_vp_fsub:
+
+'``llvm.vp.fsub.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+ declare <16 x float> @llvm.vp.fsub.v16f32 (<16 x float> <left_op>, <16 x float> <right_op>, <16 x i1> <mask>, i32 <vector_length>)
+ declare <vscale x 4 x float> @llvm.vp.fsub.nxv4f32 (<vscale x 4 x float> <left_op>, <vscale x 4 x float> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>)
+ declare <256 x double> @llvm.vp.fsub.v256f64 (<256 x double> <left_op>, <256 x double> <right_op>, <256 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated floating-point subtraction of two vectors of floating-point values.
+
+
+Arguments:
+""""""""""
+
+The first two operands and the result have the same vector of floating-point type. The
+third operand is the vector mask and has the same number of elements as the
+result vector type. The fourth operand is the explicit vector length of the
+operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.fsub``' intrinsic performs floating-point subtraction (:ref:`fsub <i_fsub>`)
+of the first and second vector operand on each enabled lane. The result on
+disabled lanes is undefined. The operation is performed in the default
+floating-point environment.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+ %r = call <4 x float> @llvm.vp.fsub.v4f32(<4 x float> %a, <4 x float> %b, <4 x i1> %mask, i32 %evl)
+ ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r
+
+ %t = fsub <4 x float> %a, %b
+ %also.r = select <4 x i1> %mask, <4 x float> %t, <4 x float> undef
+
+
+.. _int_vp_fmul:
+
+'``llvm.vp.fmul.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+ declare <16 x float> @llvm.vp.fmul.v16f32 (<16 x float> <left_op>, <16 x float> <right_op>, <16 x i1> <mask>, i32 <vector_length>)
+ declare <vscale x 4 x float> @llvm.vp.fmul.nxv4f32 (<vscale x 4 x float> <left_op>, <vscale x 4 x float> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>)
+ declare <256 x double> @llvm.vp.fmul.v256f64 (<256 x double> <left_op>, <256 x double> <right_op>, <256 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated floating-point multiplication of two vectors of floating-point values.
+
+
+Arguments:
+""""""""""
+
+The first two operands and the result have the same vector of floating-point type. The
+third operand is the vector mask and has the same number of elements as the
+result vector type. The fourth operand is the explicit vector length of the
+operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.fmul``' intrinsic performs floating-point multiplication (:ref:`fmul <i_fmul>`)
+of the first and second vector operand on each enabled lane. The result on
+disabled lanes is undefined. The operation is performed in the default
+floating-point environment.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+ %r = call <4 x float> @llvm.vp.fmul.v4f32(<4 x float> %a, <4 x float> %b, <4 x i1> %mask, i32 %evl)
+ ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r
+
+ %t = fmul <4 x float> %a, %b
+ %also.r = select <4 x i1> %mask, <4 x float> %t, <4 x float> undef
+
+
+.. _int_vp_fdiv:
+
+'``llvm.vp.fdiv.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+ declare <16 x float> @llvm.vp.fdiv.v16f32 (<16 x float> <left_op>, <16 x float> <right_op>, <16 x i1> <mask>, i32 <vector_length>)
+ declare <vscale x 4 x float> @llvm.vp.fdiv.nxv4f32 (<vscale x 4 x float> <left_op>, <vscale x 4 x float> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>)
+ declare <256 x double> @llvm.vp.fdiv.v256f64 (<256 x double> <left_op>, <256 x double> <right_op>, <256 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated floating-point division of two vectors of floating-point values.
+
+
+Arguments:
+""""""""""
+
+The first two operands and the result have the same vector of floating-point type. The
+third operand is the vector mask and has the same number of elements as the
+result vector type. The fourth operand is the explicit vector length of the
+operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.fdiv``' intrinsic performs floating-point division (:ref:`fdiv <i_fdiv>`)
+of the first and second vector operand on each enabled lane. The result on
+disabled lanes is undefined. The operation is performed in the default
+floating-point environment.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+ %r = call <4 x float> @llvm.vp.fdiv.v4f32(<4 x float> %a, <4 x float> %b, <4 x i1> %mask, i32 %evl)
+ ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r
+
+ %t = fdiv <4 x float> %a, %b
+ %also.r = select <4 x i1> %mask, <4 x float> %t, <4 x float> undef
+
+
+.. _int_vp_frem:
+
+'``llvm.vp.frem.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+ declare <16 x float> @llvm.vp.frem.v16f32 (<16 x float> <left_op>, <16 x float> <right_op>, <16 x i1> <mask>, i32 <vector_length>)
+ declare <vscale x 4 x float> @llvm.vp.frem.nxv4f32 (<vscale x 4 x float> <left_op>, <vscale x 4 x float> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>)
+ declare <256 x double> @llvm.vp.frem.v256f64 (<256 x double> <left_op>, <256 x double> <right_op>, <256 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated floating-point remainder of two vectors of floating-point values.
+
+
+Arguments:
+""""""""""
+
+The first two operands and the result have the same vector of floating-point type. The
+third operand is the vector mask and has the same number of elements as the
+result vector type. The fourth operand is the explicit vector length of the
+operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.frem``' intrinsic performs floating-point remainder (:ref:`frem <i_frem>`)
+of the first and second vector operand on each enabled lane. The result on
+disabled lanes is undefined. The operation is performed in the default
+floating-point environment.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+ %r = call <4 x float> @llvm.vp.frem.v4f32(<4 x float> %a, <4 x float> %b, <4 x i1> %mask, i32 %evl)
+ ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r
+
+ %t = frem <4 x float> %a, %b
+ %also.r = select <4 x i1> %mask, <4 x float> %t, <4 x float> undef
+
+
+
.. _int_get_active_lane_mask:
'``llvm.get.active.lane.mask.*``' Intrinsics
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index f7c0bb0b88a9..ea4f965780f8 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1439,6 +1439,36 @@ let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn] in {
llvm_i32_ty]>;
}
+// Floating-point arithmetic.
+let IntrProperties =
+ [IntrSpeculatable, IntrNoMem, IntrNoSync, IntrWillReturn] in {
+ def int_vp_fadd : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ],
+ [ LLVMMatchType<0>,
+ LLVMMatchType<0>,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+ llvm_i32_ty]>;
+ def int_vp_fsub : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ],
+ [ LLVMMatchType<0>,
+ LLVMMatchType<0>,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+ llvm_i32_ty]>;
+ def int_vp_fmul : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ],
+ [ LLVMMatchType<0>,
+ LLVMMatchType<0>,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+ llvm_i32_ty]>;
+ def int_vp_fdiv : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ],
+ [ LLVMMatchType<0>,
+ LLVMMatchType<0>,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+ llvm_i32_ty]>;
+ def int_vp_frem : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ],
+ [ LLVMMatchType<0>,
+ LLVMMatchType<0>,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+ llvm_i32_ty]>;
+}
+
def int_get_active_lane_mask:
DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[llvm_anyint_ty, LLVMMatchType<1>],
diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def
index 37a079512457..f6a283c800eb 100644
--- a/llvm/include/llvm/IR/VPIntrinsics.def
+++ b/llvm/include/llvm/IR/VPIntrinsics.def
@@ -89,6 +89,17 @@ END_REGISTER_VP_SDNODE(SDOPC)
#define HANDLE_VP_TO_OPC(OPC)
#endif
+// Whether the intrinsic may have a rounding mode or exception behavior operand
+// bundle.
+// \p HASROUND '1' if the intrinsic can have a rounding mode operand bundle,
+// '0' otherwise.
+// \p HASEXCEPT '1' if the intrinsic can have an exception behavior operand
+// bundle, '0' otherwise.
+// \p INTRINID The constrained fp intrinsic this VP intrinsic corresponds to.
+#ifndef HANDLE_VP_TO_CONSTRAINEDFP
+#define HANDLE_VP_TO_CONSTRAINEDFP(HASROUND, HASEXCEPT, INTRINID)
+#endif
+
/// } Property Macros
///// Integer Arithmetic {
@@ -147,6 +158,38 @@ HELPER_REGISTER_BINARY_INT_VP(vp_xor, VP_XOR, Xor)
///// } Integer Arithmetic
+///// Floating-Point Arithmetic {
+
+// Specialized helper macro for floating-point binary operators
+// <operation>(%x, %y, %mask, %evl).
+#ifdef HELPER_REGISTER_BINARY_FP_VP
+#error \
+ "The internal helper macro HELPER_REGISTER_BINARY_FP_VP is already defined!"
+#endif
+#define HELPER_REGISTER_BINARY_FP_VP(OPSUFFIX, SDOPC, OPC) \
+ BEGIN_REGISTER_VP(vp_##OPSUFFIX, 2, 3, SDOPC, -1) \
+ HANDLE_VP_TO_OPC(OPC) \
+ HANDLE_VP_TO_CONSTRAINEDFP(1, 1, experimental_constrained_##OPSUFFIX) \
+ END_REGISTER_VP(vp_##OPSUFFIX, SDOPC)
+
+// llvm.vp.fadd(x,y,mask,vlen)
+HELPER_REGISTER_BINARY_FP_VP(fadd, VP_FADD, FAdd)
+
+// llvm.vp.fsub(x,y,mask,vlen)
+HELPER_REGISTER_BINARY_FP_VP(fsub, VP_FSUB, FSub)
+
+// llvm.vp.fmul(x,y,mask,vlen)
+HELPER_REGISTER_BINARY_FP_VP(fmul, VP_FMUL, FMul)
+
+// llvm.vp.fdiv(x,y,mask,vlen)
+HELPER_REGISTER_BINARY_FP_VP(fdiv, VP_FDIV, FDiv)
+
+// llvm.vp.frem(x,y,mask,vlen)
+HELPER_REGISTER_BINARY_FP_VP(frem, VP_FREM, FRem)
+
+#undef HELPER_REGISTER_BINARY_FP_VP
+
+///// } Floating-Point Arithmetic
#undef BEGIN_REGISTER_VP
#undef BEGIN_REGISTER_VP_INTRINSIC
@@ -155,3 +198,4 @@ HELPER_REGISTER_BINARY_INT_VP(vp_xor, VP_XOR, Xor)
#undef END_REGISTER_VP_INTRINSIC
#undef END_REGISTER_VP_SDNODE
#undef HANDLE_VP_TO_OPC
+#undef HANDLE_VP_TO_CONSTRAINEDFP
diff --git a/llvm/test/Verifier/vp-intrinsics.ll b/llvm/test/Verifier/vp-intrinsics.ll
index 0e9f4e01561d..971d9b767ef2 100644
--- a/llvm/test/Verifier/vp-intrinsics.ll
+++ b/llvm/test/Verifier/vp-intrinsics.ll
@@ -17,6 +17,18 @@ define void @test_vp_int(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) {
ret void
}
+
+define void @test_vp_fp(<8 x double> %f0, <8 x double> %f1, <8 x i1> %m, i32 %n) {
+ %r0 = call <8 x double> @llvm.vp.fadd.v8f64(<8 x double> %f0, <8 x double> %f1, <8 x i1> %m, i32 %n)
+ %r1 = call <8 x double> @llvm.vp.fsub.v8f64(<8 x double> %f0, <8 x double> %f1, <8 x i1> %m, i32 %n)
+ %r2 = call <8 x double> @llvm.vp.fmul.v8f64(<8 x double> %f0, <8 x double> %f1, <8 x i1> %m, i32 %n)
+ %r3 = call <8 x double> @llvm.vp.fdiv.v8f64(<8 x double> %f0, <8 x double> %f1, <8 x i1> %m, i32 %n)
+ %r4 = call <8 x double> @llvm.vp.frem.v8f64(<8 x double> %f0, <8 x double> %f1, <8 x i1> %m, i32 %n)
+ ret void
+}
+
+; TODO: test_vp_constrained_fp
+
; integer arith
declare <8 x i32> @llvm.vp.add.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
declare <8 x i32> @llvm.vp.sub.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
@@ -32,3 +44,9 @@ declare <8 x i32> @llvm.vp.xor.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
declare <8 x i32> @llvm.vp.ashr.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
declare <8 x i32> @llvm.vp.lshr.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
declare <8 x i32> @llvm.vp.shl.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
+; fp arith
+declare <8 x double> @llvm.vp.fadd.v8f64(<8 x double>, <8 x double>, <8 x i1>, i32)
+declare <8 x double> @llvm.vp.fsub.v8f64(<8 x double>, <8 x double>, <8 x i1>, i32)
+declare <8 x double> @llvm.vp.fmul.v8f64(<8 x double>, <8 x double>, <8 x i1>, i32)
+declare <8 x double> @llvm.vp.fdiv.v8f64(<8 x double>, <8 x double>, <8 x i1>, i32)
+declare <8 x double> @llvm.vp.frem.v8f64(<8 x double>, <8 x double>, <8 x i1>, i32)
diff --git a/llvm/unittests/IR/VPIntrinsicTest.cpp b/llvm/unittests/IR/VPIntrinsicTest.cpp
index f16e55e6f247..ad428b8ab428 100644
--- a/llvm/unittests/IR/VPIntrinsicTest.cpp
+++ b/llvm/unittests/IR/VPIntrinsicTest.cpp
@@ -41,6 +41,11 @@ class VPIntrinsicTest : public testing::Test {
Str << " declare <8 x i32> @llvm.vp." << BinaryIntOpcode
<< ".v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) ";
+ const char *BinaryFPOpcodes[] = {"fadd", "fsub", "fmul", "fdiv", "frem"};
+ for (const char *BinaryFPOpcode : BinaryFPOpcodes)
+ Str << " declare <8 x float> @llvm.vp." << BinaryFPOpcode
+ << ".v8f32(<8 x float>, <8 x float>, <8 x i1>, i32) ";
+
return parseAssemblyString(Str.str(), Err, C);
}
};
@@ -131,7 +136,8 @@ TEST_F(VPIntrinsicTest, CanIgnoreVectorLength) {
assert(F);
const int NumExpected = 12;
- const bool Expected[] = {false, true, false, false, false, true, false, false, true, false, true, false};
+ const bool Expected[] = {false, true, false, false, false, true,
+ false, false, true, false, true, false};
int i = 0;
for (auto &I : F->getEntryBlock()) {
VPIntrinsic *VPI = dyn_cast<VPIntrinsic>(&I);
@@ -157,7 +163,8 @@ TEST_F(VPIntrinsicTest, GetParamPos) {
if (MaskParamPos.hasValue()) {
Type *MaskParamType = F.getArg(MaskParamPos.getValue())->getType();
ASSERT_TRUE(MaskParamType->isVectorTy());
- ASSERT_TRUE(cast<VectorType>(MaskParamType)->getElementType()->isIntegerTy(1));
+ ASSERT_TRUE(
+ cast<VectorType>(MaskParamType)->getElementType()->isIntegerTy(1));
}
Optional<unsigned> VecLenParamPos =
@@ -254,4 +261,19 @@ TEST_F(VPIntrinsicTest, VPIntrinsicDeclarationForParams) {
}
}
+/// Check that HANDLE_VP_TO_CONSTRAINEDFP maps to an existing intrinsic with
+/// the right number of metadata args.
+TEST_F(VPIntrinsicTest, HandleToConstrainedFP) {
+#define HANDLE_VP_TO_CONSTRAINEDFP(HASROUND, HASEXCEPT, CFPID) \
+ { \
+ SmallVector<Intrinsic::IITDescriptor, 5> T; \
+ Intrinsic::getIntrinsicInfoTableEntries(Intrinsic::CFPID, T); \
+ unsigned NumMetadataArgs = 0; \
+ for (auto TD : T) \
+ NumMetadataArgs += (TD.Kind == Intrinsic::IITDescriptor::Metadata); \
+ ASSERT_EQ(NumMetadataArgs, (unsigned)(HASROUND + HASEXCEPT)); \
+ }
+#include "llvm/IR/VPIntrinsics.def"
+}
+
} // end anonymous namespace