r345344 - [AArch64] Implement FP16FML intrinsics
Bryan Chan via cfe-commits
cfe-commits at lists.llvm.org
Thu Oct 25 16:47:00 PDT 2018
Author: bryanpkc
Date: Thu Oct 25 16:47:00 2018
New Revision: 345344
URL: http://llvm.org/viewvc/llvm-project?rev=345344&view=rev
Log:
[AArch64] Implement FP16FML intrinsics
Generate the FP16FML intrinsics into arm_neon.h (AArch64 only for now).
Add two new type modifiers to NeonEmitter to handle the new prototypes.
Define __ARM_FEATURE_FP16FML when +fp16fml is enabled and guard the
intrinsics with the macro in arm_neon.h.
Based on a patch by Gao Yiling.
Differential Revision: https://reviews.llvm.org/D53633
Added:
cfe/trunk/test/CodeGen/aarch64-neon-fp16fml.c
Modified:
cfe/trunk/include/clang/Basic/arm_neon.td
cfe/trunk/include/clang/Basic/arm_neon_incl.td
cfe/trunk/lib/Basic/Targets/AArch64.cpp
cfe/trunk/lib/Basic/Targets/AArch64.h
cfe/trunk/lib/CodeGen/CGBuiltin.cpp
cfe/trunk/test/Preprocessor/aarch64-target-features.c
cfe/trunk/utils/TableGen/NeonEmitter.cpp
Modified: cfe/trunk/include/clang/Basic/arm_neon.td
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/arm_neon.td?rev=345344&r1=345343&r2=345344&view=diff
==============================================================================
--- cfe/trunk/include/clang/Basic/arm_neon.td (original)
+++ cfe/trunk/include/clang/Basic/arm_neon.td Thu Oct 25 16:47:00 2018
@@ -206,6 +206,15 @@ def OP_DOT_LNQ
: Op<(call "vdot", $p0, $p1,
(bitcast $p1, (splat(bitcast "uint32x4_t", $p2), $p3)))>;
+def OP_FMLAL_LN : Op<(call "vfmlal_low", $p0, $p1,
+ (dup_typed $p1, (call "vget_lane", $p2, $p3)))>;
+def OP_FMLSL_LN : Op<(call "vfmlsl_low", $p0, $p1,
+ (dup_typed $p1, (call "vget_lane", $p2, $p3)))>;
+def OP_FMLAL_LN_Hi : Op<(call "vfmlal_high", $p0, $p1,
+ (dup_typed $p1, (call "vget_lane", $p2, $p3)))>;
+def OP_FMLSL_LN_Hi : Op<(call "vfmlsl_high", $p0, $p1,
+ (dup_typed $p1, (call "vget_lane", $p2, $p3)))>;
+
//===----------------------------------------------------------------------===//
// Instructions
//===----------------------------------------------------------------------===//
@@ -1640,3 +1649,21 @@ let ArchGuard = "defined(__ARM_FEATURE_D
// Variants indexing into a 128-bit vector are A64 only.
def UDOT_LANEQ : SOpInst<"vdot_laneq", "dd89i", "iUiQiQUi", OP_DOT_LNQ>;
}
+
+// v8.2-A FP16 fused multiply-add long instructions.
+let ArchGuard = "defined(__ARM_FEATURE_FP16FML) && defined(__aarch64__)" in {
+ def VFMLAL_LOW : SInst<"vfmlal_low", "ffHH", "UiQUi">;
+ def VFMLSL_LOW : SInst<"vfmlsl_low", "ffHH", "UiQUi">;
+ def VFMLAL_HIGH : SInst<"vfmlal_high", "ffHH", "UiQUi">;
+ def VFMLSL_HIGH : SInst<"vfmlsl_high", "ffHH", "UiQUi">;
+
+ def VFMLAL_LANE_LOW : SOpInst<"vfmlal_lane_low", "ffH0i", "UiQUi", OP_FMLAL_LN>;
+ def VFMLSL_LANE_LOW : SOpInst<"vfmlsl_lane_low", "ffH0i", "UiQUi", OP_FMLSL_LN>;
+ def VFMLAL_LANE_HIGH : SOpInst<"vfmlal_lane_high", "ffH0i", "UiQUi", OP_FMLAL_LN_Hi>;
+ def VFMLSL_LANE_HIGH : SOpInst<"vfmlsl_lane_high", "ffH0i", "UiQUi", OP_FMLSL_LN_Hi>;
+
+ def VFMLAL_LANEQ_LOW : SOpInst<"vfmlal_laneq_low", "ffH1i", "UiQUi", OP_FMLAL_LN>;
+ def VFMLSL_LANEQ_LOW : SOpInst<"vfmlsl_laneq_low", "ffH1i", "UiQUi", OP_FMLSL_LN>;
+ def VFMLAL_LANEQ_HIGH : SOpInst<"vfmlal_laneq_high", "ffH1i", "UiQUi", OP_FMLAL_LN_Hi>;
+ def VFMLSL_LANEQ_HIGH : SOpInst<"vfmlsl_laneq_high", "ffH1i", "UiQUi", OP_FMLSL_LN_Hi>;
+}
Modified: cfe/trunk/include/clang/Basic/arm_neon_incl.td
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/arm_neon_incl.td?rev=345344&r1=345343&r2=345344&view=diff
==============================================================================
--- cfe/trunk/include/clang/Basic/arm_neon_incl.td (original)
+++ cfe/trunk/include/clang/Basic/arm_neon_incl.td Thu Oct 25 16:47:00 2018
@@ -96,6 +96,11 @@ def bitcast;
// example: (dup $p1) -> "(uint32x2_t) {__p1, __p1}" (assuming the base type
// is uint32x2_t).
def dup;
+// dup_typed - Take a vector and a scalar argument, and create a new vector of
+// the same type by duplicating the scalar value into all lanes.
+// example: (dup_typed $p1, $p2) -> "(float16x4_t) {__p2, __p2, __p2, __p2}"
+// (assuming __p1 is float16x4_t, and __p2 is a compatible scalar).
+def dup_typed;
// splat - Take a vector and a lane index, and return a vector of the same type
// containing repeated instances of the source vector at the lane index.
// example: (splat $p0, $p1) ->
@@ -229,6 +234,8 @@ def OP_UNAVAILABLE : Operation {
// f: float (int args)
// F: double (int args)
// H: half (int args)
+// 0: half (int args), ignore 'Q' size modifier.
+// 1: half (int args), force 'Q' size modifier.
// d: default
// g: default, ignore 'Q' size modifier.
// j: default, force 'Q' size modifier.
Modified: cfe/trunk/lib/Basic/Targets/AArch64.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Basic/Targets/AArch64.cpp?rev=345344&r1=345343&r2=345344&view=diff
==============================================================================
--- cfe/trunk/lib/Basic/Targets/AArch64.cpp (original)
+++ cfe/trunk/lib/Basic/Targets/AArch64.cpp Thu Oct 25 16:47:00 2018
@@ -194,6 +194,9 @@ void AArch64TargetInfo::getTargetDefines
if (HasDotProd)
Builder.defineMacro("__ARM_FEATURE_DOTPROD", "1");
+ if ((FPU & NeonMode) && HasFP16FML)
+ Builder.defineMacro("__ARM_FEATURE_FP16FML", "1");
+
switch (ArchKind) {
default:
break;
@@ -231,6 +234,7 @@ bool AArch64TargetInfo::handleTargetFeat
Unaligned = 1;
HasFullFP16 = 0;
HasDotProd = 0;
+ HasFP16FML = 0;
ArchKind = llvm::AArch64::ArchKind::ARMV8A;
for (const auto &Feature : Features) {
@@ -252,6 +256,8 @@ bool AArch64TargetInfo::handleTargetFeat
HasFullFP16 = 1;
if (Feature == "+dotprod")
HasDotProd = 1;
+ if (Feature == "+fp16fml")
+ HasFP16FML = 1;
}
setDataLayout();
Modified: cfe/trunk/lib/Basic/Targets/AArch64.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Basic/Targets/AArch64.h?rev=345344&r1=345343&r2=345344&view=diff
==============================================================================
--- cfe/trunk/lib/Basic/Targets/AArch64.h (original)
+++ cfe/trunk/lib/Basic/Targets/AArch64.h Thu Oct 25 16:47:00 2018
@@ -34,6 +34,7 @@ class LLVM_LIBRARY_VISIBILITY AArch64Tar
unsigned Unaligned;
unsigned HasFullFP16;
unsigned HasDotProd;
+ unsigned HasFP16FML;
llvm::AArch64::ArchKind ArchKind;
static const Builtin::Info BuiltinInfo[];
Modified: cfe/trunk/lib/CodeGen/CGBuiltin.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CGBuiltin.cpp?rev=345344&r1=345343&r2=345344&view=diff
==============================================================================
--- cfe/trunk/lib/CodeGen/CGBuiltin.cpp (original)
+++ cfe/trunk/lib/CodeGen/CGBuiltin.cpp Thu Oct 25 16:47:00 2018
@@ -4368,6 +4368,14 @@ static const NeonIntrinsicInfo AArch64SI
NEONMAP0(vextq_v),
NEONMAP0(vfma_v),
NEONMAP0(vfmaq_v),
+ NEONMAP1(vfmlal_high_v, aarch64_neon_fmlal2, 0),
+ NEONMAP1(vfmlal_low_v, aarch64_neon_fmlal, 0),
+ NEONMAP1(vfmlalq_high_v, aarch64_neon_fmlal2, 0),
+ NEONMAP1(vfmlalq_low_v, aarch64_neon_fmlal, 0),
+ NEONMAP1(vfmlsl_high_v, aarch64_neon_fmlsl2, 0),
+ NEONMAP1(vfmlsl_low_v, aarch64_neon_fmlsl, 0),
+ NEONMAP1(vfmlslq_high_v, aarch64_neon_fmlsl2, 0),
+ NEONMAP1(vfmlslq_low_v, aarch64_neon_fmlsl, 0),
NEONMAP2(vhadd_v, aarch64_neon_uhadd, aarch64_neon_shadd, Add1ArgType | UnsignedAlts),
NEONMAP2(vhaddq_v, aarch64_neon_uhadd, aarch64_neon_shadd, Add1ArgType | UnsignedAlts),
NEONMAP2(vhsub_v, aarch64_neon_uhsub, aarch64_neon_shsub, Add1ArgType | UnsignedAlts),
@@ -5341,6 +5349,34 @@ Value *CodeGenFunction::EmitCommonNeonBu
Int = Usgn ? LLVMIntrinsic : AltLLVMIntrinsic;
return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vdot");
}
+ case NEON::BI__builtin_neon_vfmlal_low_v:
+ case NEON::BI__builtin_neon_vfmlalq_low_v: {
+ llvm::Type *InputTy =
+ llvm::VectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
+ llvm::Type *Tys[2] = { Ty, InputTy };
+ return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlal_low");
+ }
+ case NEON::BI__builtin_neon_vfmlsl_low_v:
+ case NEON::BI__builtin_neon_vfmlslq_low_v: {
+ llvm::Type *InputTy =
+ llvm::VectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
+ llvm::Type *Tys[2] = { Ty, InputTy };
+ return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlsl_low");
+ }
+ case NEON::BI__builtin_neon_vfmlal_high_v:
+ case NEON::BI__builtin_neon_vfmlalq_high_v: {
+ llvm::Type *InputTy =
+ llvm::VectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
+ llvm::Type *Tys[2] = { Ty, InputTy };
+ return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlal_high");
+ }
+ case NEON::BI__builtin_neon_vfmlsl_high_v:
+ case NEON::BI__builtin_neon_vfmlslq_high_v: {
+ llvm::Type *InputTy =
+ llvm::VectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
+ llvm::Type *Tys[2] = { Ty, InputTy };
+ return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlsl_high");
+ }
}
assert(Int && "Expected valid intrinsic number");
Added: cfe/trunk/test/CodeGen/aarch64-neon-fp16fml.c
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGen/aarch64-neon-fp16fml.c?rev=345344&view=auto
==============================================================================
--- cfe/trunk/test/CodeGen/aarch64-neon-fp16fml.c (added)
+++ cfe/trunk/test/CodeGen/aarch64-neon-fp16fml.c Thu Oct 25 16:47:00 2018
@@ -0,0 +1,196 @@
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +v8.2a -target-feature +neon -target-feature +fp16fml \
+// RUN: -fallow-half-arguments-and-returns -disable-O0-optnone -emit-llvm -o - %s | opt -S -instcombine | FileCheck %s
+
+// REQUIRES: aarch64-registered-target
+
+// Test AArch64 Armv8.2-A FP16 Fused Multiply-Add Long intrinsics
+
+#include <arm_neon.h>
+
+// Vector form
+
+float32x2_t test_vfmlal_low_u32(float32x2_t a, float16x4_t b, float16x4_t c) {
+// CHECK-LABEL: define <2 x float> @test_vfmlal_low_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c)
+// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> %c)
+// CHECK: ret <2 x float> [[RESULT]]
+ return vfmlal_low_u32(a, b, c);
+}
+
+float32x2_t test_vfmlsl_low_u32(float32x2_t a, float16x4_t b, float16x4_t c) {
+// CHECK-LABEL: define <2 x float> @test_vfmlsl_low_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c)
+// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> %c)
+// CHECK: ret <2 x float> [[RESULT]]
+ return vfmlsl_low_u32(a, b, c);
+}
+
+float32x2_t test_vfmlal_high_u32(float32x2_t a, float16x4_t b, float16x4_t c) {
+// CHECK-LABEL: define <2 x float> @test_vfmlal_high_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c)
+// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> %c)
+// CHECK: ret <2 x float> [[RESULT]]
+ return vfmlal_high_u32(a, b, c);
+}
+
+float32x2_t test_vfmlsl_high_u32(float32x2_t a, float16x4_t b, float16x4_t c) {
+// CHECK-LABEL: define <2 x float> @test_vfmlsl_high_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c)
+// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> %c)
+// CHECK: ret <2 x float> [[RESULT]]
+ return vfmlsl_high_u32(a, b, c);
+}
+
+float32x4_t test_vfmlalq_low_u32(float32x4_t a, float16x8_t b, float16x8_t c) {
+// CHECK-LABEL: define <4 x float> @test_vfmlalq_low_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c)
+// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> %c)
+// CHECK: ret <4 x float> [[RESULT]]
+ return vfmlalq_low_u32(a, b, c);
+}
+
+float32x4_t test_vfmlslq_low_u32(float32x4_t a, float16x8_t b, float16x8_t c) {
+// CHECK-LABEL: define <4 x float> @test_vfmlslq_low_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c)
+// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> %c)
+// CHECK: ret <4 x float> [[RESULT]]
+ return vfmlslq_low_u32(a, b, c);
+}
+
+float32x4_t test_vfmlalq_high_u32(float32x4_t a, float16x8_t b, float16x8_t c) {
+// CHECK-LABEL: define <4 x float> @test_vfmlalq_high_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c)
+// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> %c)
+// CHECK: ret <4 x float> [[RESULT]]
+ return vfmlalq_high_u32(a, b, c);
+}
+
+float32x4_t test_vfmlslq_high_u32(float32x4_t a, float16x8_t b, float16x8_t c) {
+// CHECK-LABEL: define <4 x float> @test_vfmlslq_high_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c)
+// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> %c)
+// CHECK: ret <4 x float> [[RESULT]]
+ return vfmlslq_high_u32(a, b, c);
+}
+
+// Indexed form
+
+float32x2_t test_vfmlal_lane_low_u32(float32x2_t a, float16x4_t b, float16x4_t c) {
+// CHECK-LABEL: define <2 x float> @test_vfmlal_lane_low_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <4 x i32> zeroinitializer
+// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]])
+// CHECK: ret <2 x float> [[RESULT]]
+ return vfmlal_lane_low_u32(a, b, c, 0);
+}
+
+float32x2_t test_vfmlal_lane_high_u32(float32x2_t a, float16x4_t b, float16x4_t c) {
+// CHECK-LABEL: define <2 x float> @test_vfmlal_lane_high_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]])
+// CHECK: ret <2 x float> [[RESULT]]
+ return vfmlal_lane_high_u32(a, b, c, 1);
+}
+
+float32x4_t test_vfmlalq_lane_low_u32(float32x4_t a, float16x8_t b, float16x4_t c) {
+// CHECK-LABEL: define <4 x float> @test_vfmlalq_lane_low_u32(<4 x float> %a, <8 x half> %b, <4 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]])
+// CHECK: ret <4 x float> [[RESULT]]
+ return vfmlalq_lane_low_u32(a, b, c, 2);
+}
+
+float32x4_t test_vfmlalq_lane_high_u32(float32x4_t a, float16x8_t b, float16x4_t c) {
+// CHECK-LABEL: define <4 x float> @test_vfmlalq_lane_high_u32(<4 x float> %a, <8 x half> %b, <4 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]])
+// CHECK: ret <4 x float> [[RESULT]]
+ return vfmlalq_lane_high_u32(a, b, c, 3);
+}
+
+float32x2_t test_vfmlal_laneq_low_u32(float32x2_t a, float16x4_t b, float16x8_t c) {
+// CHECK-LABEL: define <2 x float> @test_vfmlal_laneq_low_u32(<2 x float> %a, <4 x half> %b, <8 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
+// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]])
+// CHECK: ret <2 x float> [[RESULT]]
+ return vfmlal_laneq_low_u32(a, b, c, 4);
+}
+
+float32x2_t test_vfmlal_laneq_high_u32(float32x2_t a, float16x4_t b, float16x8_t c) {
+// CHECK-LABEL: define <2 x float> @test_vfmlal_laneq_high_u32(<2 x float> %a, <4 x half> %b, <8 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <4 x i32> <i32 5, i32 5, i32 5, i32 5>
+// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]])
+// CHECK: ret <2 x float> [[RESULT]]
+ return vfmlal_laneq_high_u32(a, b, c, 5);
+}
+
+float32x4_t test_vfmlalq_laneq_low_u32(float32x4_t a, float16x8_t b, float16x8_t c) {
+// CHECK-LABEL: define <4 x float> @test_vfmlalq_laneq_low_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
+// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]])
+// CHECK: ret <4 x float> [[RESULT]]
+ return vfmlalq_laneq_low_u32(a, b, c, 6);
+}
+
+float32x4_t test_vfmlalq_laneq_high_u32(float32x4_t a, float16x8_t b, float16x8_t c) {
+// CHECK-LABEL: define <4 x float> @test_vfmlalq_laneq_high_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]])
+// CHECK: ret <4 x float> [[RESULT]]
+ return vfmlalq_laneq_high_u32(a, b, c, 7);
+}
+
+float32x2_t test_vfmlsl_lane_low_u32(float32x2_t a, float16x4_t b, float16x4_t c) {
+// CHECK-LABEL: define <2 x float> @test_vfmlsl_lane_low_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <4 x i32> zeroinitializer
+// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]])
+// CHECK: ret <2 x float> [[RESULT]]
+ return vfmlsl_lane_low_u32(a, b, c, 0);
+}
+
+float32x2_t test_vfmlsl_lane_high_u32(float32x2_t a, float16x4_t b, float16x4_t c) {
+// CHECK-LABEL: define <2 x float> @test_vfmlsl_lane_high_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]])
+// CHECK: ret <2 x float> [[RESULT]]
+ return vfmlsl_lane_high_u32(a, b, c, 1);
+}
+
+float32x4_t test_vfmlslq_lane_low_u32(float32x4_t a, float16x8_t b, float16x4_t c) {
+// CHECK-LABEL: define <4 x float> @test_vfmlslq_lane_low_u32(<4 x float> %a, <8 x half> %b, <4 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]])
+// CHECK: ret <4 x float> [[RESULT]]
+ return vfmlslq_lane_low_u32(a, b, c, 2);
+}
+
+float32x4_t test_vfmlslq_lane_high_u32(float32x4_t a, float16x8_t b, float16x4_t c) {
+// CHECK-LABEL: define <4 x float> @test_vfmlslq_lane_high_u32(<4 x float> %a, <8 x half> %b, <4 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]])
+// CHECK: ret <4 x float> [[RESULT]]
+ return vfmlslq_lane_high_u32(a, b, c, 3);
+}
+
+float32x2_t test_vfmlsl_laneq_low_u32(float32x2_t a, float16x4_t b, float16x8_t c) {
+// CHECK-LABEL: define <2 x float> @test_vfmlsl_laneq_low_u32(<2 x float> %a, <4 x half> %b, <8 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
+// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]])
+// CHECK: ret <2 x float> [[RESULT]]
+ return vfmlsl_laneq_low_u32(a, b, c, 4);
+}
+
+float32x2_t test_vfmlsl_laneq_high_u32(float32x2_t a, float16x4_t b, float16x8_t c) {
+// CHECK-LABEL: define <2 x float> @test_vfmlsl_laneq_high_u32(<2 x float> %a, <4 x half> %b, <8 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <4 x i32> <i32 5, i32 5, i32 5, i32 5>
+// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]])
+// CHECK: ret <2 x float> [[RESULT]]
+ return vfmlsl_laneq_high_u32(a, b, c, 5);
+}
+
+float32x4_t test_vfmlslq_laneq_low_u32(float32x4_t a, float16x8_t b, float16x8_t c) {
+// CHECK-LABEL: define <4 x float> @test_vfmlslq_laneq_low_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
+// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]])
+// CHECK: ret <4 x float> [[RESULT]]
+ return vfmlslq_laneq_low_u32(a, b, c, 6);
+}
+
+float32x4_t test_vfmlslq_laneq_high_u32(float32x4_t a, float16x8_t b, float16x8_t c) {
+// CHECK-LABEL: define <4 x float> @test_vfmlslq_laneq_high_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]])
+// CHECK: ret <4 x float> [[RESULT]]
+ return vfmlslq_laneq_high_u32(a, b, c, 7);
+}
Modified: cfe/trunk/test/Preprocessor/aarch64-target-features.c
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/Preprocessor/aarch64-target-features.c?rev=345344&r1=345343&r2=345344&view=diff
==============================================================================
--- cfe/trunk/test/Preprocessor/aarch64-target-features.c (original)
+++ cfe/trunk/test/Preprocessor/aarch64-target-features.c Thu Oct 25 16:47:00 2018
@@ -93,16 +93,20 @@
// RUN: %clang -target aarch64-none-linux-gnu -march=armv8.2a+dotprod -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-DOTPROD %s
// CHECK-DOTPROD: __ARM_FEATURE_DOTPROD 1
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.2-a+nofp16fml+fp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.2-a+nofp16+fp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.2-a+fp16+nofp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8-a+fp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8-a+fp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+nofp16fml+fp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+nofp16+fp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16+nofp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// On ARMv8.2-A and above, +fp16fml implies +fp16.
+// On ARMv8.4-A and above, +fp16 implies +fp16fml.
+// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.2-a+nofp16fml+fp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.2-a+nofp16+fp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-FML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.2-a+fp16+nofp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8-a+fp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-FML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8-a+fp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+nofp16fml+fp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-FML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+nofp16+fp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-FML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16+nofp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-FML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-FML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// CHECK-FULLFP16-FML: #define __ARM_FEATURE_FP16FML 1
+// CHECK-FULLFP16-NOFML-NOT: #define __ARM_FEATURE_FP16FML 1
// CHECK-FULLFP16-VECTOR-SCALAR: #define __ARM_FEATURE_FP16_SCALAR_ARITHMETIC 1
// CHECK-FULLFP16-VECTOR-SCALAR: #define __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1
// CHECK-FULLFP16-VECTOR-SCALAR: #define __ARM_FP 0xE
@@ -114,6 +118,7 @@
// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8-a+fp16+nosimd -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-SCALAR %s
// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16fml+nosimd -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-SCALAR %s
// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16+nosimd -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-SCALAR %s
+// CHECK-FULLFP16-SCALAR-NOT: #define __ARM_FEATURE_FP16FML 1
// CHECK-FULLFP16-SCALAR: #define __ARM_FEATURE_FP16_SCALAR_ARITHMETIC 1
// CHECK-FULLFP16-SCALAR-NOT: #define __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1
// CHECK-FULLFP16-SCALAR: #define __ARM_FP 0xE
@@ -127,10 +132,11 @@
// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+nofp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML-VECTOR-SCALAR %s
// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+nofp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML-VECTOR-SCALAR %s
// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16fml+nofp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML-VECTOR-SCALAR %s
+// CHECK-FULLFP16-NOFML-VECTOR-SCALAR-NOT: #define __ARM_FEATURE_FP16FML 1
// CHECK-FULLFP16-NOFML-VECTOR-SCALAR-NOT: #define __ARM_FEATURE_FP16_SCALAR_ARITHMETIC 1
// CHECK-FULLFP16-NOFML-VECTOR-SCALAR-NOT: #define __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1
-// CHECK-FULLFP16-NOFML-VECTOR-SCALAR: #define __ARM_FP 0xE
-// CHECK-FULLFP16-NOFML-VECTOR-SCALAR: #define __ARM_FP16_FORMAT_IEEE 1
+// CHECK-FULLFP16-NOFML-VECTOR-SCALAR: #define __ARM_FP 0xE
+// CHECK-FULLFP16-NOFML-VECTOR-SCALAR: #define __ARM_FP16_FORMAT_IEEE 1
// ================== Check whether -mtune accepts mixed-case features.
// RUN: %clang -target aarch64 -mtune=CYCLONE -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MTUNE-CYCLONE %s
Modified: cfe/trunk/utils/TableGen/NeonEmitter.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/utils/TableGen/NeonEmitter.cpp?rev=345344&r1=345343&r2=345344&view=diff
==============================================================================
--- cfe/trunk/utils/TableGen/NeonEmitter.cpp (original)
+++ cfe/trunk/utils/TableGen/NeonEmitter.cpp Thu Oct 25 16:47:00 2018
@@ -494,6 +494,7 @@ private:
std::pair<Type, std::string> emitDagSaveTemp(DagInit *DI);
std::pair<Type, std::string> emitDagSplat(DagInit *DI);
std::pair<Type, std::string> emitDagDup(DagInit *DI);
+ std::pair<Type, std::string> emitDagDupTyped(DagInit *DI);
std::pair<Type, std::string> emitDagShuffle(DagInit *DI);
std::pair<Type, std::string> emitDagCast(DagInit *DI, bool IsBitCast);
std::pair<Type, std::string> emitDagCall(DagInit *DI);
@@ -897,6 +898,18 @@ void Type::applyModifier(char Mod) {
Float = true;
ElementBitwidth = 16;
break;
+ case '0':
+ Float = true;
+ if (AppliedQuad)
+ Bitwidth /= 2;
+ ElementBitwidth = 16;
+ break;
+ case '1':
+ Float = true;
+ if (!AppliedQuad)
+ Bitwidth *= 2;
+ ElementBitwidth = 16;
+ break;
case 'g':
if (AppliedQuad)
Bitwidth /= 2;
@@ -1507,6 +1520,8 @@ std::pair<Type, std::string> Intrinsic::
return emitDagShuffle(DI);
if (Op == "dup")
return emitDagDup(DI);
+ if (Op == "dup_typed")
+ return emitDagDupTyped(DI);
if (Op == "splat")
return emitDagSplat(DI);
if (Op == "save_temp")
@@ -1768,6 +1783,28 @@ std::pair<Type, std::string> Intrinsic::
}
S += "}";
+ return std::make_pair(T, S);
+}
+
+std::pair<Type, std::string> Intrinsic::DagEmitter::emitDagDupTyped(DagInit *DI) {
+ assert_with_loc(DI->getNumArgs() == 2, "dup_typed() expects two arguments");
+ std::pair<Type, std::string> A = emitDagArg(DI->getArg(0),
+ DI->getArgNameStr(0));
+ std::pair<Type, std::string> B = emitDagArg(DI->getArg(1),
+ DI->getArgNameStr(1));
+ assert_with_loc(B.first.isScalar(),
+ "dup_typed() requires a scalar as the second argument");
+
+ Type T = A.first;
+ assert_with_loc(T.isVector(), "dup_typed() used but target type is scalar!");
+ std::string S = "(" + T.str() + ") {";
+ for (unsigned I = 0; I < T.getNumElements(); ++I) {
+ if (I != 0)
+ S += ", ";
+ S += B.second;
+ }
+ S += "}";
+
return std::make_pair(T, S);
}
More information about the cfe-commits
mailing list