[clang] dcf11c5 - [ARM][AArch64] Complex addition Neon intrinsics for Armv8.3-A
Victor Campos via cfe-commits
cfe-commits at lists.llvm.org
Mon Dec 2 06:38:52 PST 2019
Author: Victor Campos
Date: 2019-12-02T14:38:39Z
New Revision: dcf11c5e86cee94ec649a7a31c5dd259f60579d6
URL: https://github.com/llvm/llvm-project/commit/dcf11c5e86cee94ec649a7a31c5dd259f60579d6
DIFF: https://github.com/llvm/llvm-project/commit/dcf11c5e86cee94ec649a7a31c5dd259f60579d6.diff
LOG: [ARM][AArch64] Complex addition Neon intrinsics for Armv8.3-A
Summary:
Add support for vcadd_* family of intrinsics. This set of intrinsics is
available in Armv8.3-A.
The fp16 versions require the FP16 extension, which has been available
(opt-in) since Armv8.2-A.
Reviewers: t.p.northover
Reviewed By: t.p.northover
Subscribers: t.p.northover, kristof.beyls, hiraditya, cfe-commits, llvm-commits
Tags: #clang, #llvm
Differential Revision: https://reviews.llvm.org/D70862
Added:
clang/test/CodeGen/aarch64-neon-vcadd.c
clang/test/CodeGen/arm-neon-vcadd.c
llvm/test/CodeGen/AArch64/neon-vcadd.ll
llvm/test/CodeGen/ARM/neon-vcadd.ll
Modified:
clang/include/clang/Basic/arm_neon.td
clang/lib/Basic/Targets/AArch64.cpp
clang/lib/Basic/Targets/ARM.cpp
clang/lib/Basic/Targets/ARM.h
clang/lib/CodeGen/CGBuiltin.cpp
llvm/include/llvm/IR/IntrinsicsAArch64.td
llvm/include/llvm/IR/IntrinsicsARM.td
llvm/lib/Target/AArch64/AArch64InstrInfo.td
llvm/lib/Target/ARM/ARMInstrNEON.td
Removed:
################################################################################
diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index b5e395c8103f..a4dc21b64311 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -1673,3 +1673,21 @@ let ArchGuard = "defined(__ARM_FEATURE_FP16FML) && defined(__aarch64__)" in {
def VFMLAL_LANEQ_HIGH : SOpInst<"vfmlal_laneq_high", "(F>)(F>)F(FQ)I", "hQh", OP_FMLAL_LN_Hi>;
def VFMLSL_LANEQ_HIGH : SOpInst<"vfmlsl_laneq_high", "(F>)(F>)F(FQ)I", "hQh", OP_FMLSL_LN_Hi>;
}
+
+// v8.3-A Vector complex addition intrinsics
+let ArchGuard = "defined(__ARM_FEATURE_COMPLEX) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)" in {
+ def VCADD_ROT90_FP16 : SInst<"vcadd_rot90", "...", "h">;
+ def VCADD_ROT270_FP16 : SInst<"vcadd_rot270", "...", "h">;
+ def VCADDQ_ROT90_FP16 : SInst<"vcaddq_rot90", "QQQ", "h">;
+ def VCADDQ_ROT270_FP16 : SInst<"vcaddq_rot270", "QQQ", "h">;
+}
+let ArchGuard = "defined(__ARM_FEATURE_COMPLEX)" in {
+ def VCADD_ROT90 : SInst<"vcadd_rot90", "...", "f">;
+ def VCADD_ROT270 : SInst<"vcadd_rot270", "...", "f">;
+ def VCADDQ_ROT90 : SInst<"vcaddq_rot90", "QQQ", "f">;
+ def VCADDQ_ROT270 : SInst<"vcaddq_rot270", "QQQ", "f">;
+}
+let ArchGuard = "defined(__ARM_FEATURE_COMPLEX) && defined(__aarch64__)" in {
+ def VCADDQ_ROT90_FP64 : SInst<"vcaddq_rot90", "QQQ", "d">;
+ def VCADDQ_ROT270_FP64 : SInst<"vcaddq_rot270", "QQQ", "d">;
+}
\ No newline at end of file
diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp
index 5214f7c30ee0..cba3e3ada7ea 100644
--- a/clang/lib/Basic/Targets/AArch64.cpp
+++ b/clang/lib/Basic/Targets/AArch64.cpp
@@ -158,6 +158,7 @@ void AArch64TargetInfo::getTargetDefinesARMV82A(const LangOptions &Opts,
void AArch64TargetInfo::getTargetDefinesARMV83A(const LangOptions &Opts,
MacroBuilder &Builder) const {
+ Builder.defineMacro("__ARM_FEATURE_COMPLEX", "1");
Builder.defineMacro("__ARM_FEATURE_JCVT", "1");
// Also include the Armv8.2 defines
getTargetDefinesARMV82A(Opts, Builder);
diff --git a/clang/lib/Basic/Targets/ARM.cpp b/clang/lib/Basic/Targets/ARM.cpp
index 437a77afdc99..be088e81cffe 100644
--- a/clang/lib/Basic/Targets/ARM.cpp
+++ b/clang/lib/Basic/Targets/ARM.cpp
@@ -580,6 +580,13 @@ void ARMTargetInfo::getTargetDefinesARMV82A(const LangOptions &Opts,
getTargetDefinesARMV81A(Opts, Builder);
}
+void ARMTargetInfo::getTargetDefinesARMV83A(const LangOptions &Opts,
+ MacroBuilder &Builder) const {
+ // Also include the ARMv8.2-A defines
+ Builder.defineMacro("__ARM_FEATURE_COMPLEX", "1");
+ getTargetDefinesARMV82A(Opts, Builder);
+}
+
void ARMTargetInfo::getTargetDefines(const LangOptions &Opts,
MacroBuilder &Builder) const {
// Target identification.
@@ -809,6 +816,11 @@ void ARMTargetInfo::getTargetDefines(const LangOptions &Opts,
case llvm::ARM::ArchKind::ARMV8_2A:
getTargetDefinesARMV82A(Opts, Builder);
break;
+ case llvm::ARM::ArchKind::ARMV8_3A:
+ case llvm::ARM::ArchKind::ARMV8_4A:
+ case llvm::ARM::ArchKind::ARMV8_5A:
+ getTargetDefinesARMV83A(Opts, Builder);
+ break;
}
}
diff --git a/clang/lib/Basic/Targets/ARM.h b/clang/lib/Basic/Targets/ARM.h
index ce87a6265934..9696a4404589 100644
--- a/clang/lib/Basic/Targets/ARM.h
+++ b/clang/lib/Basic/Targets/ARM.h
@@ -148,9 +148,10 @@ class LLVM_LIBRARY_VISIBILITY ARMTargetInfo : public TargetInfo {
void getTargetDefinesARMV81A(const LangOptions &Opts,
MacroBuilder &Builder) const;
-
void getTargetDefinesARMV82A(const LangOptions &Opts,
MacroBuilder &Builder) const;
+ void getTargetDefinesARMV83A(const LangOptions &Opts,
+ MacroBuilder &Builder) const;
void getTargetDefines(const LangOptions &Opts,
MacroBuilder &Builder) const override;
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 26044f53e496..68706d78cd11 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -4454,6 +4454,10 @@ static const NeonIntrinsicInfo ARMSIMDIntrinsicMap [] = {
NEONMAP1(vaesmcq_v, arm_neon_aesmc, 0),
NEONMAP1(vbsl_v, arm_neon_vbsl, AddRetType),
NEONMAP1(vbslq_v, arm_neon_vbsl, AddRetType),
+ NEONMAP1(vcadd_rot270_v, arm_neon_vcadd_rot270, Add1ArgType),
+ NEONMAP1(vcadd_rot90_v, arm_neon_vcadd_rot90, Add1ArgType),
+ NEONMAP1(vcaddq_rot270_v, arm_neon_vcadd_rot270, Add1ArgType),
+ NEONMAP1(vcaddq_rot90_v, arm_neon_vcadd_rot90, Add1ArgType),
NEONMAP1(vcage_v, arm_neon_vacge, 0),
NEONMAP1(vcageq_v, arm_neon_vacge, 0),
NEONMAP1(vcagt_v, arm_neon_vacgt, 0),
@@ -4727,6 +4731,10 @@ static const NeonIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
NEONMAP1(vaeseq_v, aarch64_crypto_aese, 0),
NEONMAP1(vaesimcq_v, aarch64_crypto_aesimc, 0),
NEONMAP1(vaesmcq_v, aarch64_crypto_aesmc, 0),
+ NEONMAP1(vcadd_rot270_v, aarch64_neon_vcadd_rot270, Add1ArgType),
+ NEONMAP1(vcadd_rot90_v, aarch64_neon_vcadd_rot90, Add1ArgType),
+ NEONMAP1(vcaddq_rot270_v, aarch64_neon_vcadd_rot270, Add1ArgType),
+ NEONMAP1(vcaddq_rot90_v, aarch64_neon_vcadd_rot90, Add1ArgType),
NEONMAP1(vcage_v, aarch64_neon_facge, 0),
NEONMAP1(vcageq_v, aarch64_neon_facge, 0),
NEONMAP1(vcagt_v, aarch64_neon_facgt, 0),
diff --git a/clang/test/CodeGen/aarch64-neon-vcadd.c b/clang/test/CodeGen/aarch64-neon-vcadd.c
new file mode 100644
index 000000000000..6f1b3dcd4015
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-neon-vcadd.c
@@ -0,0 +1,63 @@
+// RUN: %clang -target aarch64-arm-none-eabi -march=armv8.3-a+fp16 %s -S -emit-llvm -o - | FileCheck %s
+
+#include <arm_neon.h>
+
+void foo16x4_rot90(float16x4_t a, float16x4_t b)
+{
+// CHECK: call <4 x half> @llvm.aarch64.neon.vcadd.rot90.v4f16
+ float16x4_t result = vcadd_rot90_f16(a, b);
+}
+
+void foo32x2_rot90(float32x2_t a, float32x2_t b)
+{
+// CHECK: call <2 x float> @llvm.aarch64.neon.vcadd.rot90.v2f32
+ float32x2_t result = vcadd_rot90_f32(a, b);
+}
+
+void foo16x8_rot90(float16x8_t a, float16x8_t b)
+{
+// CHECK: call <8 x half> @llvm.aarch64.neon.vcadd.rot90.v8f16
+ float16x8_t result = vcaddq_rot90_f16(a, b);
+}
+
+void foo32x4_rot90(float32x4_t a, float32x4_t b)
+{
+// CHECK: call <4 x float> @llvm.aarch64.neon.vcadd.rot90.v4f32
+ float32x4_t result = vcaddq_rot90_f32(a, b);
+}
+
+void foo64x2_rot90(float64x2_t a, float64x2_t b)
+{
+// CHECK: call <2 x double> @llvm.aarch64.neon.vcadd.rot90.v2f64
+ float64x2_t result = vcaddq_rot90_f64(a, b);
+}
+
+void foo16x4_rot270(float16x4_t a, float16x4_t b)
+{
+// CHECK: call <4 x half> @llvm.aarch64.neon.vcadd.rot270.v4f16
+ float16x4_t result = vcadd_rot270_f16(a, b);
+}
+
+void foo32x2_rot270(float32x2_t a, float32x2_t b)
+{
+// CHECK: call <2 x float> @llvm.aarch64.neon.vcadd.rot270.v2f32
+ float32x2_t result = vcadd_rot270_f32(a, b);
+}
+
+void foo16x8_rot270(float16x8_t a, float16x8_t b)
+{
+// CHECK: call <8 x half> @llvm.aarch64.neon.vcadd.rot270.v8f16
+ float16x8_t result = vcaddq_rot270_f16(a, b);
+}
+
+void foo32x4_rot270(float32x4_t a, float32x4_t b)
+{
+// CHECK: call <4 x float> @llvm.aarch64.neon.vcadd.rot270.v4f32
+ float32x4_t result = vcaddq_rot270_f32(a, b);
+}
+
+void foo64x2_rot270(float64x2_t a, float64x2_t b)
+{
+// CHECK: call <2 x double> @llvm.aarch64.neon.vcadd.rot270.v2f64
+ float64x2_t result = vcaddq_rot270_f64(a, b);
+}
diff --git a/clang/test/CodeGen/arm-neon-vcadd.c b/clang/test/CodeGen/arm-neon-vcadd.c
new file mode 100644
index 000000000000..4b9cf34c486c
--- /dev/null
+++ b/clang/test/CodeGen/arm-neon-vcadd.c
@@ -0,0 +1,51 @@
+// RUN: %clang -target arm-arm-none-eabi -march=armv8.3-a+fp16 %s -S -emit-llvm -o - | opt -S -sroa | FileCheck %s
+
+#include <arm_neon.h>
+
+void foo16x4_rot90(float16x4_t a, float16x4_t b)
+{
+// CHECK: call <4 x half> @llvm.arm.neon.vcadd.rot90.v4f16
+ float16x4_t result = vcadd_rot90_f16(a, b);
+}
+
+void foo32x2_rot90(float32x2_t a, float32x2_t b)
+{
+// CHECK: call <2 x float> @llvm.arm.neon.vcadd.rot90.v2f32
+ float32x2_t result = vcadd_rot90_f32(a, b);
+}
+
+void foo16x8_rot90(float16x8_t a, float16x8_t b)
+{
+// CHECK: call <8 x half> @llvm.arm.neon.vcadd.rot90.v8f16
+ float16x8_t result = vcaddq_rot90_f16(a, b);
+}
+
+void foo32x4_rot90(float32x4_t a, float32x4_t b)
+{
+// CHECK: call <4 x float> @llvm.arm.neon.vcadd.rot90.v4f32
+ float32x4_t result = vcaddq_rot90_f32(a, b);
+}
+
+void foo16x4_rot270(float16x4_t a, float16x4_t b)
+{
+// CHECK: call <4 x half> @llvm.arm.neon.vcadd.rot270.v4f16
+ float16x4_t result = vcadd_rot270_f16(a, b);
+}
+
+void foo32x2_rot270(float32x2_t a, float32x2_t b)
+{
+// CHECK: call <2 x float> @llvm.arm.neon.vcadd.rot270.v2f32
+ float32x2_t result = vcadd_rot270_f32(a, b);
+}
+
+void foo16x8_rot270(float16x8_t a, float16x8_t b)
+{
+// CHECK: call <8 x half> @llvm.arm.neon.vcadd.rot270.v8f16
+ float16x8_t result = vcaddq_rot270_f16(a, b);
+}
+
+void foo32x4_rot270(float32x4_t a, float32x4_t b)
+{
+// CHECK: call <4 x float> @llvm.arm.neon.vcadd.rot270.v4f32
+ float32x4_t result = vcaddq_rot270_f32(a, b);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index c74b17c98895..d22993300eb2 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -446,6 +446,10 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
def int_aarch64_neon_fmlsl : AdvSIMD_FP16FML_Intrinsic;
def int_aarch64_neon_fmlal2 : AdvSIMD_FP16FML_Intrinsic;
def int_aarch64_neon_fmlsl2 : AdvSIMD_FP16FML_Intrinsic;
+
+ // v8.3-A Floating-point complex add
+ def int_aarch64_neon_vcadd_rot90 : AdvSIMD_2VectorArg_Intrinsic;
+ def int_aarch64_neon_vcadd_rot270 : AdvSIMD_2VectorArg_Intrinsic;
}
let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td
index 4d4bc5e2d82d..337110b4917b 100644
--- a/llvm/include/llvm/IR/IntrinsicsARM.td
+++ b/llvm/include/llvm/IR/IntrinsicsARM.td
@@ -778,6 +778,10 @@ def int_arm_vctp16 : Intrinsic<[llvm_v8i1_ty], [llvm_i32_ty], [IntrNoMem]>;
def int_arm_vctp32 : Intrinsic<[llvm_v4i1_ty], [llvm_i32_ty], [IntrNoMem]>;
def int_arm_vctp64 : Intrinsic<[llvm_v2i1_ty], [llvm_i32_ty], [IntrNoMem]>;
+// v8.3-A Floating-point complex add
+def int_arm_neon_vcadd_rot90 : Neon_2Arg_Intrinsic;
+def int_arm_neon_vcadd_rot270 : Neon_2Arg_Intrinsic;
+
// GNU eabi mcount
def int_arm_gnu_eabi_mcount : Intrinsic<[],
[],
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index ec84c1efbaf1..d619362e029e 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -757,6 +757,29 @@ defm FCADD : SIMDThreeSameVectorComplexHSD<1, 0b111, complexrotateopodd,
defm FCMLA : SIMDIndexedTiedComplexHSD<1, 0, 1, complexrotateop, "fcmla",
null_frag>;
+let Predicates = [HasComplxNum, HasNEON, HasFullFP16] in {
+ def : Pat<(v4f16 (int_aarch64_neon_vcadd_rot90 (v4f16 V64:$Rn), (v4f16 V64:$Rm))),
+ (FCADDv4f16 (v4f16 V64:$Rn), (v4f16 V64:$Rm), (i32 0))>;
+ def : Pat<(v4f16 (int_aarch64_neon_vcadd_rot270 (v4f16 V64:$Rn), (v4f16 V64:$Rm))),
+ (FCADDv4f16 (v4f16 V64:$Rn), (v4f16 V64:$Rm), (i32 1))>;
+ def : Pat<(v8f16 (int_aarch64_neon_vcadd_rot90 (v8f16 V128:$Rn), (v8f16 V128:$Rm))),
+ (FCADDv8f16 (v8f16 V128:$Rn), (v8f16 V128:$Rm), (i32 0))>;
+ def : Pat<(v8f16 (int_aarch64_neon_vcadd_rot270 (v8f16 V128:$Rn), (v8f16 V128:$Rm))),
+ (FCADDv8f16 (v8f16 V128:$Rn), (v8f16 V128:$Rm), (i32 1))>;
+}
+let Predicates = [HasComplxNum, HasNEON] in {
+ def : Pat<(v2f32 (int_aarch64_neon_vcadd_rot90 (v2f32 V64:$Rn), (v2f32 V64:$Rm))),
+ (FCADDv2f32 (v2f32 V64:$Rn), (v2f32 V64:$Rm), (i32 0))>;
+ def : Pat<(v2f32 (int_aarch64_neon_vcadd_rot270 (v2f32 V64:$Rn), (v2f32 V64:$Rm))),
+ (FCADDv2f32 (v2f32 V64:$Rn), (v2f32 V64:$Rm), (i32 1))>;
+ foreach Ty = [v4f32, v2f64] in {
+ def : Pat<(Ty (int_aarch64_neon_vcadd_rot90 (Ty V128:$Rn), (Ty V128:$Rm))),
+ (!cast<Instruction>("FCADD"#Ty) (Ty V128:$Rn), (Ty V128:$Rm), (i32 0))>;
+ def : Pat<(Ty (int_aarch64_neon_vcadd_rot270 (Ty V128:$Rn), (Ty V128:$Rm))),
+ (!cast<Instruction>("FCADD"#Ty) (Ty V128:$Rn), (Ty V128:$Rm), (i32 1))>;
+ }
+}
+
// v8.3a Pointer Authentication
// These instructions inhabit part of the hint space and so can be used for
// armv8 targets
diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td
index 1653ce1275cf..0e2b3a08b3ec 100644
--- a/llvm/lib/Target/ARM/ARMInstrNEON.td
+++ b/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -5012,6 +5012,27 @@ defm VCMLA : N3VCP8ComplexTied<1, 0, "vcmla", null_frag>;
defm VCADD : N3VCP8ComplexOdd<1, 0, 0, "vcadd", null_frag>;
defm VCMLA : N3VCP8ComplexTiedLane<0, "vcmla", null_frag>;
+let Predicates = [HasNEON,HasV8_3a,HasFullFP16] in {
+ def : Pat<(v4f16 (int_arm_neon_vcadd_rot90 (v4f16 DPR:$Rn), (v4f16 DPR:$Rm))),
+ (VCADDv4f16 (v4f16 DPR:$Rn), (v4f16 DPR:$Rm), (i32 0))>;
+ def : Pat<(v4f16 (int_arm_neon_vcadd_rot270 (v4f16 DPR:$Rn), (v4f16 DPR:$Rm))),
+ (VCADDv4f16 (v4f16 DPR:$Rn), (v4f16 DPR:$Rm), (i32 1))>;
+ def : Pat<(v8f16 (int_arm_neon_vcadd_rot90 (v8f16 QPR:$Rn), (v8f16 QPR:$Rm))),
+ (VCADDv8f16 (v8f16 QPR:$Rn), (v8f16 QPR:$Rm), (i32 0))>;
+ def : Pat<(v8f16 (int_arm_neon_vcadd_rot270 (v8f16 QPR:$Rn), (v8f16 QPR:$Rm))),
+ (VCADDv8f16 (v8f16 QPR:$Rn), (v8f16 QPR:$Rm), (i32 1))>;
+}
+let Predicates = [HasNEON,HasV8_3a] in {
+ def : Pat<(v2f32 (int_arm_neon_vcadd_rot90 (v2f32 DPR:$Rn), (v2f32 DPR:$Rm))),
+ (VCADDv2f32 (v2f32 DPR:$Rn), (v2f32 DPR:$Rm), (i32 0))>;
+ def : Pat<(v2f32 (int_arm_neon_vcadd_rot270 (v2f32 DPR:$Rn), (v2f32 DPR:$Rm))),
+ (VCADDv2f32 (v2f32 DPR:$Rn), (v2f32 DPR:$Rm), (i32 1))>;
+ def : Pat<(v4f32 (int_arm_neon_vcadd_rot90 (v4f32 QPR:$Rn), (v4f32 QPR:$Rm))),
+ (VCADDv4f32 (v4f32 QPR:$Rn), (v4f32 QPR:$Rm), (i32 0))>;
+ def : Pat<(v4f32 (int_arm_neon_vcadd_rot270 (v4f32 QPR:$Rn), (v4f32 QPR:$Rm))),
+ (VCADDv4f32 (v4f32 QPR:$Rn), (v4f32 QPR:$Rm), (i32 1))>;
+}
+
// Vector Subtract Operations.
// VSUB : Vector Subtract (integer and floating-point)
diff --git a/llvm/test/CodeGen/AArch64/neon-vcadd.ll b/llvm/test/CodeGen/AArch64/neon-vcadd.ll
new file mode 100644
index 000000000000..11605267c09b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-vcadd.ll
@@ -0,0 +1,67 @@
+; RUN: llc %s -mtriple=aarch64 -mattr=+v8.3a,+fullfp16 -o - | FileCheck %s
+
+define <4 x half> @foo16x4_rot(<4 x half> %a, <4 x half> %b) {
+entry:
+; CHECK-LABEL: foo16x4_rot
+; CHECK-DAG: fcadd v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, #90
+; CHECK-DAG: fcadd v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, #270
+ %vcadd_rot90_v2.i = tail call <4 x half> @llvm.aarch64.neon.vcadd.rot90.v4f16(<4 x half> %a, <4 x half> %b)
+ %vcadd_rot270_v2.i = tail call <4 x half> @llvm.aarch64.neon.vcadd.rot270.v4f16(<4 x half> %a, <4 x half> %b)
+ %add = fadd <4 x half> %vcadd_rot90_v2.i, %vcadd_rot270_v2.i
+ ret <4 x half> %add
+}
+
+define <2 x float> @foo32x2_rot(<2 x float> %a, <2 x float> %b) {
+entry:
+; CHECK-LABEL: foo32x2_rot
+; CHECK-DAG: fcadd v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, #90
+; CHECK-DAG: fcadd v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, #270
+ %vcadd_rot90_v2.i = tail call <2 x float> @llvm.aarch64.neon.vcadd.rot90.v2f32(<2 x float> %a, <2 x float> %b)
+ %vcadd_rot270_v2.i = tail call <2 x float> @llvm.aarch64.neon.vcadd.rot270.v2f32(<2 x float> %a, <2 x float> %b)
+ %add = fadd <2 x float> %vcadd_rot90_v2.i, %vcadd_rot270_v2.i
+ ret <2 x float> %add
+}
+
+define <8 x half> @foo16x8_rot(<8 x half> %a, <8 x half> %b) {
+entry:
+; CHECK-LABEL: foo16x8_rot
+; CHECK-DAG: fcadd v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, #90
+; CHECK-DAG: fcadd v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, #270
+ %vcaddq_rot90_v2.i = tail call <8 x half> @llvm.aarch64.neon.vcadd.rot90.v8f16(<8 x half> %a, <8 x half> %b)
+ %vcaddq_rot270_v2.i = tail call <8 x half> @llvm.aarch64.neon.vcadd.rot270.v8f16(<8 x half> %a, <8 x half> %b)
+ %add = fadd <8 x half> %vcaddq_rot90_v2.i, %vcaddq_rot270_v2.i
+ ret <8 x half> %add
+}
+
+define <4 x float> @foo32x4_rot(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: foo32x4_rot
+; CHECK-DAG: fcadd v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, #90
+; CHECK-DAG: fcadd v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, #270
+ %vcaddq_rot90_v2.i = tail call <4 x float> @llvm.aarch64.neon.vcadd.rot90.v4f32(<4 x float> %a, <4 x float> %b)
+ %vcaddq_rot270_v2.i = tail call <4 x float> @llvm.aarch64.neon.vcadd.rot270.v4f32(<4 x float> %a, <4 x float> %b)
+ %add = fadd <4 x float> %vcaddq_rot90_v2.i, %vcaddq_rot270_v2.i
+ ret <4 x float> %add
+}
+
+define <2 x double> @foo64x2_rot(<2 x double> %a, <2 x double> %b) {
+entry:
+; CHECK-LABEL: foo64x2_rot
+; CHECK-DAG: fcadd v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, #90
+; CHECK-DAG: fcadd v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, #270
+ %vcaddq_rot90_v2.i = tail call <2 x double> @llvm.aarch64.neon.vcadd.rot90.v2f64(<2 x double> %a, <2 x double> %b)
+ %vcaddq_rot270_v2.i = tail call <2 x double> @llvm.aarch64.neon.vcadd.rot270.v2f64(<2 x double> %a, <2 x double> %b)
+ %add = fadd <2 x double> %vcaddq_rot90_v2.i, %vcaddq_rot270_v2.i
+ ret <2 x double> %add
+}
+
+declare <4 x half> @llvm.aarch64.neon.vcadd.rot90.v4f16(<4 x half>, <4 x half>)
+declare <4 x half> @llvm.aarch64.neon.vcadd.rot270.v4f16(<4 x half>, <4 x half>)
+declare <2 x float> @llvm.aarch64.neon.vcadd.rot90.v2f32(<2 x float>, <2 x float>)
+declare <2 x float> @llvm.aarch64.neon.vcadd.rot270.v2f32(<2 x float>, <2 x float>)
+declare <8 x half> @llvm.aarch64.neon.vcadd.rot90.v8f16(<8 x half>, <8 x half>)
+declare <8 x half> @llvm.aarch64.neon.vcadd.rot270.v8f16(<8 x half>, <8 x half>)
+declare <4 x float> @llvm.aarch64.neon.vcadd.rot90.v4f32(<4 x float>, <4 x float>)
+declare <4 x float> @llvm.aarch64.neon.vcadd.rot270.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.aarch64.neon.vcadd.rot90.v2f64(<2 x double>, <2 x double>)
+declare <2 x double> @llvm.aarch64.neon.vcadd.rot270.v2f64(<2 x double>, <2 x double>)
diff --git a/llvm/test/CodeGen/ARM/neon-vcadd.ll b/llvm/test/CodeGen/ARM/neon-vcadd.ll
new file mode 100644
index 000000000000..93a85c8c73c6
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/neon-vcadd.ll
@@ -0,0 +1,54 @@
+; RUN: llc %s -mtriple=arm -mattr=+armv8.3-a,+fullfp16 -o - | FileCheck %s
+
+define <4 x half> @foo16x4_rot(<4 x half> %a, <4 x half> %b) {
+entry:
+; CHECK-LABEL: foo16x4_rot
+; CHECK-DAG: vcadd.f16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #90
+; CHECK-DAG: vcadd.f16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #270
+ %vcadd_rot90_v2.i = tail call <4 x half> @llvm.arm.neon.vcadd.rot90.v4f16(<4 x half> %a, <4 x half> %b)
+ %vcadd_rot270_v2.i = tail call <4 x half> @llvm.arm.neon.vcadd.rot270.v4f16(<4 x half> %a, <4 x half> %b)
+ %add = fadd <4 x half> %vcadd_rot90_v2.i, %vcadd_rot270_v2.i
+ ret <4 x half> %add
+}
+
+define <2 x float> @foo32x2_rot(<2 x float> %a, <2 x float> %b) {
+entry:
+; CHECK-LABEL: foo32x2_rot
+; CHECK-DAG: vcadd.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #90
+; CHECK-DAG: vcadd.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #270
+ %vcadd_rot90_v2.i = tail call <2 x float> @llvm.arm.neon.vcadd.rot90.v2f32(<2 x float> %a, <2 x float> %b)
+ %vcadd_rot270_v2.i = tail call <2 x float> @llvm.arm.neon.vcadd.rot270.v2f32(<2 x float> %a, <2 x float> %b)
+ %add = fadd <2 x float> %vcadd_rot90_v2.i, %vcadd_rot270_v2.i
+ ret <2 x float> %add
+}
+
+define <8 x half> @foo16x8_rot(<8 x half> %a, <8 x half> %b) {
+entry:
+; CHECK-LABEL: foo16x8_rot
+; CHECK-DAG: vcadd.f16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #90
+; CHECK-DAG: vcadd.f16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #270
+ %vcaddq_rot90_v2.i = tail call <8 x half> @llvm.arm.neon.vcadd.rot90.v8f16(<8 x half> %a, <8 x half> %b)
+ %vcaddq_rot270_v2.i = tail call <8 x half> @llvm.arm.neon.vcadd.rot270.v8f16(<8 x half> %a, <8 x half> %b)
+ %add = fadd <8 x half> %vcaddq_rot90_v2.i, %vcaddq_rot270_v2.i
+ ret <8 x half> %add
+}
+
+define <4 x float> @foo32x4_rot(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: foo32x4_rot
+; CHECK-DAG: vcadd.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #90
+; CHECK-DAG: vcadd.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #270
+ %vcaddq_rot90_v2.i = tail call <4 x float> @llvm.arm.neon.vcadd.rot90.v4f32(<4 x float> %a, <4 x float> %b)
+ %vcaddq_rot270_v2.i = tail call <4 x float> @llvm.arm.neon.vcadd.rot270.v4f32(<4 x float> %a, <4 x float> %b)
+ %add = fadd <4 x float> %vcaddq_rot90_v2.i, %vcaddq_rot270_v2.i
+ ret <4 x float> %add
+}
+
+declare <4 x half> @llvm.arm.neon.vcadd.rot90.v4f16(<4 x half>, <4 x half>)
+declare <4 x half> @llvm.arm.neon.vcadd.rot270.v4f16(<4 x half>, <4 x half>)
+declare <2 x float> @llvm.arm.neon.vcadd.rot90.v2f32(<2 x float>, <2 x float>)
+declare <2 x float> @llvm.arm.neon.vcadd.rot270.v2f32(<2 x float>, <2 x float>)
+declare <8 x half> @llvm.arm.neon.vcadd.rot90.v8f16(<8 x half>, <8 x half>)
+declare <8 x half> @llvm.arm.neon.vcadd.rot270.v8f16(<8 x half>, <8 x half>)
+declare <4 x float> @llvm.arm.neon.vcadd.rot90.v4f32(<4 x float>, <4 x float>)
+declare <4 x float> @llvm.arm.neon.vcadd.rot270.v4f32(<4 x float>, <4 x float>)
More information about the cfe-commits
mailing list