[clang] 7a85e35 - [ARM,CDE] Implement GPR CDE intrinsics
Mikhail Maltsev via cfe-commits
cfe-commits at lists.llvm.org
Fri Mar 20 07:02:38 PDT 2020
Author: Mikhail Maltsev
Date: 2020-03-20T14:01:51Z
New Revision: 7a85e3585ec59b1bfe3b08072ff042af80d07f22
URL: https://github.com/llvm/llvm-project/commit/7a85e3585ec59b1bfe3b08072ff042af80d07f22
DIFF: https://github.com/llvm/llvm-project/commit/7a85e3585ec59b1bfe3b08072ff042af80d07f22.diff
LOG: [ARM,CDE] Implement GPR CDE intrinsics
Summary:
This change implements ACLE CDE intrinsics that translate to
instructions working with general-purpose registers.
The specification is available at
https://static.docs.arm.com/101028/0010/ACLE_2019Q4_release-0010.pdf
Each ACLE intrinsic gets a corresponding LLVM IR intrinsic (because
they have distinct function prototypes). Dual-register operands are
represented as pairs of i32 values. Because of this the instruction
selection for these intrinsics cannot be represented as TableGen
patterns and requires custom C++ code.
Reviewers: simon_tatham, MarkMurrayARM, dmgreen, ostannard
Reviewed By: MarkMurrayARM
Subscribers: kristof.beyls, hiraditya, danielkiss, cfe-commits
Tags: #clang
Differential Revision: https://reviews.llvm.org/D76296
Added:
llvm/test/CodeGen/Thumb2/cde-gpr.ll
Modified:
clang/include/clang/Basic/arm_cde.td
clang/test/CodeGen/arm-cde-gpr.c
clang/test/Sema/arm-cde-immediates.c
llvm/include/llvm/IR/IntrinsicsARM.td
llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
llvm/lib/Target/ARM/ARMInstrCDE.td
Removed:
################################################################################
diff --git a/clang/include/clang/Basic/arm_cde.td b/clang/include/clang/Basic/arm_cde.td
index 222b63966a38..139007d387a0 100644
--- a/clang/include/clang/Basic/arm_cde.td
+++ b/clang/include/clang/Basic/arm_cde.td
@@ -13,6 +13,7 @@
include "arm_mve_defs.td"
+// ACLE CDE intrinsic
class CDEIntrinsic<Type ret, dag args, dag codegen>
: Intrinsic<ret, args, codegen> {
let builtinExtension = "cde";
@@ -40,6 +41,32 @@ def imm_11b : CDEImmediateBits<11>;
def imm_12b : CDEImmediateBits<12>;
def imm_13b : CDEImmediateBits<13>;
-let pnt = PNT_None, params = T.None in
-def cx1 : CDEIntrinsic<u32, (args imm_coproc:$cp, imm_13b:$imm),
- (CDEIRInt<"cx1"> $cp, $imm)>;
+// CX* instructions operating on GPRs
+multiclass CDE_CX_m<dag argsImm, dag argsReg, dag cgArgs> {
+ defvar cp = (args imm_coproc:$cp);
+ let pnt = PNT_None, params = T.None in {
+ def "" : CDEIntrinsic<u32, !con(cp, argsReg, argsImm),
+ !con((CDEIRInt<NAME> $cp), cgArgs, (? $imm))>;
+ def a : CDEIntrinsic<u32, !con(cp, (args u32:$acc), argsReg, argsImm),
+ !con((CDEIRInt<NAME # "a"> $cp, $acc),
+ cgArgs, (? $imm))>;
+
+ def d :
+ CDEIntrinsic<u64, !con(cp, argsReg, argsImm),
+ (seq !con((CDEIRInt<NAME # "d"> $cp), cgArgs, (? $imm)):$pair,
+ (or (shl (u64 (xval $pair, 1)), (u64 32)),
+ (u64 (xval $pair, 0))))>;
+ def da :
+ CDEIntrinsic<u64, !con(cp, (args u64:$acc), argsReg, argsImm),
+ (seq (u32 (lshr $acc, (u64 32))):$acc_hi,
+ (u32 $acc):$acc_lo,
+ !con((CDEIRInt<NAME # "da"> $cp, $acc_lo, $acc_hi), cgArgs,
+ (? $imm)):$pair,
+ (or (shl (u64 (xval $pair, 1)), (u64 32)),
+ (u64 (xval $pair, 0))))>;
+ }
+}
+
+defm cx1 : CDE_CX_m<(args imm_13b:$imm), (args), (?)>;
+defm cx2 : CDE_CX_m<(args imm_9b:$imm), (args u32:$n), (? $n)>;
+defm cx3 : CDE_CX_m<(args imm_6b:$imm), (args u32:$n, u32:$m), (? $n, $m)>;
diff --git a/clang/test/CodeGen/arm-cde-gpr.c b/clang/test/CodeGen/arm-cde-gpr.c
index 9a24b1540b67..1e6893d7d2f8 100644
--- a/clang/test/CodeGen/arm-cde-gpr.c
+++ b/clang/test/CodeGen/arm-cde-gpr.c
@@ -11,6 +11,150 @@
// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.cde.cx1(i32 0, i32 123)
// CHECK-NEXT: ret i32 [[TMP0]]
//
-uint32_t test_cx1() {
+uint32_t test_cx1(void) {
return __arm_cx1(0, 123);
}
+
+// CHECK-LABEL: @test_cx1a(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.cde.cx1a(i32 0, i32 [[ACC:%.*]], i32 345)
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+uint32_t test_cx1a(uint32_t acc) {
+ return __arm_cx1a(0, acc, 345);
+}
+
+// CHECK-LABEL: @test_cx1d(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.cde.cx1d(i32 1, i32 567)
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 32
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]]
+// CHECK-NEXT: ret i64 [[TMP6]]
+//
+uint64_t test_cx1d(void) {
+ return __arm_cx1d(1, 567);
+}
+
+// CHECK-LABEL: @test_cx1da(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[ACC:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[ACC]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.cde.cx1da(i32 0, i32 [[TMP2]], i32 [[TMP1]], i32 789)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT: ret i64 [[TMP9]]
+//
+uint64_t test_cx1da(uint64_t acc) {
+ return __arm_cx1da(0, acc, 789);
+}
+
+// CHECK-LABEL: @test_cx2(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.cde.cx2(i32 0, i32 [[N:%.*]], i32 11)
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+uint32_t test_cx2(uint32_t n) {
+ return __arm_cx2(0, n, 11);
+}
+
+// CHECK-LABEL: @test_cx2a(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.cde.cx2a(i32 1, i32 [[ACC:%.*]], i32 [[N:%.*]], i32 22)
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+uint32_t test_cx2a(uint32_t acc, uint32_t n) {
+ return __arm_cx2a(1, acc, n, 22);
+}
+
+// CHECK-LABEL: @test_cx2d(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.cde.cx2d(i32 1, i32 [[N:%.*]], i32 33)
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 32
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]]
+// CHECK-NEXT: ret i64 [[TMP6]]
+//
+uint64_t test_cx2d(uint32_t n) {
+ return __arm_cx2d(1, n, 33);
+}
+
+// CHECK-LABEL: @test_cx2da(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[ACC:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[ACC]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.cde.cx2da(i32 0, i32 [[TMP2]], i32 [[TMP1]], i32 [[N:%.*]], i32 44)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT: ret i64 [[TMP9]]
+//
+uint64_t test_cx2da(uint64_t acc, uint32_t n) {
+ return __arm_cx2da(0, acc, n, 44);
+}
+
+// CHECK-LABEL: @test_cx3(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.cde.cx3(i32 0, i32 [[N:%.*]], i32 [[M:%.*]], i32 1)
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+uint32_t test_cx3(uint32_t n, uint32_t m) {
+ return __arm_cx3(0, n, m, 1);
+}
+
+// CHECK-LABEL: @test_cx3a(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.cde.cx3a(i32 1, i32 [[ACC:%.*]], i32 [[N:%.*]], i32 [[M:%.*]], i32 2)
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+uint32_t test_cx3a(uint32_t acc, uint32_t n, uint32_t m) {
+ return __arm_cx3a(1, acc, n, m, 2);
+}
+
+// CHECK-LABEL: @test_cx3d(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.cde.cx3d(i32 1, i32 [[N:%.*]], i32 [[M:%.*]], i32 3)
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 32
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]]
+// CHECK-NEXT: ret i64 [[TMP6]]
+//
+uint64_t test_cx3d(uint32_t n, uint32_t m) {
+ return __arm_cx3d(1, n, m, 3);
+}
+
+// CHECK-LABEL: @test_cx3da(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[ACC:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[ACC]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.cde.cx3da(i32 0, i32 [[TMP2]], i32 [[TMP1]], i32 [[N:%.*]], i32 [[M:%.*]], i32 4)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT: ret i64 [[TMP9]]
+//
+uint64_t test_cx3da(uint64_t acc, uint32_t n, uint32_t m) {
+ return __arm_cx3da(0, acc, n, m, 4);
+}
diff --git a/clang/test/Sema/arm-cde-immediates.c b/clang/test/Sema/arm-cde-immediates.c
index bbc13668a2a1..d521e099c7d1 100644
--- a/clang/test/Sema/arm-cde-immediates.c
+++ b/clang/test/Sema/arm-cde-immediates.c
@@ -4,37 +4,62 @@
#include <arm_acle.h>
void test_coproc_gcp_instr(int a) {
- __builtin_arm_cdp(0, 2, 3, 4, 5, 6); // expected-error {{coprocessor 0 must be configured as GCP}}
- __builtin_arm_cdp2(0, 2, 3, 4, 5, 6); // expected-error {{coprocessor 0 must be configured as GCP}}
- __builtin_arm_mcr(0, 0, a, 13, 0, 3); // expected-error {{coprocessor 0 must be configured as GCP}}
+ __builtin_arm_cdp(0, 2, 3, 4, 5, 6); // expected-error {{coprocessor 0 must be configured as GCP}}
+ __builtin_arm_cdp2(0, 2, 3, 4, 5, 6); // expected-error {{coprocessor 0 must be configured as GCP}}
+ __builtin_arm_mcr(0, 0, a, 13, 0, 3); // expected-error {{coprocessor 0 must be configured as GCP}}
__builtin_arm_mcr2(0, 0, a, 13, 0, 3); // expected-error {{coprocessor 0 must be configured as GCP}}
- __builtin_arm_mrc(0, 0, 13, 0, 3); // expected-error {{coprocessor 0 must be configured as GCP}}
- __builtin_arm_mrc2(0, 0, 13, 0, 3); // expected-error {{coprocessor 0 must be configured as GCP}}
- __builtin_arm_mcrr(0, 0, a, 0); // expected-error {{coprocessor 0 must be configured as GCP}}
- __builtin_arm_mcrr2(0, 0, a, 0); // expected-error {{coprocessor 0 must be configured as GCP}}
- __builtin_arm_mrrc(0, 0, 0); // expected-error {{coprocessor 0 must be configured as GCP}}
- __builtin_arm_mrrc2(0, 0, 0); // expected-error {{coprocessor 0 must be configured as GCP}}
- __builtin_arm_ldc(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}}
- __builtin_arm_ldcl(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}}
- __builtin_arm_ldc2(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}}
- __builtin_arm_ldc2l(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}}
- __builtin_arm_stc(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}}
- __builtin_arm_stcl(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}}
- __builtin_arm_stc2(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}}
- __builtin_arm_stc2l(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}}
+ __builtin_arm_mrc(0, 0, 13, 0, 3); // expected-error {{coprocessor 0 must be configured as GCP}}
+ __builtin_arm_mrc2(0, 0, 13, 0, 3); // expected-error {{coprocessor 0 must be configured as GCP}}
+ __builtin_arm_mcrr(0, 0, a, 0); // expected-error {{coprocessor 0 must be configured as GCP}}
+ __builtin_arm_mcrr2(0, 0, a, 0); // expected-error {{coprocessor 0 must be configured as GCP}}
+ __builtin_arm_mrrc(0, 0, 0); // expected-error {{coprocessor 0 must be configured as GCP}}
+ __builtin_arm_mrrc2(0, 0, 0); // expected-error {{coprocessor 0 must be configured as GCP}}
+ __builtin_arm_ldc(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}}
+ __builtin_arm_ldcl(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}}
+ __builtin_arm_ldc2(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}}
+ __builtin_arm_ldc2l(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}}
+ __builtin_arm_stc(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}}
+ __builtin_arm_stcl(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}}
+ __builtin_arm_stc2(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}}
+ __builtin_arm_stc2l(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}}
}
void test_coproc(uint32_t a) {
(void)__arm_cx1(0, 0);
- __arm_cx1(a, 0); // expected-error {{argument to '__arm_cx1' must be a constant integer}}
+ __arm_cx1(a, 0); // expected-error {{argument to '__arm_cx1' must be a constant integer}}
__arm_cx1(-1, 0); // expected-error {{argument value -1 is outside the valid range [0, 7]}}
__arm_cx1(8, 0); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
- __arm_cx1(1, 0); // expected-error {{coprocessor 1 must be configured as CDE}}
+ __arm_cx1(1, 0); // expected-error {{coprocessor 1 must be configured as CDE}}
}
-void test_cx(uint32_t a) {
+void test_cx(uint32_t a, uint64_t da, uint32_t n, uint32_t m) {
(void)__arm_cx1(0, 0);
- __arm_cx1(a, 0); // expected-error {{argument to '__arm_cx1' must be a constant integer}}
- __arm_cx1(0, a); // expected-error {{argument to '__arm_cx1' must be a constant integer}}
- __arm_cx1(0, 8192); // expected-error {{argument value 8192 is outside the valid range [0, 8191]}}
+ __arm_cx1(0, a); // expected-error {{argument to '__arm_cx1' must be a constant integer}}
+ __arm_cx1(0, 8192); // expected-error {{argument value 8192 is outside the valid range [0, 8191]}}
+ __arm_cx1a(0, a, a); // expected-error {{argument to '__arm_cx1a' must be a constant integer}}
+ __arm_cx1a(0, a, 8192); // expected-error {{argument value 8192 is outside the valid range [0, 8191]}}
+ __arm_cx1d(0, a); // expected-error {{argument to '__arm_cx1d' must be a constant integer}}
+ __arm_cx1d(0, 8192); // expected-error {{argument value 8192 is outside the valid range [0, 8191]}}
+ __arm_cx1da(0, da, a); // expected-error {{argument to '__arm_cx1da' must be a constant integer}}
+ __arm_cx1da(0, da, 8192); // expected-error {{argument value 8192 is outside the valid range [0, 8191]}}
+
+ (void)__arm_cx2(0, n, 0);
+ __arm_cx2(0, n, a); // expected-error {{argument to '__arm_cx2' must be a constant integer}}
+ __arm_cx2(0, n, 512); // expected-error {{argument value 512 is outside the valid range [0, 511]}}
+ __arm_cx2a(0, a, n, a); // expected-error {{argument to '__arm_cx2a' must be a constant integer}}
+ __arm_cx2a(0, a, n, 512); // expected-error {{argument value 512 is outside the valid range [0, 511]}}
+ __arm_cx2d(0, n, a); // expected-error {{argument to '__arm_cx2d' must be a constant integer}}
+ __arm_cx2d(0, n, 512); // expected-error {{argument value 512 is outside the valid range [0, 511]}}
+ __arm_cx2da(0, da, n, a); // expected-error {{argument to '__arm_cx2da' must be a constant integer}}
+ __arm_cx2da(0, da, n, 512); // expected-error {{argument value 512 is outside the valid range [0, 511]}}
+
+ (void)__arm_cx3(0, n, m, 0);
+ __arm_cx3(0, n, m, a); // expected-error {{argument to '__arm_cx3' must be a constant integer}}
+ __arm_cx3(0, n, m, 64); // expected-error {{argument value 64 is outside the valid range [0, 63]}}
+ __arm_cx3a(0, a, n, m, a); // expected-error {{argument to '__arm_cx3a' must be a constant integer}}
+ __arm_cx3a(0, a, n, m, 64); // expected-error {{argument value 64 is outside the valid range [0, 63]}}
+ __arm_cx3d(0, n, m, a); // expected-error {{argument to '__arm_cx3d' must be a constant integer}}
+ __arm_cx3d(0, n, m, 64); // expected-error {{argument value 64 is outside the valid range [0, 63]}}
+ __arm_cx3da(0, da, n, m, a); // expected-error {{argument to '__arm_cx3da' must be a constant integer}}
+ __arm_cx3da(0, da, n, m, 64); // expected-error {{argument value 64 is outside the valid range [0, 63]}}
}
diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td
index 80ed0792a209..ba0cf909e5de 100644
--- a/llvm/include/llvm/IR/IntrinsicsARM.td
+++ b/llvm/include/llvm/IR/IntrinsicsARM.td
@@ -1275,9 +1275,30 @@ defm int_arm_mve_vqdmlad: MVEPredicated<[llvm_anyvector_ty],
// CDE (Custom Datapath Extension)
-def int_arm_cde_cx1: Intrinsic<
- [llvm_i32_ty],
- [llvm_i32_ty /* coproc */, llvm_i32_ty /* imm */],
- [IntrNoMem, ImmArg<0>, ImmArg<1>]>;
+multiclass CDEGPRIntrinsics<list<LLVMType> args> {
+ def "" : Intrinsic<
+ [llvm_i32_ty],
+ !listconcat([llvm_i32_ty /* coproc */], args, [llvm_i32_ty /* imm */]),
+ [IntrNoMem, ImmArg<0>, ImmArg<!add(!size(args), 1)>]>;
+ def a : Intrinsic<
+ [llvm_i32_ty],
+ !listconcat([llvm_i32_ty /* coproc */, llvm_i32_ty /* acc */], args,
+ [llvm_i32_ty /* imm */]),
+ [IntrNoMem, ImmArg<0>, ImmArg<!add(!size(args), 2)>]>;
+
+ def d: Intrinsic<
+ [llvm_i32_ty /* lo */, llvm_i32_ty /* hi */],
+ !listconcat([llvm_i32_ty /* coproc */], args, [llvm_i32_ty /* imm */]),
+ [IntrNoMem, ImmArg<0>, ImmArg<!add(!size(args), 1)>]>;
+ def da: Intrinsic<
+ [llvm_i32_ty /* lo */, llvm_i32_ty /* hi */],
+ !listconcat([llvm_i32_ty /* coproc */, llvm_i32_ty /* acc_lo */,
+ llvm_i32_ty /* acc_hi */], args, [llvm_i32_ty /* imm */]),
+ [IntrNoMem, ImmArg<0>, ImmArg<!add(!size(args), 3)>]>;
+}
+
+defm int_arm_cde_cx1: CDEGPRIntrinsics<[]>;
+defm int_arm_cde_cx2: CDEGPRIntrinsics<[llvm_i32_ty]>;
+defm int_arm_cde_cx3: CDEGPRIntrinsics<[llvm_i32_ty, llvm_i32_ty]>;
} // end TargetPrefix
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 6be6f5da6379..b334f4156559 100644
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -277,6 +277,15 @@ class ARMDAGToDAGISel : public SelectionDAGISel {
void SelectMVE_VxDUP(SDNode *N, const uint16_t *Opcodes,
bool Wrapping, bool Predicated);
+ /// SelectCDE_CXxD - Select CDE dual-GPR instruction (one of CX1D,
+ /// CX1DA, CX2D, CX2DA, CX3D, CX3DA).
+ /// \arg \c NumExtraOps number of extra operands besides the coprocessor,
+ /// the accumulator and the immediate operand, i.e. 0
+ /// for CX1*, 1 for CX2*, 2 for CX3*
+ /// \arg \c HasAccum whether the instruction has an accumulator operand
+ void SelectCDE_CXxD(SDNode *N, uint16_t Opcode, size_t NumExtraOps,
+ bool HasAccum);
+
/// SelectVLDDup - Select NEON load-duplicate intrinsics. NumVecs
/// should be 1, 2, 3 or 4. The opcode array specifies the instructions used
/// for loading D registers.
@@ -2809,6 +2818,69 @@ void ARMDAGToDAGISel::SelectMVE_VxDUP(SDNode *N, const uint16_t *Opcodes,
CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops));
}
+void ARMDAGToDAGISel::SelectCDE_CXxD(SDNode *N, uint16_t Opcode,
+ size_t NumExtraOps, bool HasAccum) {
+ bool IsBigEndian = CurDAG->getDataLayout().isBigEndian();
+ SDLoc Loc(N);
+ SmallVector<SDValue, 8> Ops;
+
+ unsigned OpIdx = 1;
+
+ // Convert and append the immediate operand designating the coprocessor.
+ SDValue ImmCorpoc = N->getOperand(OpIdx++);
+ uint32_t ImmCoprocVal = cast<ConstantSDNode>(ImmCorpoc)->getZExtValue();
+ Ops.push_back(getI32Imm(ImmCoprocVal, Loc));
+
+ // For accumulating variants copy the low and high order parts of the
+ // accumulator into a register pair and add it to the operand vector.
+ if (HasAccum) {
+ SDValue AccLo = N->getOperand(OpIdx++);
+ SDValue AccHi = N->getOperand(OpIdx++);
+ if (IsBigEndian)
+ std::swap(AccLo, AccHi);
+ Ops.push_back(SDValue(createGPRPairNode(MVT::Untyped, AccLo, AccHi), 0));
+ }
+
+ // Copy extra operands as-is.
+ for (size_t I = 0; I < NumExtraOps; I++)
+ Ops.push_back(N->getOperand(OpIdx++));
+
+ // Convert and append the immediate operand
+ SDValue Imm = N->getOperand(OpIdx);
+ uint32_t ImmVal = cast<ConstantSDNode>(Imm)->getZExtValue();
+ Ops.push_back(getI32Imm(ImmVal, Loc));
+
+ // Accumulating variants are IT-predicable, add predicate operands.
+ if (HasAccum) {
+ SDValue Pred = getAL(CurDAG, Loc);
+ SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
+ Ops.push_back(Pred);
+ Ops.push_back(PredReg);
+ }
+
+ // Create the CDE instruction.
+ SDNode *InstrNode = CurDAG->getMachineNode(Opcode, Loc, MVT::Untyped, Ops);
+ SDValue ResultPair = SDValue(InstrNode, 0);
+
+ // The original intrinsic had two outputs, and the output of the dual-register
+ // CDE instruction is a register pair. We need to extract the two subregisters
+ // and replace all uses of the original outputs with the extracted
+ // subregisters.
+ uint16_t SubRegs[2] = {ARM::gsub_0, ARM::gsub_1};
+ if (IsBigEndian)
+ std::swap(SubRegs[0], SubRegs[1]);
+
+ for (size_t ResIdx = 0; ResIdx < 2; ResIdx++) {
+ if (SDValue(N, ResIdx).use_empty())
+ continue;
+ SDValue SubReg = CurDAG->getTargetExtractSubreg(SubRegs[ResIdx], Loc,
+ MVT::i32, ResultPair);
+ ReplaceUses(SDValue(N, ResIdx), SubReg);
+ }
+
+ CurDAG->RemoveDeadNode(N);
+}
+
void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool IsIntrinsic,
bool isUpdating, unsigned NumVecs,
const uint16_t *DOpcodes,
@@ -4773,6 +4845,40 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
IntNo == Intrinsic::arm_mve_vdwdup_predicated);
return;
}
+
+ case Intrinsic::arm_cde_cx1d:
+ case Intrinsic::arm_cde_cx1da:
+ case Intrinsic::arm_cde_cx2d:
+ case Intrinsic::arm_cde_cx2da:
+ case Intrinsic::arm_cde_cx3d:
+ case Intrinsic::arm_cde_cx3da: {
+ bool HasAccum = IntNo == Intrinsic::arm_cde_cx1da ||
+ IntNo == Intrinsic::arm_cde_cx2da ||
+ IntNo == Intrinsic::arm_cde_cx3da;
+ size_t NumExtraOps;
+ uint16_t Opcode;
+ switch (IntNo) {
+ case Intrinsic::arm_cde_cx1d:
+ case Intrinsic::arm_cde_cx1da:
+ NumExtraOps = 0;
+ Opcode = HasAccum ? ARM::CDE_CX1DA : ARM::CDE_CX1D;
+ break;
+ case Intrinsic::arm_cde_cx2d:
+ case Intrinsic::arm_cde_cx2da:
+ NumExtraOps = 1;
+ Opcode = HasAccum ? ARM::CDE_CX2DA : ARM::CDE_CX2D;
+ break;
+ case Intrinsic::arm_cde_cx3d:
+ case Intrinsic::arm_cde_cx3da:
+ NumExtraOps = 2;
+ Opcode = HasAccum ? ARM::CDE_CX3DA : ARM::CDE_CX3D;
+ break;
+ default:
+ llvm_unreachable("Unexpected opcode");
+ }
+ SelectCDE_CXxD(N, Opcode, NumExtraOps, HasAccum);
+ return;
+ }
}
break;
}
diff --git a/llvm/lib/Target/ARM/ARMInstrCDE.td b/llvm/lib/Target/ARM/ARMInstrCDE.td
index fb02e9fefd8c..648911acd26c 100644
--- a/llvm/lib/Target/ARM/ARMInstrCDE.td
+++ b/llvm/lib/Target/ARM/ARMInstrCDE.td
@@ -215,6 +215,35 @@ def CDE_CX3A : CDE_CX3_Instr<"cx3a", cde_cx_params_single_acc>;
def CDE_CX3D : CDE_CX3_Instr<"cx3d", cde_cx_params_dual_noacc>;
def CDE_CX3DA : CDE_CX3_Instr<"cx3da", cde_cx_params_dual_acc>;
+let Predicates = [HasCDE] in {
+ def : Pat<(i32 (int_arm_cde_cx1 timm:$coproc, timm:$imm)),
+ (i32 (CDE_CX1 p_imm:$coproc, imm_13b:$imm))>;
+ def : Pat<(i32 (int_arm_cde_cx1a timm:$coproc, GPRwithAPSR_NZCVnosp:$acc,
+ timm:$imm)),
+ (i32 (CDE_CX1A p_imm:$coproc, GPRwithAPSR_NZCVnosp:$acc,
+ imm_13b:$imm))>;
+ def : Pat<(i32 (int_arm_cde_cx2 timm:$coproc, GPRwithAPSR_NZCVnosp:$n,
+ timm:$imm)),
+ (i32 (CDE_CX2 p_imm:$coproc, GPRwithAPSR_NZCVnosp:$n,
+ imm_9b:$imm))>;
+ def : Pat<(i32 (int_arm_cde_cx2a timm:$coproc, GPRwithAPSR_NZCVnosp:$acc,
+ GPRwithAPSR_NZCVnosp:$n, timm:$imm)),
+ (i32 (CDE_CX2A p_imm:$coproc, GPRwithAPSR_NZCVnosp:$acc,
+ GPRwithAPSR_NZCVnosp:$n, imm_9b:$imm))>;
+ def : Pat<(i32 (int_arm_cde_cx3 timm:$coproc, GPRwithAPSR_NZCVnosp:$n,
+ GPRwithAPSR_NZCVnosp:$m, timm:$imm)),
+ (i32 (CDE_CX3 p_imm:$coproc, GPRwithAPSR_NZCVnosp:$n,
+ GPRwithAPSR_NZCVnosp:$m, imm_6b:$imm))>;
+ def : Pat<(i32 (int_arm_cde_cx3a timm:$coproc,
+ GPRwithAPSR_NZCVnosp:$acc,
+ GPRwithAPSR_NZCVnosp:$n,
+ GPRwithAPSR_NZCVnosp:$m, timm:$imm)),
+ (i32 (CDE_CX3A p_imm:$coproc,
+ GPRwithAPSR_NZCVnosp:$acc,
+ GPRwithAPSR_NZCVnosp:$n,
+ GPRwithAPSR_NZCVnosp:$m, imm_6b:$imm))>;
+}
+
class CDE_RequiresSReg : Requires<[HasCDE, HasFPRegs]>;
class CDE_RequiresDReg : Requires<[HasCDE, HasFPRegs]>;
class CDE_RequiresQReg : Requires<[HasCDE, HasMVEInt]>;
diff --git a/llvm/test/CodeGen/Thumb2/cde-gpr.ll b/llvm/test/CodeGen/Thumb2/cde-gpr.ll
new file mode 100644
index 000000000000..19052125c14b
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/cde-gpr.ll
@@ -0,0 +1,189 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+cdecp0 -mattr=+cdecp1 -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -mtriple=thumbebv8.1m.main -mattr=+cdecp0 -mattr=+cdecp1 -verify-machineinstrs -o - %s | FileCheck %s
+
+declare i32 @llvm.arm.cde.cx1(i32 immarg, i32 immarg)
+declare i32 @llvm.arm.cde.cx1a(i32 immarg, i32, i32 immarg)
+declare { i32, i32 } @llvm.arm.cde.cx1d(i32 immarg, i32 immarg)
+declare { i32, i32 } @llvm.arm.cde.cx1da(i32 immarg, i32, i32, i32 immarg)
+
+declare i32 @llvm.arm.cde.cx2(i32 immarg, i32, i32 immarg)
+declare i32 @llvm.arm.cde.cx2a(i32 immarg, i32, i32, i32 immarg)
+declare { i32, i32 } @llvm.arm.cde.cx2d(i32 immarg, i32, i32 immarg)
+declare { i32, i32 } @llvm.arm.cde.cx2da(i32 immarg, i32, i32, i32, i32 immarg)
+
+declare i32 @llvm.arm.cde.cx3(i32 immarg, i32, i32, i32 immarg)
+declare i32 @llvm.arm.cde.cx3a(i32 immarg, i32, i32, i32, i32 immarg)
+declare { i32, i32 } @llvm.arm.cde.cx3d(i32 immarg, i32, i32, i32 immarg)
+declare { i32, i32 } @llvm.arm.cde.cx3da(i32 immarg, i32, i32, i32, i32, i32 immarg)
+
+define arm_aapcs_vfpcc i32 @test_cx1() {
+; CHECK-LABEL: test_cx1:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: cx1 p0, r0, #123
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.cde.cx1(i32 0, i32 123)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_cx1a(i32 %acc) {
+; CHECK-LABEL: test_cx1a:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: cx1a p0, r0, #345
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.cde.cx1a(i32 0, i32 %acc, i32 345)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i64 @test_cx1d() {
+; CHECK-LABEL: test_cx1d:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: cx1d p1, r0, r1, #567
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call { i32, i32 } @llvm.arm.cde.cx1d(i32 1, i32 567)
+ %1 = extractvalue { i32, i32 } %0, 1
+ %2 = zext i32 %1 to i64
+ %3 = shl i64 %2, 32
+ %4 = extractvalue { i32, i32 } %0, 0
+ %5 = zext i32 %4 to i64
+ %6 = or i64 %3, %5
+ ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_cx1da(i64 %acc) {
+; CHECK-LABEL: test_cx1da:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: @ kill: def $r1 killed $r1 killed $r0_r1 def $r0_r1
+; CHECK-NEXT: @ kill: def $r0 killed $r0 killed $r0_r1 def $r0_r1
+; CHECK-NEXT: cx1da p0, r0, r1, #789
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %acc, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %acc to i32
+ %3 = call { i32, i32 } @llvm.arm.cde.cx1da(i32 0, i32 %2, i32 %1, i32 789)
+ %4 = extractvalue { i32, i32 } %3, 1
+ %5 = zext i32 %4 to i64
+ %6 = shl i64 %5, 32
+ %7 = extractvalue { i32, i32 } %3, 0
+ %8 = zext i32 %7 to i64
+ %9 = or i64 %6, %8
+ ret i64 %9
+}
+
+define arm_aapcs_vfpcc i32 @test_cx2(i32 %n) {
+; CHECK-LABEL: test_cx2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: cx2 p0, r0, r0, #11
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.cde.cx2(i32 0, i32 %n, i32 11)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_cx2a(i32 %acc, i32 %n) {
+; CHECK-LABEL: test_cx2a:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: cx2a p1, r0, r1, #22
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.cde.cx2a(i32 1, i32 %acc, i32 %n, i32 22)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i64 @test_cx2d(i32 %n) #0 {
+; CHECK-LABEL: test_cx2d:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: cx2d p1, r0, r1, r0, #33
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call { i32, i32 } @llvm.arm.cde.cx2d(i32 1, i32 %n, i32 33)
+ %1 = extractvalue { i32, i32 } %0, 1
+ %2 = zext i32 %1 to i64
+ %3 = shl i64 %2, 32
+ %4 = extractvalue { i32, i32 } %0, 0
+ %5 = zext i32 %4 to i64
+ %6 = or i64 %3, %5
+ ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_cx2da(i64 %acc, i32 %n) {
+; CHECK-LABEL: test_cx2da:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: @ kill: def $r1 killed $r1 killed $r0_r1 def $r0_r1
+; CHECK-NEXT: @ kill: def $r0 killed $r0 killed $r0_r1 def $r0_r1
+; CHECK-NEXT: cx2da p0, r0, r1, r2, #44
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %acc, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %acc to i32
+ %3 = call { i32, i32 } @llvm.arm.cde.cx2da(i32 0, i32 %2, i32 %1, i32 %n, i32 44)
+ %4 = extractvalue { i32, i32 } %3, 1
+ %5 = zext i32 %4 to i64
+ %6 = shl i64 %5, 32
+ %7 = extractvalue { i32, i32 } %3, 0
+ %8 = zext i32 %7 to i64
+ %9 = or i64 %6, %8
+ ret i64 %9
+}
+
+define arm_aapcs_vfpcc i32 @test_cx3(i32 %n, i32 %m) {
+; CHECK-LABEL: test_cx3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: cx3 p0, r0, r0, r1, #1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.cde.cx3(i32 0, i32 %n, i32 %m, i32 1)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_cx3a(i32 %acc, i32 %n, i32 %m) {
+; CHECK-LABEL: test_cx3a:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: cx3a p1, r0, r1, r2, #2
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.cde.cx3a(i32 1, i32 %acc, i32 %n, i32 %m, i32 2)
+ ret i32 %0
+}
+
+define arm_aapcs_vfpcc i64 @test_cx3d(i32 %n, i32 %m) {
+; CHECK-LABEL: test_cx3d:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: cx3d p1, r0, r1, r0, r1, #3
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call { i32, i32 } @llvm.arm.cde.cx3d(i32 1, i32 %n, i32 %m, i32 3)
+ %1 = extractvalue { i32, i32 } %0, 1
+ %2 = zext i32 %1 to i64
+ %3 = shl i64 %2, 32
+ %4 = extractvalue { i32, i32 } %0, 0
+ %5 = zext i32 %4 to i64
+ %6 = or i64 %3, %5
+ ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_cx3da(i64 %acc, i32 %n, i32 %m) {
+; CHECK-LABEL: test_cx3da:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: @ kill: def $r1 killed $r1 killed $r0_r1 def $r0_r1
+; CHECK-NEXT: @ kill: def $r0 killed $r0 killed $r0_r1 def $r0_r1
+; CHECK-NEXT: cx3da p0, r0, r1, r2, r3, #4
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %acc, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %acc to i32
+ %3 = call { i32, i32 } @llvm.arm.cde.cx3da(i32 0, i32 %2, i32 %1, i32 %n, i32 %m, i32 4)
+ %4 = extractvalue { i32, i32 } %3, 1
+ %5 = zext i32 %4 to i64
+ %6 = shl i64 %5, 32
+ %7 = extractvalue { i32, i32 } %3, 0
+ %8 = zext i32 %7 to i64
+ %9 = or i64 %6, %8
+ ret i64 %9
+}
More information about the cfe-commits
mailing list