[llvm] 7a85e35 - [ARM,CDE] Implement GPR CDE intrinsics

Fri Mar 20 07:02:40 PDT 2020

Author: Mikhail Maltsev
Date: 2020-03-20T14:01:51Z
New Revision: 7a85e3585ec59b1bfe3b08072ff042af80d07f22

URL: https://github.com/llvm/llvm-project/commit/7a85e3585ec59b1bfe3b08072ff042af80d07f22
DIFF: https://github.com/llvm/llvm-project/commit/7a85e3585ec59b1bfe3b08072ff042af80d07f22.diff

LOG: [ARM,CDE] Implement GPR CDE intrinsics

Summary:
This change implements ACLE CDE intrinsics that translate to
instructions working with general-purpose registers.

The specification is available at
https://static.docs.arm.com/101028/0010/ACLE_2019Q4_release-0010.pdf

Each ACLE intrinsic gets a corresponding LLVM IR intrinsic (because
they have distinct function prototypes). Dual-register operands are
represented as pairs of i32 values. Because of this, instruction
selection for the dual-register intrinsics cannot be represented as
TableGen patterns and requires custom C++ code.

Reviewers: simon_tatham, MarkMurrayARM, dmgreen, ostannard

Reviewed By: MarkMurrayARM

Subscribers: kristof.beyls, hiraditya, danielkiss, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D76296

Added: 
    llvm/test/CodeGen/Thumb2/cde-gpr.ll

Modified: 
    clang/include/clang/Basic/arm_cde.td
    clang/test/CodeGen/arm-cde-gpr.c
    clang/test/Sema/arm-cde-immediates.c
    llvm/include/llvm/IR/IntrinsicsARM.td
    llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
    llvm/lib/Target/ARM/ARMInstrCDE.td

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/Basic/arm_cde.td b/clang/include/clang/Basic/arm_cde.td
index 222b63966a38..139007d387a0 100644

--- a/clang/include/clang/Basic/arm_cde.td
+++ b/clang/include/clang/Basic/arm_cde.td
@@ -13,6 +13,7 @@
 
 include "arm_mve_defs.td"
 
+// ACLE CDE intrinsic
 class CDEIntrinsic<Type ret, dag args, dag codegen>
   : Intrinsic<ret, args, codegen> {
   let builtinExtension = "cde";
@@ -40,6 +41,32 @@ def imm_11b : CDEImmediateBits<11>;
 def imm_12b : CDEImmediateBits<12>;
 def imm_13b : CDEImmediateBits<13>;
 
-let pnt = PNT_None, params = T.None in
-def cx1 : CDEIntrinsic<u32, (args imm_coproc:$cp, imm_13b:$imm),
-                            (CDEIRInt<"cx1"> $cp, $imm)>;
+// CX* instructions operating on GPRs
+multiclass CDE_CX_m<dag argsImm, dag argsReg, dag cgArgs> {
+  defvar cp = (args imm_coproc:$cp);
+  let pnt = PNT_None, params = T.None in {
+    def "" : CDEIntrinsic<u32, !con(cp, argsReg, argsImm),
+                               !con((CDEIRInt<NAME> $cp), cgArgs, (? $imm))>;
+    def a  : CDEIntrinsic<u32, !con(cp, (args u32:$acc), argsReg, argsImm),
+                               !con((CDEIRInt<NAME # "a"> $cp, $acc),
+                                    cgArgs, (? $imm))>;
+
+    def d :
+      CDEIntrinsic<u64, !con(cp, argsReg, argsImm),
+            (seq !con((CDEIRInt<NAME # "d"> $cp), cgArgs, (? $imm)):$pair,
+                 (or (shl (u64 (xval $pair, 1)), (u64 32)),
+                          (u64 (xval $pair, 0))))>;
+    def da :
+      CDEIntrinsic<u64, !con(cp, (args u64:$acc), argsReg, argsImm),
+            (seq (u32 (lshr $acc, (u64 32))):$acc_hi,
+                 (u32 $acc):$acc_lo,
+                 !con((CDEIRInt<NAME # "da"> $cp, $acc_lo, $acc_hi), cgArgs,
+                       (? $imm)):$pair,
+                 (or (shl (u64 (xval $pair, 1)), (u64 32)),
+                          (u64 (xval $pair, 0))))>;
+  }
+}
+
+defm cx1 : CDE_CX_m<(args imm_13b:$imm), (args), (?)>;
+defm cx2 : CDE_CX_m<(args imm_9b:$imm), (args u32:$n), (? $n)>;
+defm cx3 : CDE_CX_m<(args imm_6b:$imm), (args u32:$n, u32:$m), (? $n, $m)>;

diff  --git a/clang/test/CodeGen/arm-cde-gpr.c b/clang/test/CodeGen/arm-cde-gpr.c
index 9a24b1540b67..1e6893d7d2f8 100644
--- a/clang/test/CodeGen/arm-cde-gpr.c
+++ b/clang/test/CodeGen/arm-cde-gpr.c
@@ -11,6 +11,150 @@
 // CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.cde.cx1(i32 0, i32 123)
 // CHECK-NEXT:    ret i32 [[TMP0]]
 //
-uint32_t test_cx1() {
+uint32_t test_cx1(void) {
   return __arm_cx1(0, 123);
 }
+
+// CHECK-LABEL: @test_cx1a(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.cde.cx1a(i32 0, i32 [[ACC:%.*]], i32 345)
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t test_cx1a(uint32_t acc) {
+  return __arm_cx1a(0, acc, 345);
+}
+
+// CHECK-LABEL: @test_cx1d(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.cde.cx1d(i32 1, i32 567)
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 32
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]]
+// CHECK-NEXT:    ret i64 [[TMP6]]
+//
+uint64_t test_cx1d(void) {
+  return __arm_cx1d(1, 567);
+}
+
+// CHECK-LABEL: @test_cx1da(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[ACC:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[ACC]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.cde.cx1da(i32 0, i32 [[TMP2]], i32 [[TMP1]], i32 789)
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT:    ret i64 [[TMP9]]
+//
+uint64_t test_cx1da(uint64_t acc) {
+  return __arm_cx1da(0, acc, 789);
+}
+
+// CHECK-LABEL: @test_cx2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.cde.cx2(i32 0, i32 [[N:%.*]], i32 11)
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t test_cx2(uint32_t n) {
+  return __arm_cx2(0, n, 11);
+}
+
+// CHECK-LABEL: @test_cx2a(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.cde.cx2a(i32 1, i32 [[ACC:%.*]], i32 [[N:%.*]], i32 22)
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t test_cx2a(uint32_t acc, uint32_t n) {
+  return __arm_cx2a(1, acc, n, 22);
+}
+
+// CHECK-LABEL: @test_cx2d(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.cde.cx2d(i32 1, i32 [[N:%.*]], i32 33)
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 32
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]]
+// CHECK-NEXT:    ret i64 [[TMP6]]
+//
+uint64_t test_cx2d(uint32_t n) {
+  return __arm_cx2d(1, n, 33);
+}
+
+// CHECK-LABEL: @test_cx2da(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[ACC:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[ACC]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.cde.cx2da(i32 0, i32 [[TMP2]], i32 [[TMP1]], i32 [[N:%.*]], i32 44)
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT:    ret i64 [[TMP9]]
+//
+uint64_t test_cx2da(uint64_t acc, uint32_t n) {
+  return __arm_cx2da(0, acc, n, 44);
+}
+
+// CHECK-LABEL: @test_cx3(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.cde.cx3(i32 0, i32 [[N:%.*]], i32 [[M:%.*]], i32 1)
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t test_cx3(uint32_t n, uint32_t m) {
+  return __arm_cx3(0, n, m, 1);
+}
+
+// CHECK-LABEL: @test_cx3a(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.cde.cx3a(i32 1, i32 [[ACC:%.*]], i32 [[N:%.*]], i32 [[M:%.*]], i32 2)
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t test_cx3a(uint32_t acc, uint32_t n, uint32_t m) {
+  return __arm_cx3a(1, acc, n, m, 2);
+}
+
+// CHECK-LABEL: @test_cx3d(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.cde.cx3d(i32 1, i32 [[N:%.*]], i32 [[M:%.*]], i32 3)
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 32
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]]
+// CHECK-NEXT:    ret i64 [[TMP6]]
+//
+uint64_t test_cx3d(uint32_t n, uint32_t m) {
+  return __arm_cx3d(1, n, m, 3);
+}
+
+// CHECK-LABEL: @test_cx3da(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[ACC:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[ACC]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.cde.cx3da(i32 0, i32 [[TMP2]], i32 [[TMP1]], i32 [[N:%.*]], i32 [[M:%.*]], i32 4)
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT:    ret i64 [[TMP9]]
+//
+uint64_t test_cx3da(uint64_t acc, uint32_t n, uint32_t m) {
+  return __arm_cx3da(0, acc, n, m, 4);
+}

diff  --git a/clang/test/Sema/arm-cde-immediates.c b/clang/test/Sema/arm-cde-immediates.c
index bbc13668a2a1..d521e099c7d1 100644
--- a/clang/test/Sema/arm-cde-immediates.c
+++ b/clang/test/Sema/arm-cde-immediates.c
@@ -4,37 +4,62 @@
 #include <arm_acle.h>
 
 void test_coproc_gcp_instr(int a) {
-  __builtin_arm_cdp(0, 2, 3, 4, 5, 6); // expected-error {{coprocessor 0 must be configured as GCP}}
-  __builtin_arm_cdp2(0, 2, 3, 4, 5, 6); // expected-error {{coprocessor 0 must be configured as GCP}}
-  __builtin_arm_mcr(0, 0, a, 13, 0, 3); // expected-error {{coprocessor 0 must be configured as GCP}}
+  __builtin_arm_cdp(0, 2, 3, 4, 5, 6);   // expected-error {{coprocessor 0 must be configured as GCP}}
+  __builtin_arm_cdp2(0, 2, 3, 4, 5, 6);  // expected-error {{coprocessor 0 must be configured as GCP}}
+  __builtin_arm_mcr(0, 0, a, 13, 0, 3);  // expected-error {{coprocessor 0 must be configured as GCP}}
   __builtin_arm_mcr2(0, 0, a, 13, 0, 3); // expected-error {{coprocessor 0 must be configured as GCP}}
-  __builtin_arm_mrc(0, 0, 13, 0, 3); // expected-error {{coprocessor 0 must be configured as GCP}}
-  __builtin_arm_mrc2(0, 0, 13, 0, 3); // expected-error {{coprocessor 0 must be configured as GCP}}
-  __builtin_arm_mcrr(0, 0, a, 0); // expected-error {{coprocessor 0 must be configured as GCP}}
-  __builtin_arm_mcrr2(0, 0, a, 0); // expected-error {{coprocessor 0 must be configured as GCP}}
-  __builtin_arm_mrrc(0, 0, 0); // expected-error {{coprocessor 0 must be configured as GCP}}
-  __builtin_arm_mrrc2(0, 0, 0); // expected-error {{coprocessor 0 must be configured as GCP}}
-  __builtin_arm_ldc(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}}
-  __builtin_arm_ldcl(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}}
-  __builtin_arm_ldc2(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}}
-  __builtin_arm_ldc2l(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}}
-  __builtin_arm_stc(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}}
-  __builtin_arm_stcl(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}}
-  __builtin_arm_stc2(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}}
-  __builtin_arm_stc2l(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}}
+  __builtin_arm_mrc(0, 0, 13, 0, 3);     // expected-error {{coprocessor 0 must be configured as GCP}}
+  __builtin_arm_mrc2(0, 0, 13, 0, 3);    // expected-error {{coprocessor 0 must be configured as GCP}}
+  __builtin_arm_mcrr(0, 0, a, 0);        // expected-error {{coprocessor 0 must be configured as GCP}}
+  __builtin_arm_mcrr2(0, 0, a, 0);       // expected-error {{coprocessor 0 must be configured as GCP}}
+  __builtin_arm_mrrc(0, 0, 0);           // expected-error {{coprocessor 0 must be configured as GCP}}
+  __builtin_arm_mrrc2(0, 0, 0);          // expected-error {{coprocessor 0 must be configured as GCP}}
+  __builtin_arm_ldc(0, 2, &a);           // expected-error {{coprocessor 0 must be configured as GCP}}
+  __builtin_arm_ldcl(0, 2, &a);          // expected-error {{coprocessor 0 must be configured as GCP}}
+  __builtin_arm_ldc2(0, 2, &a);          // expected-error {{coprocessor 0 must be configured as GCP}}
+  __builtin_arm_ldc2l(0, 2, &a);         // expected-error {{coprocessor 0 must be configured as GCP}}
+  __builtin_arm_stc(0, 2, &a);           // expected-error {{coprocessor 0 must be configured as GCP}}
+  __builtin_arm_stcl(0, 2, &a);          // expected-error {{coprocessor 0 must be configured as GCP}}
+  __builtin_arm_stc2(0, 2, &a);          // expected-error {{coprocessor 0 must be configured as GCP}}
+  __builtin_arm_stc2l(0, 2, &a);         // expected-error {{coprocessor 0 must be configured as GCP}}
 }
 
 void test_coproc(uint32_t a) {
   (void)__arm_cx1(0, 0);
-  __arm_cx1(a, 0); // expected-error {{argument to '__arm_cx1' must be a constant integer}}
+  __arm_cx1(a, 0);  // expected-error {{argument to '__arm_cx1' must be a constant integer}}
   __arm_cx1(-1, 0); // expected-error {{argument value -1 is outside the valid range [0, 7]}}
   __arm_cx1(8, 0);  // expected-error {{argument value 8 is outside the valid range [0, 7]}}
-  __arm_cx1(1, 0); // expected-error {{coprocessor 1 must be configured as CDE}}
+  __arm_cx1(1, 0);  // expected-error {{coprocessor 1 must be configured as CDE}}
 }
 
-void test_cx(uint32_t a) {
+void test_cx(uint32_t a, uint64_t da, uint32_t n, uint32_t m) {
   (void)__arm_cx1(0, 0);
-  __arm_cx1(a, 0); // expected-error {{argument to '__arm_cx1' must be a constant integer}}
-  __arm_cx1(0, a);  // expected-error {{argument to '__arm_cx1' must be a constant integer}}
-  __arm_cx1(0, 8192);  // expected-error {{argument value 8192 is outside the valid range [0, 8191]}}
+  __arm_cx1(0, a);          // expected-error {{argument to '__arm_cx1' must be a constant integer}}
+  __arm_cx1(0, 8192);       // expected-error {{argument value 8192 is outside the valid range [0, 8191]}}
+  __arm_cx1a(0, a, a);      // expected-error {{argument to '__arm_cx1a' must be a constant integer}}
+  __arm_cx1a(0, a, 8192);   // expected-error {{argument value 8192 is outside the valid range [0, 8191]}}
+  __arm_cx1d(0, a);         // expected-error {{argument to '__arm_cx1d' must be a constant integer}}
+  __arm_cx1d(0, 8192);      // expected-error {{argument value 8192 is outside the valid range [0, 8191]}}
+  __arm_cx1da(0, da, a);    // expected-error {{argument to '__arm_cx1da' must be a constant integer}}
+  __arm_cx1da(0, da, 8192); // expected-error {{argument value 8192 is outside the valid range [0, 8191]}}
+
+  (void)__arm_cx2(0, n, 0);
+  __arm_cx2(0, n, a);         // expected-error {{argument to '__arm_cx2' must be a constant integer}}
+  __arm_cx2(0, n, 512);       // expected-error {{argument value 512 is outside the valid range [0, 511]}}
+  __arm_cx2a(0, a, n, a);     // expected-error {{argument to '__arm_cx2a' must be a constant integer}}
+  __arm_cx2a(0, a, n, 512);   // expected-error {{argument value 512 is outside the valid range [0, 511]}}
+  __arm_cx2d(0, n, a);        // expected-error {{argument to '__arm_cx2d' must be a constant integer}}
+  __arm_cx2d(0, n, 512);      // expected-error {{argument value 512 is outside the valid range [0, 511]}}
+  __arm_cx2da(0, da, n, a);   // expected-error {{argument to '__arm_cx2da' must be a constant integer}}
+  __arm_cx2da(0, da, n, 512); // expected-error {{argument value 512 is outside the valid range [0, 511]}}
+
+  (void)__arm_cx3(0, n, m, 0);
+  __arm_cx3(0, n, m, a);        // expected-error {{argument to '__arm_cx3' must be a constant integer}}
+  __arm_cx3(0, n, m, 64);       // expected-error {{argument value 64 is outside the valid range [0, 63]}}
+  __arm_cx3a(0, a, n, m, a);    // expected-error {{argument to '__arm_cx3a' must be a constant integer}}
+  __arm_cx3a(0, a, n, m, 64);   // expected-error {{argument value 64 is outside the valid range [0, 63]}}
+  __arm_cx3d(0, n, m, a);       // expected-error {{argument to '__arm_cx3d' must be a constant integer}}
+  __arm_cx3d(0, n, m, 64);      // expected-error {{argument value 64 is outside the valid range [0, 63]}}
+  __arm_cx3da(0, da, n, m, a);  // expected-error {{argument to '__arm_cx3da' must be a constant integer}}
+  __arm_cx3da(0, da, n, m, 64); // expected-error {{argument value 64 is outside the valid range [0, 63]}}
 }

diff  --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td
index 80ed0792a209..ba0cf909e5de 100644
--- a/llvm/include/llvm/IR/IntrinsicsARM.td
+++ b/llvm/include/llvm/IR/IntrinsicsARM.td
@@ -1275,9 +1275,30 @@ defm int_arm_mve_vqdmlad: MVEPredicated<[llvm_anyvector_ty],
 
 // CDE (Custom Datapath Extension)
 
-def int_arm_cde_cx1: Intrinsic<
-  [llvm_i32_ty],
-  [llvm_i32_ty /* coproc */, llvm_i32_ty /* imm */],
-  [IntrNoMem, ImmArg<0>, ImmArg<1>]>;
+multiclass CDEGPRIntrinsics<list<LLVMType> args> {
+  def "" : Intrinsic<
+    [llvm_i32_ty],
+    !listconcat([llvm_i32_ty /* coproc */], args, [llvm_i32_ty /* imm */]),
+    [IntrNoMem, ImmArg<0>, ImmArg<!add(!size(args), 1)>]>;
+  def a : Intrinsic<
+    [llvm_i32_ty],
+    !listconcat([llvm_i32_ty /* coproc */, llvm_i32_ty /* acc */], args,
+                [llvm_i32_ty /* imm */]),
+    [IntrNoMem, ImmArg<0>, ImmArg<!add(!size(args), 2)>]>;
+
+  def d: Intrinsic<
+    [llvm_i32_ty /* lo */, llvm_i32_ty /* hi */],
+    !listconcat([llvm_i32_ty /* coproc */], args, [llvm_i32_ty /* imm */]),
+    [IntrNoMem, ImmArg<0>, ImmArg<!add(!size(args), 1)>]>;
+  def da: Intrinsic<
+    [llvm_i32_ty /* lo */, llvm_i32_ty /* hi */],
+    !listconcat([llvm_i32_ty /* coproc */, llvm_i32_ty /* acc_lo */,
+                 llvm_i32_ty /* acc_hi */], args, [llvm_i32_ty /* imm */]),
+    [IntrNoMem, ImmArg<0>, ImmArg<!add(!size(args), 3)>]>;
+}
+
+defm int_arm_cde_cx1: CDEGPRIntrinsics<[]>;
+defm int_arm_cde_cx2: CDEGPRIntrinsics<[llvm_i32_ty]>;
+defm int_arm_cde_cx3: CDEGPRIntrinsics<[llvm_i32_ty, llvm_i32_ty]>;
 
 } // end TargetPrefix

diff  --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 6be6f5da6379..b334f4156559 100644
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -277,6 +277,15 @@ class ARMDAGToDAGISel : public SelectionDAGISel {
   void SelectMVE_VxDUP(SDNode *N, const uint16_t *Opcodes,
                        bool Wrapping, bool Predicated);
 
+  /// SelectCDE_CXxD - Select CDE dual-GPR instruction (one of CX1D,
+  /// CX1DA, CX2D, CX2DA, CX3D, CX3DA).
+  /// \arg \c NumExtraOps number of extra operands besides the coprocessor,
+  ///                     the accumulator and the immediate operand, i.e. 0
+  ///                     for CX1*, 1 for CX2*, 2 for CX3*
+  /// \arg \c HasAccum whether the instruction has an accumulator operand
+  void SelectCDE_CXxD(SDNode *N, uint16_t Opcode, size_t NumExtraOps,
+                      bool HasAccum);
+
   /// SelectVLDDup - Select NEON load-duplicate intrinsics.  NumVecs
   /// should be 1, 2, 3 or 4.  The opcode array specifies the instructions used
   /// for loading D registers.
@@ -2809,6 +2818,69 @@ void ARMDAGToDAGISel::SelectMVE_VxDUP(SDNode *N, const uint16_t *Opcodes,
   CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops));
 }
 
+void ARMDAGToDAGISel::SelectCDE_CXxD(SDNode *N, uint16_t Opcode,
+                                     size_t NumExtraOps, bool HasAccum) {
+  bool IsBigEndian = CurDAG->getDataLayout().isBigEndian();
+  SDLoc Loc(N);
+  SmallVector<SDValue, 8> Ops;
+
+  unsigned OpIdx = 1;
+
+  // Convert and append the immediate operand designating the coprocessor.
+  SDValue ImmCorpoc = N->getOperand(OpIdx++);
+  uint32_t ImmCoprocVal = cast<ConstantSDNode>(ImmCorpoc)->getZExtValue();
+  Ops.push_back(getI32Imm(ImmCoprocVal, Loc));
+
+  // For accumulating variants copy the low and high order parts of the
+  // accumulator into a register pair and add it to the operand vector.
+  if (HasAccum) {
+    SDValue AccLo = N->getOperand(OpIdx++);
+    SDValue AccHi = N->getOperand(OpIdx++);
+    if (IsBigEndian)
+      std::swap(AccLo, AccHi);
+    Ops.push_back(SDValue(createGPRPairNode(MVT::Untyped, AccLo, AccHi), 0));
+  }
+
+  // Copy extra operands as-is.
+  for (size_t I = 0; I < NumExtraOps; I++)
+    Ops.push_back(N->getOperand(OpIdx++));
+
+  // Convert and append the immediate operand
+  SDValue Imm = N->getOperand(OpIdx);
+  uint32_t ImmVal = cast<ConstantSDNode>(Imm)->getZExtValue();
+  Ops.push_back(getI32Imm(ImmVal, Loc));
+
+  // Accumulating variants are IT-predicable, add predicate operands.
+  if (HasAccum) {
+    SDValue Pred = getAL(CurDAG, Loc);
+    SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
+    Ops.push_back(Pred);
+    Ops.push_back(PredReg);
+  }
+
+  // Create the CDE instruction
+  SDNode *InstrNode = CurDAG->getMachineNode(Opcode, Loc, MVT::Untyped, Ops);
+  SDValue ResultPair = SDValue(InstrNode, 0);
+
+  // The original intrinsic had two outputs, and the output of the dual-register
+  // CDE instruction is a register pair. We need to extract the two subregisters
+  // and replace all uses of the original outputs with the extracted
+  // subregisters.
+  uint16_t SubRegs[2] = {ARM::gsub_0, ARM::gsub_1};
+  if (IsBigEndian)
+    std::swap(SubRegs[0], SubRegs[1]);
+
+  for (size_t ResIdx = 0; ResIdx < 2; ResIdx++) {
+    if (SDValue(N, ResIdx).use_empty())
+      continue;
+    SDValue SubReg = CurDAG->getTargetExtractSubreg(SubRegs[ResIdx], Loc,
+                                                    MVT::i32, ResultPair);
+    ReplaceUses(SDValue(N, ResIdx), SubReg);
+  }
+
+  CurDAG->RemoveDeadNode(N);
+}
+
 void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool IsIntrinsic,
                                    bool isUpdating, unsigned NumVecs,
                                    const uint16_t *DOpcodes,
@@ -4773,6 +4845,40 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
                       IntNo == Intrinsic::arm_mve_vdwdup_predicated);
       return;
     }
+
+    case Intrinsic::arm_cde_cx1d:
+    case Intrinsic::arm_cde_cx1da:
+    case Intrinsic::arm_cde_cx2d:
+    case Intrinsic::arm_cde_cx2da:
+    case Intrinsic::arm_cde_cx3d:
+    case Intrinsic::arm_cde_cx3da: {
+      bool HasAccum = IntNo == Intrinsic::arm_cde_cx1da ||
+                      IntNo == Intrinsic::arm_cde_cx2da ||
+                      IntNo == Intrinsic::arm_cde_cx3da;
+      size_t NumExtraOps;
+      uint16_t Opcode;
+      switch (IntNo) {
+      case Intrinsic::arm_cde_cx1d:
+      case Intrinsic::arm_cde_cx1da:
+        NumExtraOps = 0;
+        Opcode = HasAccum ? ARM::CDE_CX1DA : ARM::CDE_CX1D;
+        break;
+      case Intrinsic::arm_cde_cx2d:
+      case Intrinsic::arm_cde_cx2da:
+        NumExtraOps = 1;
+        Opcode = HasAccum ? ARM::CDE_CX2DA : ARM::CDE_CX2D;
+        break;
+      case Intrinsic::arm_cde_cx3d:
+      case Intrinsic::arm_cde_cx3da:
+        NumExtraOps = 2;
+        Opcode = HasAccum ? ARM::CDE_CX3DA : ARM::CDE_CX3D;
+        break;
+      default:
+        llvm_unreachable("Unexpected opcode");
+      }
+      SelectCDE_CXxD(N, Opcode, NumExtraOps, HasAccum);
+      return;
+    }
     }
     break;
   }

diff  --git a/llvm/lib/Target/ARM/ARMInstrCDE.td b/llvm/lib/Target/ARM/ARMInstrCDE.td
index fb02e9fefd8c..648911acd26c 100644
--- a/llvm/lib/Target/ARM/ARMInstrCDE.td
+++ b/llvm/lib/Target/ARM/ARMInstrCDE.td
@@ -215,6 +215,35 @@ def CDE_CX3A  : CDE_CX3_Instr<"cx3a",   cde_cx_params_single_acc>;
 def CDE_CX3D  : CDE_CX3_Instr<"cx3d",   cde_cx_params_dual_noacc>;
 def CDE_CX3DA : CDE_CX3_Instr<"cx3da",  cde_cx_params_dual_acc>;
 
+let Predicates = [HasCDE] in {
+  def : Pat<(i32 (int_arm_cde_cx1 timm:$coproc, timm:$imm)),
+            (i32 (CDE_CX1 p_imm:$coproc, imm_13b:$imm))>;
+  def : Pat<(i32 (int_arm_cde_cx1a timm:$coproc, GPRwithAPSR_NZCVnosp:$acc,
+                                   timm:$imm)),
+            (i32 (CDE_CX1A p_imm:$coproc, GPRwithAPSR_NZCVnosp:$acc,
+                           imm_13b:$imm))>;
+  def : Pat<(i32 (int_arm_cde_cx2 timm:$coproc, GPRwithAPSR_NZCVnosp:$n,
+                                  timm:$imm)),
+            (i32 (CDE_CX2 p_imm:$coproc, GPRwithAPSR_NZCVnosp:$n,
+                          imm_9b:$imm))>;
+  def : Pat<(i32 (int_arm_cde_cx2a timm:$coproc, GPRwithAPSR_NZCVnosp:$acc,
+                                   GPRwithAPSR_NZCVnosp:$n, timm:$imm)),
+            (i32 (CDE_CX2A p_imm:$coproc, GPRwithAPSR_NZCVnosp:$acc,
+                           GPRwithAPSR_NZCVnosp:$n, imm_9b:$imm))>;
+  def : Pat<(i32 (int_arm_cde_cx3 timm:$coproc, GPRwithAPSR_NZCVnosp:$n,
+                                  GPRwithAPSR_NZCVnosp:$m, timm:$imm)),
+            (i32 (CDE_CX3  p_imm:$coproc, GPRwithAPSR_NZCVnosp:$n,
+                           GPRwithAPSR_NZCVnosp:$m, imm_6b:$imm))>;
+  def : Pat<(i32 (int_arm_cde_cx3a timm:$coproc,
+                                   GPRwithAPSR_NZCVnosp:$acc,
+                                   GPRwithAPSR_NZCVnosp:$n,
+                                   GPRwithAPSR_NZCVnosp:$m, timm:$imm)),
+            (i32 (CDE_CX3A p_imm:$coproc,
+                           GPRwithAPSR_NZCVnosp:$acc,
+                           GPRwithAPSR_NZCVnosp:$n,
+                           GPRwithAPSR_NZCVnosp:$m, imm_6b:$imm))>;
+}
+
 class CDE_RequiresSReg : Requires<[HasCDE, HasFPRegs]>;
 class CDE_RequiresDReg : Requires<[HasCDE, HasFPRegs]>;
 class CDE_RequiresQReg : Requires<[HasCDE, HasMVEInt]>;

diff  --git a/llvm/test/CodeGen/Thumb2/cde-gpr.ll b/llvm/test/CodeGen/Thumb2/cde-gpr.ll
new file mode 100644
index 000000000000..19052125c14b
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/cde-gpr.ll
@@ -0,0 +1,189 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+cdecp0 -mattr=+cdecp1 -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -mtriple=thumbebv8.1m.main -mattr=+cdecp0 -mattr=+cdecp1 -verify-machineinstrs -o - %s | FileCheck %s
+
+declare i32 @llvm.arm.cde.cx1(i32 immarg, i32 immarg)
+declare i32 @llvm.arm.cde.cx1a(i32 immarg, i32, i32 immarg)
+declare { i32, i32 } @llvm.arm.cde.cx1d(i32 immarg, i32 immarg)
+declare { i32, i32 } @llvm.arm.cde.cx1da(i32 immarg, i32, i32, i32 immarg)
+
+declare i32 @llvm.arm.cde.cx2(i32 immarg, i32, i32 immarg)
+declare i32 @llvm.arm.cde.cx2a(i32 immarg, i32, i32, i32 immarg)
+declare { i32, i32 } @llvm.arm.cde.cx2d(i32 immarg, i32, i32 immarg)
+declare { i32, i32 } @llvm.arm.cde.cx2da(i32 immarg, i32, i32, i32, i32 immarg)
+
+declare i32 @llvm.arm.cde.cx3(i32 immarg, i32, i32, i32 immarg)
+declare i32 @llvm.arm.cde.cx3a(i32 immarg, i32, i32, i32, i32 immarg)
+declare { i32, i32 } @llvm.arm.cde.cx3d(i32 immarg, i32, i32, i32 immarg)
+declare { i32, i32 } @llvm.arm.cde.cx3da(i32 immarg, i32, i32, i32, i32, i32 immarg)
+
+define arm_aapcs_vfpcc i32 @test_cx1() {
+; CHECK-LABEL: test_cx1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    cx1 p0, r0, #123
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.cde.cx1(i32 0, i32 123)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_cx1a(i32 %acc) {
+; CHECK-LABEL: test_cx1a:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    cx1a p0, r0, #345
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.cde.cx1a(i32 0, i32 %acc, i32 345)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i64 @test_cx1d() {
+; CHECK-LABEL: test_cx1d:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    cx1d p1, r0, r1, #567
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call { i32, i32 } @llvm.arm.cde.cx1d(i32 1, i32 567)
+  %1 = extractvalue { i32, i32 } %0, 1
+  %2 = zext i32 %1 to i64
+  %3 = shl i64 %2, 32
+  %4 = extractvalue { i32, i32 } %0, 0
+  %5 = zext i32 %4 to i64
+  %6 = or i64 %3, %5
+  ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_cx1da(i64 %acc) {
+; CHECK-LABEL: test_cx1da:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    @ kill: def $r1 killed $r1 killed $r0_r1 def $r0_r1
+; CHECK-NEXT:    @ kill: def $r0 killed $r0 killed $r0_r1 def $r0_r1
+; CHECK-NEXT:    cx1da p0, r0, r1, #789
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %acc, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %acc to i32
+  %3 = call { i32, i32 } @llvm.arm.cde.cx1da(i32 0, i32 %2, i32 %1, i32 789)
+  %4 = extractvalue { i32, i32 } %3, 1
+  %5 = zext i32 %4 to i64
+  %6 = shl i64 %5, 32
+  %7 = extractvalue { i32, i32 } %3, 0
+  %8 = zext i32 %7 to i64
+  %9 = or i64 %6, %8
+  ret i64 %9
+}
+
+define arm_aapcs_vfpcc i32 @test_cx2(i32 %n) {
+; CHECK-LABEL: test_cx2:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    cx2 p0, r0, r0, #11
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.cde.cx2(i32 0, i32 %n, i32 11)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_cx2a(i32 %acc, i32 %n) {
+; CHECK-LABEL: test_cx2a:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    cx2a p1, r0, r1, #22
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.cde.cx2a(i32 1, i32 %acc, i32 %n, i32 22)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i64 @test_cx2d(i32 %n) #0 {
+; CHECK-LABEL: test_cx2d:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    cx2d p1, r0, r1, r0, #33
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call { i32, i32 } @llvm.arm.cde.cx2d(i32 1, i32 %n, i32 33)
+  %1 = extractvalue { i32, i32 } %0, 1
+  %2 = zext i32 %1 to i64
+  %3 = shl i64 %2, 32
+  %4 = extractvalue { i32, i32 } %0, 0
+  %5 = zext i32 %4 to i64
+  %6 = or i64 %3, %5
+  ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_cx2da(i64 %acc, i32 %n) {
+; CHECK-LABEL: test_cx2da:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    @ kill: def $r1 killed $r1 killed $r0_r1 def $r0_r1
+; CHECK-NEXT:    @ kill: def $r0 killed $r0 killed $r0_r1 def $r0_r1
+; CHECK-NEXT:    cx2da p0, r0, r1, r2, #44
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %acc, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %acc to i32
+  %3 = call { i32, i32 } @llvm.arm.cde.cx2da(i32 0, i32 %2, i32 %1, i32 %n, i32 44)
+  %4 = extractvalue { i32, i32 } %3, 1
+  %5 = zext i32 %4 to i64
+  %6 = shl i64 %5, 32
+  %7 = extractvalue { i32, i32 } %3, 0
+  %8 = zext i32 %7 to i64
+  %9 = or i64 %6, %8
+  ret i64 %9
+}
+
+define arm_aapcs_vfpcc i32 @test_cx3(i32 %n, i32 %m) {
+; CHECK-LABEL: test_cx3:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    cx3 p0, r0, r0, r1, #1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.cde.cx3(i32 0, i32 %n, i32 %m, i32 1)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i32 @test_cx3a(i32 %acc, i32 %n, i32 %m) {
+; CHECK-LABEL: test_cx3a:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    cx3a p1, r0, r1, r2, #2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call i32 @llvm.arm.cde.cx3a(i32 1, i32 %acc, i32 %n, i32 %m, i32 2)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc i64 @test_cx3d(i32 %n, i32 %m) {
+; CHECK-LABEL: test_cx3d:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    cx3d p1, r0, r1, r0, r1, #3
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call { i32, i32 } @llvm.arm.cde.cx3d(i32 1, i32 %n, i32 %m, i32 3)
+  %1 = extractvalue { i32, i32 } %0, 1
+  %2 = zext i32 %1 to i64
+  %3 = shl i64 %2, 32
+  %4 = extractvalue { i32, i32 } %0, 0
+  %5 = zext i32 %4 to i64
+  %6 = or i64 %3, %5
+  ret i64 %6
+}
+
+define arm_aapcs_vfpcc i64 @test_cx3da(i64 %acc, i32 %n, i32 %m) {
+; CHECK-LABEL: test_cx3da:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    @ kill: def $r1 killed $r1 killed $r0_r1 def $r0_r1
+; CHECK-NEXT:    @ kill: def $r0 killed $r0 killed $r0_r1 def $r0_r1
+; CHECK-NEXT:    cx3da p0, r0, r1, r2, r3, #4
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = lshr i64 %acc, 32
+  %1 = trunc i64 %0 to i32
+  %2 = trunc i64 %acc to i32
+  %3 = call { i32, i32 } @llvm.arm.cde.cx3da(i32 0, i32 %2, i32 %1, i32 %n, i32 %m, i32 4)
+  %4 = extractvalue { i32, i32 } %3, 1
+  %5 = zext i32 %4 to i64
+  %6 = shl i64 %5, 32
+  %7 = extractvalue { i32, i32 } %3, 0
+  %8 = zext i32 %7 to i64
+  %9 = or i64 %6, %8
+  ret i64 %9
+}