[llvm-branch-commits] [clang] 51d5991 - [Clang] Add AArch64 VCMLA LANE variants.

Tue Jan 5 08:18:28 PST 2021

Author: Florian Hahn
Date: 2021-01-05T16:14:00Z
New Revision: 51d5991f04dda76c2f07123569b5c31ed3dfcfe8

URL: https://github.com/llvm/llvm-project/commit/51d5991f04dda76c2f07123569b5c31ed3dfcfe8
DIFF: https://github.com/llvm/llvm-project/commit/51d5991f04dda76c2f07123569b5c31ed3dfcfe8.diff

LOG: [Clang] Add AArch64 VCMLA LANE variants.

This patch adds the LANE variants for VCMLA on AArch64 as defined in
"Arm Neon Intrinsics Reference for ACLE Q3 2020" [1]

This patch also updates `dup_typed` to accept constant type strings directly.

Based on a patch by Tim Northover.

[1] https://developer.arm.com/documentation/ihi0073/latest

Reviewed By: SjoerdMeijer

Differential Revision: https://reviews.llvm.org/D93014

Added: 
    

Modified: 
    clang/include/clang/Basic/arm_neon.td
    clang/test/CodeGen/aarch64-neon-vcmla.c
    clang/utils/TableGen/NeonEmitter.cpp

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index 6f1380d58c16..3b2a578f796e 100644

--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -1902,6 +1902,24 @@ let ArchGuard = "defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC)" in {
 multiclass VCMLA_ROTS<string type, string lanety, string laneqty> {
   foreach ROT = ["", "_rot90", "_rot180", "_rot270" ] in {
     def   : SInst<"vcmla" # ROT, "....", type # "Q" # type>;
+
+    // vcmla{ROT}_lane
+    def : SOpInst<"vcmla" # ROT # "_lane", "...qI", type, Op<(call "vcmla" # ROT, $p0, $p1,
+           (bitcast $p0, (dup_typed lanety , (call "vget_lane", (bitcast lanety, $p2), $p3))))>>;
+
+    // vcmlaq{ROT}_lane
+    def : SOpInst<"vcmla" # ROT # "_lane", "...qI", "Q" # type, Op<(call "vcmla" # ROT, $p0, $p1,
+           (bitcast $p0, (dup_typed laneqty , (call "vget_lane", (bitcast lanety, $p2), $p3))))>>;
+
+    let isLaneQ = 1 in  {
+      // vcmla{ROT}_laneq
+      def : SOpInst<"vcmla" # ROT # "_laneq", "...QI", type,  Op<(call "vcmla" # ROT, $p0, $p1,
+              (bitcast $p0, (dup_typed lanety, (call "vget_lane", (bitcast laneqty, $p2), $p3))))>>;
+
+      // vcmlaq{ROT}_laneq
+      def : SOpInst<"vcmla" # ROT # "_laneq", "...QI", "Q" # type, Op<(call "vcmla" # ROT, $p0, $p1,
+             (bitcast $p0, (dup_typed laneqty , (call "vget_lane", (bitcast laneqty, $p2), $p3))))>>;
+    }
   }
 }
 

diff  --git a/clang/test/CodeGen/aarch64-neon-vcmla.c b/clang/test/CodeGen/aarch64-neon-vcmla.c
index 2ecc1d5fbb3d..7b7a99d35b04 100644
--- a/clang/test/CodeGen/aarch64-neon-vcmla.c
+++ b/clang/test/CodeGen/aarch64-neon-vcmla.c
@@ -144,3 +144,323 @@ float32x4_t test_vcmlaq_rot270_f32(float32x4_t acc, float32x4_t lhs, float32x4_t
 float64x2_t test_vcmlaq_rot270_f64(float64x2_t acc, float64x2_t lhs, float64x2_t rhs) {
   return vcmlaq_rot270_f64(acc, lhs, rhs);
 }
+
+// CHECK-LABEL: @test_vcmla_lane_f16(
+// CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
+// CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
+// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
+// CHECK: ret <4 x half> [[RES]]
+float16x4_t test_vcmla_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
+  return vcmla_lane_f16(acc, lhs, rhs, 1);
+}
+
+// ACLE says this exists, but it won't map to a single instruction if lane > 1.
+// CHECK-LABEL: @test_vcmla_laneq_f16(
+// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
+// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
+// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
+// CHECK: ret <4 x half> [[RES]]
+float16x4_t test_vcmla_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
+  return vcmla_laneq_f16(acc, lhs, rhs, 3);
+}
+
+// CHECK-LABEL: @test_vcmlaq_lane_f16(
+// CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
+// CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
+// CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
+// CHECK: ret <8 x half> [[RES]]
+float16x8_t test_vcmlaq_lane_f16(float16x8_t acc, float16x8_t lhs, float16x4_t rhs) {
+  return vcmlaq_lane_f16(acc, lhs, rhs, 1);
+}
+
+// CHECK-LABEL: @test_vcmlaq_laneq_f16(
+// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
+// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
+// CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
+// CHECK: ret <8 x half> [[RES]]
+float16x8_t test_vcmlaq_laneq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
+  return vcmlaq_laneq_f16(acc, lhs, rhs, 3);
+}
+
+// CHECK-LABEL: @test_vcmla_lane_f32(
+// CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
+// CHECK: ret <2 x float> [[RES]]
+float32x2_t test_vcmla_lane_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
+  return vcmla_lane_f32(acc, lhs, rhs, 0);
+}
+
+// ACLE says this exists, but it won't map to a single instruction if lane > 1.
+// CHECK-LABEL: @test_vcmla_laneq_f32(
+// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
+// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <1 x i32> <i32 1>
+// CHECK: [[DUP_FLT:%.*]] = bitcast <1 x i64> [[DUP]] to <2 x float>
+// CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> [[DUP_FLT]])
+// CHECK: ret <2 x float> [[RES]]
+float32x2_t test_vcmla_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t rhs) {
+  return vcmla_laneq_f32(acc, lhs, rhs, 1);
+}
+
+// CHECK-LABEL: @test_vcmlaq_lane_f32(
+// CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
+// CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> undef, i64 [[CPLX]], i32 0
+// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX_VEC]], <2 x i64> undef, <2 x i32> zeroinitializer
+// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
+// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
+// CHECK: ret <4 x float> [[RES]]
+float32x4_t test_vcmlaq_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
+  return vcmlaq_lane_f32(acc, lhs, rhs, 0);
+}
+
+// CHECK-LABEL: @test_vcmlaq_laneq_f32(
+// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
+// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <2 x i32> <i32 1, i32 1>
+// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
+// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
+// CHECK: ret <4 x float> [[RES]]
+float32x4_t test_vcmlaq_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
+  return vcmlaq_laneq_f32(acc, lhs, rhs, 1);
+}
+
+// CHECK-LABEL: @test_vcmla_rot90_lane_f16(
+// CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
+// CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
+// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
+// CHECK: ret <4 x half> [[RES]]
+float16x4_t test_vcmla_rot90_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
+  return vcmla_rot90_lane_f16(acc, lhs, rhs, 1);
+}
+
+// ACLE says this exists, but it won't map to a single instruction if lane > 1.
+// CHECK-LABEL: @test_vcmla_rot90_laneq_f16(
+// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
+// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
+// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
+// CHECK: ret <4 x half> [[RES]]
+float16x4_t test_vcmla_rot90_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
+  return vcmla_rot90_laneq_f16(acc, lhs, rhs, 3);
+}
+
+// CHECK-LABEL: @test_vcmlaq_rot90_lane_f16(
+// CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
+// CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
+// CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
+// CHECK: ret <8 x half> [[RES]]
+float16x8_t test_vcmlaq_rot90_lane_f16(float16x8_t acc, float16x8_t lhs, float16x4_t rhs) {
+  return vcmlaq_rot90_lane_f16(acc, lhs, rhs, 1);
+}
+
+// CHECK-LABEL: @test_vcmlaq_rot90_laneq_f16(
+// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
+// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
+// CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
+// CHECK: ret <8 x half> [[RES]]
+float16x8_t test_vcmlaq_rot90_laneq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
+  return vcmlaq_rot90_laneq_f16(acc, lhs, rhs, 3);
+}
+
+// CHECK-LABEL: @test_vcmla_rot90_lane_f32(
+// CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot90.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
+// CHECK: ret <2 x float> [[RES]]
+float32x2_t test_vcmla_rot90_lane_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
+  return vcmla_rot90_lane_f32(acc, lhs, rhs, 0);
+}
+
+// ACLE says this exists, but it won't map to a single instruction if lane > 1.
+// CHECK-LABEL: @test_vcmla_rot90_laneq_f32(
+// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
+// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <1 x i32> <i32 1>
+// CHECK: [[DUP_FLT:%.*]] = bitcast <1 x i64> [[DUP]] to <2 x float>
+// CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot90.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> [[DUP_FLT]])
+// CHECK: ret <2 x float> [[RES]]
+float32x2_t test_vcmla_rot90_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t rhs) {
+  return vcmla_rot90_laneq_f32(acc, lhs, rhs, 1);
+}
+
+// CHECK-LABEL: @test_vcmlaq_rot90_lane_f32(
+// CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
+// CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> undef, i64 [[CPLX]], i32 0
+// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX_VEC]], <2 x i64> undef, <2 x i32> zeroinitializer
+// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
+// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
+// CHECK: ret <4 x float> [[RES]]
+float32x4_t test_vcmlaq_rot90_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
+  return vcmlaq_rot90_lane_f32(acc, lhs, rhs, 0);
+}
+
+// CHECK-LABEL: @test_vcmlaq_rot90_laneq_f32(
+// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
+// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <2 x i32> <i32 1, i32 1>
+// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
+// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
+// CHECK: ret <4 x float> [[RES]]
+float32x4_t test_vcmlaq_rot90_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
+  return vcmlaq_rot90_laneq_f32(acc, lhs, rhs, 1);
+}
+
+// CHECK-LABEL: @test_vcmla_rot180_lane_f16(
+// CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
+// CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
+// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
+// CHECK: ret <4 x half> [[RES]]
+float16x4_t test_vcmla_rot180_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
+  return vcmla_rot180_lane_f16(acc, lhs, rhs, 1);
+}
+
+// ACLE says this exists, but it won't map to a single instruction if lane > 1.
+// CHECK-LABEL: @test_vcmla_rot180_laneq_f16(
+// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
+// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
+// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
+// CHECK: ret <4 x half> [[RES]]
+float16x4_t test_vcmla_rot180_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
+  return vcmla_rot180_laneq_f16(acc, lhs, rhs, 3);
+}
+
+// CHECK-LABEL: @test_vcmlaq_rot180_lane_f16(
+// CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
+// CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
+// CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
+// CHECK: ret <8 x half> [[RES]]
+float16x8_t test_vcmlaq_rot180_lane_f16(float16x8_t acc, float16x8_t lhs, float16x4_t rhs) {
+  return vcmlaq_rot180_lane_f16(acc, lhs, rhs, 1);
+}
+
+// CHECK-LABEL: @test_vcmlaq_rot180_laneq_f16(
+// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
+// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
+// CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
+// CHECK: ret <8 x half> [[RES]]
+float16x8_t test_vcmlaq_rot180_laneq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
+  return vcmlaq_rot180_laneq_f16(acc, lhs, rhs, 3);
+}
+
+// CHECK-LABEL: @test_vcmla_rot180_lane_f32(
+// CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot180.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
+// CHECK: ret <2 x float> [[RES]]
+float32x2_t test_vcmla_rot180_lane_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
+  return vcmla_rot180_lane_f32(acc, lhs, rhs, 0);
+}
+
+// ACLE says this exists, but it won't map to a single instruction if lane > 1.
+// CHECK-LABEL: @test_vcmla_rot180_laneq_f32(
+// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
+// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <1 x i32> <i32 1>
+// CHECK: [[DUP_FLT:%.*]] = bitcast <1 x i64> [[DUP]] to <2 x float>
+// CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot180.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> [[DUP_FLT]])
+// CHECK: ret <2 x float> [[RES]]
+float32x2_t test_vcmla_rot180_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t rhs) {
+  return vcmla_rot180_laneq_f32(acc, lhs, rhs, 1);
+}
+
+// CHECK-LABEL: @test_vcmlaq_rot180_lane_f32(
+// CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
+// CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> undef, i64 [[CPLX]], i32 0
+// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX_VEC]], <2 x i64> undef, <2 x i32> zeroinitializer
+// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
+// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
+// CHECK: ret <4 x float> [[RES]]
+float32x4_t test_vcmlaq_rot180_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
+  return vcmlaq_rot180_lane_f32(acc, lhs, rhs, 0);
+}
+
+// CHECK-LABEL: @test_vcmlaq_rot180_laneq_f32(
+// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
+// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <2 x i32> <i32 1, i32 1>
+// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
+// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
+// CHECK: ret <4 x float> [[RES]]
+float32x4_t test_vcmlaq_rot180_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
+  return vcmlaq_rot180_laneq_f32(acc, lhs, rhs, 1);
+}
+
+// CHECK-LABEL: @test_vcmla_rot270_lane_f16(
+// CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
+// CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
+// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
+// CHECK: ret <4 x half> [[RES]]
+float16x4_t test_vcmla_rot270_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
+  return vcmla_rot270_lane_f16(acc, lhs, rhs, 1);
+}
+
+// ACLE says this exists, but it won't map to a single instruction if lane > 1.
+// CHECK-LABEL: @test_vcmla_rot270_laneq_f16(
+// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
+// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
+// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
+// CHECK: ret <4 x half> [[RES]]
+float16x4_t test_vcmla_rot270_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
+  return vcmla_rot270_laneq_f16(acc, lhs, rhs, 3);
+}
+
+// CHECK-LABEL: @test_vcmlaq_rot270_lane_f16(
+// CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
+// CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
+// CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
+// CHECK: ret <8 x half> [[RES]]
+float16x8_t test_vcmlaq_rot270_lane_f16(float16x8_t acc, float16x8_t lhs, float16x4_t rhs) {
+  return vcmlaq_rot270_lane_f16(acc, lhs, rhs, 1);
+}
+
+// CHECK-LABEL: @test_vcmlaq_rot270_laneq_f16(
+// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
+// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
+// CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
+// CHECK: ret <8 x half> [[RES]]
+float16x8_t test_vcmlaq_rot270_laneq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
+  return vcmlaq_rot270_laneq_f16(acc, lhs, rhs, 3);
+}
+
+// CHECK-LABEL: @test_vcmla_rot270_lane_f32(
+// CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot270.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
+// CHECK: ret <2 x float> [[RES]]
+float32x2_t test_vcmla_rot270_lane_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
+  return vcmla_rot270_lane_f32(acc, lhs, rhs, 0);
+}
+
+// ACLE says this exists, but it won't map to a single instruction if lane > 1.
+// CHECK-LABEL: @test_vcmla_rot270_laneq_f32(
+// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
+// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <1 x i32> <i32 1>
+// CHECK: [[DUP_FLT:%.*]] = bitcast <1 x i64> [[DUP]] to <2 x float>
+// CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot270.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> [[DUP_FLT]])
+// CHECK: ret <2 x float> [[RES]]
+float32x2_t test_vcmla_rot270_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t rhs) {
+  return vcmla_rot270_laneq_f32(acc, lhs, rhs, 1);
+}
+
+// CHECK-LABEL: @test_vcmlaq_rot270_lane_f32(
+// CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
+// CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> undef, i64 [[CPLX]], i32 0
+// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX_VEC]], <2 x i64> undef, <2 x i32> zeroinitializer
+// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
+// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
+// CHECK: ret <4 x float> [[RES]]
+float32x4_t test_vcmlaq_rot270_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
+  return vcmlaq_rot270_lane_f32(acc, lhs, rhs, 0);
+}
+
+// CHECK-LABEL: @test_vcmlaq_rot270_laneq_f32(
+// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
+// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <2 x i32> <i32 1, i32 1>
+// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
+// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
+// CHECK: ret <4 x float> [[RES]]
+float32x4_t test_vcmlaq_rot270_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
+  return vcmlaq_rot270_laneq_f32(acc, lhs, rhs, 1);
+}

diff  --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp
index 8ca5404508b1..ba952f220037 100644
--- a/clang/utils/TableGen/NeonEmitter.cpp
+++ b/clang/utils/TableGen/NeonEmitter.cpp
@@ -1690,14 +1690,18 @@ std::pair<Type, std::string> Intrinsic::DagEmitter::emitDagDup(DagInit *DI) {
 
 std::pair<Type, std::string> Intrinsic::DagEmitter::emitDagDupTyped(DagInit *DI) {
   assert_with_loc(DI->getNumArgs() == 2, "dup_typed() expects two arguments");
-  std::pair<Type, std::string> A =
-      emitDagArg(DI->getArg(0), std::string(DI->getArgNameStr(0)));
   std::pair<Type, std::string> B =
       emitDagArg(DI->getArg(1), std::string(DI->getArgNameStr(1)));
   assert_with_loc(B.first.isScalar(),
                   "dup_typed() requires a scalar as the second argument");
+  Type T;
+  // If the type argument is a constant string, construct the type directly.
+  if (StringInit *SI = dyn_cast<StringInit>(DI->getArg(0))) {
+    T = Type::fromTypedefName(SI->getAsUnquotedString());
+    assert_with_loc(!T.isVoid(), "Unknown typedef");
+  } else
+    T = emitDagArg(DI->getArg(0), std::string(DI->getArgNameStr(0))).first;
 
-  Type T = A.first;
   assert_with_loc(T.isVector(), "dup_typed() used but target type is scalar!");
   std::string S = "(" + T.str() + ") {";
   for (unsigned I = 0; I < T.getNumElements(); ++I) {