[llvm] 1fec215 - [ARM][AArch64] Cleanup and autogenerate v8.1a vqrdmlah tests. NFC

David Green via llvm-commits <llvm-commits at lists.llvm.org>
Thu Jan 27 10:43:12 PST 2022


Author: David Green
Date: 2022-01-27T18:43:06Z
New Revision: 1fec2154b29f84b53dd578b9f87f34e255630771

URL: https://github.com/llvm/llvm-project/commit/1fec2154b29f84b53dd578b9f87f34e255630771
DIFF: https://github.com/llvm/llvm-project/commit/1fec2154b29f84b53dd578b9f87f34e255630771.diff

LOG: [ARM][AArch64] Cleanup and autogenerate v8.1a vqrdmlah tests. NFC
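
The CHECK lines in the tests below are now generated by the update scripts
named in the NOTE lines rather than maintained by hand. As a rough sketch of
how such checks are regenerated (the build-tree path ./build is an assumption
for illustration, not part of this commit), the clang tests can be refreshed
with utils/update_cc_test_checks.py and the AArch64 llc test with
utils/update_llc_test_checks.py:

    # Assumed llvm-project checkout with a build tree in ./build; adjust paths to your setup.
    python3 llvm/utils/update_cc_test_checks.py --clang ./build/bin/clang \
        clang/test/CodeGen/aarch64-v8.1a-neon-intrinsics.c \
        clang/test/CodeGen/arm-v8.1a-neon-intrinsics.c
    python3 llvm/utils/update_llc_test_checks.py --llc-binary ./build/bin/llc \
        llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll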

Added: 
    

Modified: 
    clang/test/CodeGen/aarch64-v8.1a-neon-intrinsics.c
    clang/test/CodeGen/arm-v8.1a-neon-intrinsics.c
    llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll
    llvm/test/CodeGen/ARM/neon-v8.1a.ll

Removed: 
    


################################################################################
diff --git a/clang/test/CodeGen/aarch64-v8.1a-neon-intrinsics.c b/clang/test/CodeGen/aarch64-v8.1a-neon-intrinsics.c
index 7abfa3bda7281..ac583eddaecaa 100644
--- a/clang/test/CodeGen/aarch64-v8.1a-neon-intrinsics.c
+++ b/clang/test/CodeGen/aarch64-v8.1a-neon-intrinsics.c
@@ -1,198 +1,275 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon \
-// RUN:  -target-feature +v8.1a -S -emit-llvm -o - %s | FileCheck %s
+// RUN:  -target-feature +v8.1a -S -emit-llvm -disable-O0-optnone -o - %s | opt -mem2reg -dce -S | FileCheck %s
 
 // REQUIRES: aarch64-registered-target
 
  #include <arm_neon.h>
 
-// CHECK-LABEL: test_vqrdmlah_laneq_s16
+// CHECK-LABEL: @test_vqrdmlah_laneq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR4:[0-9]+]]
+// CHECK-NEXT:    [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[VQRDMULH_V2_I]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x i16> [[VQADD_V2_I]]
+//
 int16x4_t test_vqrdmlah_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
-// CHECK: shufflevector <8 x i16> {{%.*}}, <8 x i16> {{%.*}}, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK: call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> {{%.*}}, <4 x i16> {{%.*}})
-// CHECK: call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> {{%.*}}, <4 x i16> {{%.*}})
   return vqrdmlah_laneq_s16(a, b, v, 7);
 }
 
-// CHECK-LABEL: test_vqrdmlah_laneq_s32
+// CHECK-LABEL: @test_vqrdmlah_laneq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[VQRDMULH_V2_I]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <2 x i32> [[VQADD_V2_I]]
+//
 int32x2_t test_vqrdmlah_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
-// CHECK: shufflevector <4 x i32> {{%.*}}, <4 x i32> {{%.*}}, <2 x i32> <i32 3, i32 3>
-// CHECK: call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> {{%.*}}, <2 x i32> {{%.*}})
-// CHECK: call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> {{%.*}}, <2 x i32> {{%.*}})
   return vqrdmlah_laneq_s32(a, b, v, 3);
 }
 
-// CHECK-LABEL: test_vqrdmlahq_laneq_s16
+// CHECK-LABEL: @test_vqrdmlahq_laneq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[VQRDMULHQ_V2_I]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x i16> [[VQADDQ_V2_I]]
+//
 int16x8_t test_vqrdmlahq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
-// CHECK: shufflevector <8 x i16> {{%.*}}, <8 x i16> {{%.*}}, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK: call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> {{%.*}}, <8 x i16> {{%.*}})
-// CHECK: call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> {{%.*}}, <8 x i16> {{%.*}})
   return vqrdmlahq_laneq_s16(a, b, v, 7);
 }
 
-// CHECK-LABEL: test_vqrdmlahq_laneq_s32
+// CHECK-LABEL: @test_vqrdmlahq_laneq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQRDMULHQ_V2_I]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x i32> [[VQADDQ_V2_I]]
+//
 int32x4_t test_vqrdmlahq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
-// CHECK: shufflevector <4 x i32> {{%.*}}, <4 x i32> {{%.*}}, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> {{%.*}}, <4 x i32> {{%.*}})
-// CHECK: call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> {{%.*}}, <4 x i32> {{%.*}})
   return vqrdmlahq_laneq_s32(a, b, v, 3);
 }
 
-// CHECK-LABEL: test_vqrdmlahh_s16
+// CHECK-LABEL: @test_vqrdmlahh_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i64 0
+// CHECK-NEXT:    [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i64 0
+// CHECK-NEXT:    [[VQADDH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[TMP3]], <4 x i16> [[TMP4]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i16> [[VQADDH_S16_I]], i64 0
+// CHECK-NEXT:    ret i16 [[TMP5]]
+//
 int16_t test_vqrdmlahh_s16(int16_t a, int16_t b, int16_t c) {
-// CHECK: [[insb:%.*]] = insertelement <4 x i16> undef, i16 {{%.*}}, i64 0
-// CHECK: [[insc:%.*]] = insertelement <4 x i16> undef, i16 {{%.*}}, i64 0
-// CHECK: [[mul:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[insb]], <4 x i16> [[insc]])
-// CHECK: extractelement <4 x i16> [[mul]], i64 0
-// CHECK: [[insa:%.*]] = insertelement <4 x i16> undef, i16 {{%.*}}, i64 0
-// CHECK: [[insmul:%.*]] = insertelement <4 x i16> undef, i16 {{%.*}}, i64 0
-// CHECK: [[add:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[insa]], <4 x i16> [[insmul]])
-// CHECK: extractelement <4 x i16> [[add]], i64 0
   return vqrdmlahh_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vqrdmlahs_s32
+// CHECK-LABEL: @test_vqrdmlahs_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQADDS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 [[A:%.*]], i32 [[VQRDMULHS_S32_I]]) #[[ATTR4]]
+// CHECK-NEXT:    ret i32 [[VQADDS_S32_I]]
+//
 int32_t test_vqrdmlahs_s32(int32_t a, int32_t b, int32_t c) {
-// CHECK: call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 {{%.*}}, i32 {{%.*}})
-// CHECK: call i32 @llvm.aarch64.neon.sqadd.i32(i32 {{%.*}}, i32 {{%.*}})
   return vqrdmlahs_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vqrdmlahh_lane_s16
+// CHECK-LABEL: @test_vqrdmlahh_lane_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <4 x i16> [[C:%.*]], i32 3
+// CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
+// CHECK-NEXT:    [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i64 0
+// CHECK-NEXT:    [[VQADDH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[TMP3]], <4 x i16> [[TMP4]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i16> [[VQADDH_S16_I]], i64 0
+// CHECK-NEXT:    ret i16 [[TMP5]]
+//
 int16_t test_vqrdmlahh_lane_s16(int16_t a, int16_t b, int16x4_t c) {
-// CHECK: extractelement <4 x i16> {{%.*}}, i32 3
-// CHECK: [[insb:%.*]] = insertelement <4 x i16> undef, i16 {{%.*}}, i64 0
-// CHECK: [[insc:%.*]] = insertelement <4 x i16> undef, i16 {{%.*}}, i64 0
-// CHECK: [[mul:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[insb]], <4 x i16> [[insc]])
-// CHECK: extractelement <4 x i16> [[mul]], i64 0
-// CHECK: [[insa:%.*]] = insertelement <4 x i16> undef, i16 {{%.*}}, i64 0
-// CHECK: [[insmul:%.*]] = insertelement <4 x i16> undef, i16 {{%.*}}, i64 0
-// CHECK: [[add:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[insa]], <4 x i16> [[insmul]])
-// CHECK: extractelement <4 x i16> [[add]], i64 0
   return vqrdmlahh_lane_s16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vqrdmlahs_lane_s32
+// CHECK-LABEL: @test_vqrdmlahs_lane_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <2 x i32> [[C:%.*]], i32 1
+// CHECK-NEXT:    [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 [[B:%.*]], i32 [[VGET_LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQADDS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 [[A:%.*]], i32 [[VQRDMULHS_S32_I]]) #[[ATTR4]]
+// CHECK-NEXT:    ret i32 [[VQADDS_S32_I]]
+//
 int32_t test_vqrdmlahs_lane_s32(int32_t a, int32_t b, int32x2_t c) {
-// CHECK: extractelement <2 x i32> {{%.*}}, i32 1
-// CHECK: call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 {{%.*}}, i32 {{%.*}})
-// CHECK: call i32 @llvm.aarch64.neon.sqadd.i32(i32 {{%.*}}, i32 {{%.*}})
   return vqrdmlahs_lane_s32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vqrdmlahh_laneq_s16
+// CHECK-LABEL: @test_vqrdmlahh_laneq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[C:%.*]], i32 7
+// CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
+// CHECK-NEXT:    [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i64 0
+// CHECK-NEXT:    [[VQADDH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[TMP3]], <4 x i16> [[TMP4]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i16> [[VQADDH_S16_I]], i64 0
+// CHECK-NEXT:    ret i16 [[TMP5]]
+//
 int16_t test_vqrdmlahh_laneq_s16(int16_t a, int16_t b, int16x8_t c) {
-// CHECK: extractelement <8 x i16> {{%.*}}, i32 7
-// CHECK: [[insb:%.*]] = insertelement <4 x i16> undef, i16 {{%.*}}, i64 0
-// CHECK: [[insc:%.*]] = insertelement <4 x i16> undef, i16 {{%.*}}, i64 0
-// CHECK: [[mul:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[insb]], <4 x i16> [[insc]])
-// CHECK: extractelement <4 x i16> [[mul]], i64 0
-// CHECK: [[insa:%.*]] = insertelement <4 x i16> undef, i16 {{%.*}}, i64 0
-// CHECK: [[insmul:%.*]] = insertelement <4 x i16> undef, i16 {{%.*}}, i64 0
-// CHECK: [[add:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[insa]], <4 x i16> [[insmul]])
-// CHECK: extractelement <4 x i16> [[add]], i64 0
   return vqrdmlahh_laneq_s16(a, b, c, 7);
 }
 
-// CHECK-LABEL: test_vqrdmlahs_laneq_s32
+// CHECK-LABEL: @test_vqrdmlahs_laneq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 3
+// CHECK-NEXT:    [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 [[B:%.*]], i32 [[VGETQ_LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQADDS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 [[A:%.*]], i32 [[VQRDMULHS_S32_I]]) #[[ATTR4]]
+// CHECK-NEXT:    ret i32 [[VQADDS_S32_I]]
+//
 int32_t test_vqrdmlahs_laneq_s32(int32_t a, int32_t b, int32x4_t c) {
-// CHECK: extractelement <4 x i32> {{%.*}}, i32 3
-// CHECK: call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 {{%.*}}, i32 {{%.*}})
-// CHECK: call i32 @llvm.aarch64.neon.sqadd.i32(i32 {{%.*}}, i32 {{%.*}})
   return vqrdmlahs_laneq_s32(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vqrdmlsh_laneq_s16
+// CHECK-LABEL: @test_vqrdmlsh_laneq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[VQRDMULH_V2_I]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x i16> [[VQSUB_V2_I]]
+//
 int16x4_t test_vqrdmlsh_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
-// CHECK: shufflevector <8 x i16> {{%.*}}, <8 x i16> {{%.*}}, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK: call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> {{%.*}}, <4 x i16> {{%.*}})
-// CHECK: call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> {{%.*}}, <4 x i16> {{%.*}})
   return vqrdmlsh_laneq_s16(a, b, v, 7);
 }
 
-// CHECK-LABEL: test_vqrdmlsh_laneq_s32
+// CHECK-LABEL: @test_vqrdmlsh_laneq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[VQRDMULH_V2_I]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <2 x i32> [[VQSUB_V2_I]]
+//
 int32x2_t test_vqrdmlsh_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
-// CHECK: shufflevector <4 x i32> {{%.*}}, <4 x i32> {{%.*}}, <2 x i32> <i32 3, i32 3>
-// CHECK: call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> {{%.*}}, <2 x i32> {{%.*}})
-// CHECK: call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> {{%.*}}, <2 x i32> {{%.*}})
   return vqrdmlsh_laneq_s32(a, b, v, 3);
 }
 
-// CHECK-LABEL: test_vqrdmlshq_laneq_s16
+// CHECK-LABEL: @test_vqrdmlshq_laneq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[VQRDMULHQ_V2_I]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x i16> [[VQSUBQ_V2_I]]
+//
 int16x8_t test_vqrdmlshq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
-// CHECK: shufflevector <8 x i16> {{%.*}}, <8 x i16> {{%.*}}, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK: call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> {{%.*}}, <8 x i16> {{%.*}})
-// CHECK: call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> {{%.*}}, <8 x i16> {{%.*}})
   return vqrdmlshq_laneq_s16(a, b, v, 7);
 }
 
-// CHECK-LABEL: test_vqrdmlshq_laneq_s32
+// CHECK-LABEL: @test_vqrdmlshq_laneq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQRDMULHQ_V2_I]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x i32> [[VQSUBQ_V2_I]]
+//
 int32x4_t test_vqrdmlshq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
-// CHECK: shufflevector <4 x i32> {{%.*}}, <4 x i32> {{%.*}}, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> {{%.*}}, <4 x i32> {{%.*}})
-// CHECK: call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> {{%.*}}, <4 x i32> {{%.*}})
   return vqrdmlshq_laneq_s32(a, b, v, 3);
 }
 
-// CHECK-LABEL: test_vqrdmlshh_s16
+// CHECK-LABEL: @test_vqrdmlshh_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i64 0
+// CHECK-NEXT:    [[VQRDMULHH_S16_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I_I]], i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i64 0
+// CHECK-NEXT:    [[VQSUBH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[TMP3]], <4 x i16> [[TMP4]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i16> [[VQSUBH_S16_I]], i64 0
+// CHECK-NEXT:    ret i16 [[TMP5]]
+//
 int16_t test_vqrdmlshh_s16(int16_t a, int16_t b, int16_t c) {
-// CHECK: [[insb:%.*]] = insertelement <4 x i16> undef, i16 {{%.*}}, i64 0
-// CHECK: [[insc:%.*]] = insertelement <4 x i16> undef, i16 {{%.*}}, i64 0
-// CHECK: [[mul:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[insb]], <4 x i16> [[insc]])
-// CHECK: extractelement <4 x i16> [[mul]], i64 0
-// CHECK: [[insa:%.*]] = insertelement <4 x i16> undef, i16 {{%.*}}, i64 0
-// CHECK: [[insmul:%.*]] = insertelement <4 x i16> undef, i16 {{%.*}}, i64 0
-// CHECK: [[sub:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[insa]], <4 x i16> [[insmul]])
-// CHECK: extractelement <4 x i16> [[sub]], i64 0
   return vqrdmlshh_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vqrdmlshs_s32
+// CHECK-LABEL: @test_vqrdmlshs_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VQRDMULHS_S32_I_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQSUBS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 [[A:%.*]], i32 [[VQRDMULHS_S32_I_I]]) #[[ATTR4]]
+// CHECK-NEXT:    ret i32 [[VQSUBS_S32_I]]
+//
 int32_t test_vqrdmlshs_s32(int32_t a, int32_t b, int32_t c) {
-// CHECK: call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 {{%.*}}, i32 {{%.*}})
-// CHECK: call i32 @llvm.aarch64.neon.sqsub.i32(i32 {{%.*}}, i32 {{%.*}})
   return vqrdmlshs_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vqrdmlshh_lane_s16
+// CHECK-LABEL: @test_vqrdmlshh_lane_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <4 x i16> [[C:%.*]], i32 3
+// CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
+// CHECK-NEXT:    [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i64 0
+// CHECK-NEXT:    [[VQSUBH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[TMP3]], <4 x i16> [[TMP4]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i16> [[VQSUBH_S16_I]], i64 0
+// CHECK-NEXT:    ret i16 [[TMP5]]
+//
 int16_t test_vqrdmlshh_lane_s16(int16_t a, int16_t b, int16x4_t c) {
-// CHECK: extractelement <4 x i16> {{%.*}}, i32 3
-// CHECK: [[insb:%.*]] = insertelement <4 x i16> undef, i16 {{%.*}}, i64 0
-// CHECK: [[insc:%.*]] = insertelement <4 x i16> undef, i16 {{%.*}}, i64 0
-// CHECK: [[mul:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[insb]], <4 x i16> [[insc]])
-// CHECK: extractelement <4 x i16> [[mul]], i64 0
-// CHECK: [[insa:%.*]] = insertelement <4 x i16> undef, i16 {{%.*}}, i64 0
-// CHECK: [[insmul:%.*]] = insertelement <4 x i16> undef, i16 {{%.*}}, i64 0
-// CHECK: [[sub:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[insa]], <4 x i16> [[insmul]])
-// CHECK: extractelement <4 x i16> [[sub]], i64 0
   return vqrdmlshh_lane_s16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vqrdmlshs_lane_s32
+// CHECK-LABEL: @test_vqrdmlshs_lane_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <2 x i32> [[C:%.*]], i32 1
+// CHECK-NEXT:    [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 [[B:%.*]], i32 [[VGET_LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQSUBS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 [[A:%.*]], i32 [[VQRDMULHS_S32_I]]) #[[ATTR4]]
+// CHECK-NEXT:    ret i32 [[VQSUBS_S32_I]]
+//
 int32_t test_vqrdmlshs_lane_s32(int32_t a, int32_t b, int32x2_t c) {
-// CHECK: extractelement <2 x i32> {{%.*}}, i32 1
-// CHECK: call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 {{%.*}}, i32 {{%.*}})
-// CHECK: call i32 @llvm.aarch64.neon.sqsub.i32(i32 {{%.*}}, i32 {{%.*}})
   return vqrdmlshs_lane_s32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vqrdmlshh_laneq_s16
+// CHECK-LABEL: @test_vqrdmlshh_laneq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[C:%.*]], i32 7
+// CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
+// CHECK-NEXT:    [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i64 0
+// CHECK-NEXT:    [[VQSUBH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[TMP3]], <4 x i16> [[TMP4]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i16> [[VQSUBH_S16_I]], i64 0
+// CHECK-NEXT:    ret i16 [[TMP5]]
+//
 int16_t test_vqrdmlshh_laneq_s16(int16_t a, int16_t b, int16x8_t c) {
-// CHECK: extractelement <8 x i16> {{%.*}}, i32 7
-// CHECK: [[insb:%.*]] = insertelement <4 x i16> undef, i16 {{%.*}}, i64 0
-// CHECK: [[insc:%.*]] = insertelement <4 x i16> undef, i16 {{%.*}}, i64 0
-// CHECK: [[mul:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[insb]], <4 x i16> [[insc]])
-// CHECK: extractelement <4 x i16> [[mul]], i64 0
-// CHECK: [[insa:%.*]] = insertelement <4 x i16> undef, i16 {{%.*}}, i64 0
-// CHECK: [[insmul:%.*]] = insertelement <4 x i16> undef, i16 {{%.*}}, i64 0
-// CHECK: [[sub:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[insa]], <4 x i16> [[insmul]])
-// CHECK: extractelement <4 x i16> [[sub]], i64 0
   return vqrdmlshh_laneq_s16(a, b, c, 7);
 }
 
-// CHECK-LABEL: test_vqrdmlshs_laneq_s32
+// CHECK-LABEL: @test_vqrdmlshs_laneq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 3
+// CHECK-NEXT:    [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 [[B:%.*]], i32 [[VGETQ_LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQSUBS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 [[A:%.*]], i32 [[VQRDMULHS_S32_I]]) #[[ATTR4]]
+// CHECK-NEXT:    ret i32 [[VQSUBS_S32_I]]
+//
 int32_t test_vqrdmlshs_laneq_s32(int32_t a, int32_t b, int32x4_t c) {
-// CHECK: extractelement <4 x i32> {{%.*}}, i32 3
-// CHECK: call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 {{%.*}}, i32 {{%.*}})
-// CHECK: call i32 @llvm.aarch64.neon.sqsub.i32(i32 {{%.*}}, i32 {{%.*}})
   return vqrdmlshs_laneq_s32(a, b, c, 3);
 }

diff --git a/clang/test/CodeGen/arm-v8.1a-neon-intrinsics.c b/clang/test/CodeGen/arm-v8.1a-neon-intrinsics.c
index 5462c17a1cc50..194b4863e33fe 100644
--- a/clang/test/CodeGen/arm-v8.1a-neon-intrinsics.c
+++ b/clang/test/CodeGen/arm-v8.1a-neon-intrinsics.c
@@ -1,187 +1,332 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // RUN: %clang_cc1 -triple armv8.1a-linux-gnu -target-abi apcs-gnu -target-feature +neon \
-// RUN:  -S -emit-llvm -o - %s \
-// RUN:  | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM
+// RUN:  -S -emit-llvm -o - %s -disable-O0-optnone | opt -mem2reg -dce -S \
+// RUN:  | FileCheck %s --check-prefix=CHECK-ARM
 
 // RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon \
-// RUN:  -target-feature +v8.1a -S -emit-llvm -o - %s \
-// RUN:  | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AARCH64
+// RUN:  -target-feature +v8.1a -S -emit-llvm -o - %s -disable-O0-optnone | opt -mem2reg -dce -S \
+// RUN:  | FileCheck %s --check-prefix=CHECK-AARCH64
 
 // REQUIRES: arm-registered-target,aarch64-registered-target
 
 #include <arm_neon.h>
 
-// CHECK-LABEL: test_vqrdmlah_s16
+// CHECK-ARM-LABEL: @test_vqrdmlah_s16(
+// CHECK-ARM-NEXT:  entry:
+// CHECK-ARM-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR4:[0-9]+]]
+// CHECK-ARM-NEXT:    [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[VQRDMULH_V2_I]]) #[[ATTR4]]
+// CHECK-ARM-NEXT:    ret <4 x i16> [[VQADD_V2_I]]
+//
+// CHECK-AARCH64-LABEL: @test_vqrdmlah_s16(
+// CHECK-AARCH64-NEXT:  entry:
+// CHECK-AARCH64-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR3:[0-9]+]]
+// CHECK-AARCH64-NEXT:    [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[VQRDMULH_V2_I]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT:    ret <4 x i16> [[VQADD_V2_I]]
+//
 int16x4_t test_vqrdmlah_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
-// CHECK-ARM: call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> {{%.*}}, <4 x i16> {{%.*}})
-// CHECK-ARM: call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> {{%.*}}, <4 x i16> {{%.*}})
 
-// CHECK-AARCH64: call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> {{%.*}}, <4 x i16> {{%.*}})
-// CHECK-AARCH64: call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> {{%.*}}, <4 x i16> {{%.*}})
   return vqrdmlah_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vqrdmlah_s32
+// CHECK-ARM-LABEL: @test_vqrdmlah_s32(
+// CHECK-ARM-NEXT:  entry:
+// CHECK-ARM-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR4]]
+// CHECK-ARM-NEXT:    [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[VQRDMULH_V2_I]]) #[[ATTR4]]
+// CHECK-ARM-NEXT:    ret <2 x i32> [[VQADD_V2_I]]
+//
+// CHECK-AARCH64-LABEL: @test_vqrdmlah_s32(
+// CHECK-AARCH64-NEXT:  entry:
+// CHECK-AARCH64-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT:    [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[VQRDMULH_V2_I]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT:    ret <2 x i32> [[VQADD_V2_I]]
+//
 int32x2_t test_vqrdmlah_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
-// CHECK-ARM: call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> {{%.*}}, <2 x i32> {{%.*}})
-// CHECK-ARM: call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> {{%.*}}, <2 x i32> {{%.*}})
 
-// CHECK-AARCH64: call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> {{%.*}}, <2 x i32> {{%.*}})
-// CHECK-AARCH64: call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> {{%.*}}, <2 x i32> {{%.*}})
   return vqrdmlah_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vqrdmlahq_s16
+// CHECK-ARM-LABEL: @test_vqrdmlahq_s16(
+// CHECK-ARM-NEXT:  entry:
+// CHECK-ARM-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR4]]
+// CHECK-ARM-NEXT:    [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[VQRDMULHQ_V2_I]]) #[[ATTR4]]
+// CHECK-ARM-NEXT:    ret <8 x i16> [[VQADDQ_V2_I]]
+//
+// CHECK-AARCH64-LABEL: @test_vqrdmlahq_s16(
+// CHECK-AARCH64-NEXT:  entry:
+// CHECK-AARCH64-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT:    [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[VQRDMULHQ_V2_I]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT:    ret <8 x i16> [[VQADDQ_V2_I]]
+//
 int16x8_t test_vqrdmlahq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
-// CHECK-ARM: call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> {{%.*}}, <8 x i16> {{%.*}})
-// CHECK-ARM: call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> {{%.*}}, <8 x i16> {{%.*}})
 
-// CHECK-AARCH64: call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> {{%.*}}, <8 x i16> {{%.*}})
-// CHECK-AARCH64: call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> {{%.*}}, <8 x i16> {{%.*}})
   return vqrdmlahq_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vqrdmlahq_s32
+// CHECK-ARM-LABEL: @test_vqrdmlahq_s32(
+// CHECK-ARM-NEXT:  entry:
+// CHECK-ARM-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR4]]
+// CHECK-ARM-NEXT:    [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQRDMULHQ_V2_I]]) #[[ATTR4]]
+// CHECK-ARM-NEXT:    ret <4 x i32> [[VQADDQ_V2_I]]
+//
+// CHECK-AARCH64-LABEL: @test_vqrdmlahq_s32(
+// CHECK-AARCH64-NEXT:  entry:
+// CHECK-AARCH64-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT:    [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQRDMULHQ_V2_I]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT:    ret <4 x i32> [[VQADDQ_V2_I]]
+//
 int32x4_t test_vqrdmlahq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
-// CHECK-ARM: call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> {{%.*}}, <4 x i32> {{%.*}})
-// CHECK-ARM: call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> {{%.*}}, <4 x i32> {{%.*}})
 
-// CHECK-AARCH64: call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> {{%.*}}, <4 x i32> {{%.*}})
-// CHECK-AARCH64: call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> {{%.*}}, <4 x i32> {{%.*}})
   return vqrdmlahq_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vqrdmlah_lane_s16
+// CHECK-ARM-LABEL: @test_vqrdmlah_lane_s16(
+// CHECK-ARM-NEXT:  entry:
+// CHECK-ARM-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
+// CHECK-ARM-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-ARM-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-ARM-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-ARM-NEXT:    [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[VQRDMULH_V2_I]]) #[[ATTR4]]
+// CHECK-ARM-NEXT:    ret <4 x i16> [[VQADD_V2_I]]
+//
+// CHECK-AARCH64-LABEL: @test_vqrdmlah_lane_s16(
+// CHECK-AARCH64-NEXT:  entry:
+// CHECK-AARCH64-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
+// CHECK-AARCH64-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-AARCH64-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-AARCH64-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT:    [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[VQRDMULH_V2_I]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT:    ret <4 x i16> [[VQADD_V2_I]]
+//
 int16x4_t test_vqrdmlah_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
-// CHECK-ARM: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-ARM: call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> {{%.*}}, <4 x i16> {{%.*}})
-// CHECK-ARM: call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> {{%.*}}, <4 x i16> {{%.*}})
 
-// CHECK-AARCH64: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-AARCH64: call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> {{%.*}}, <4 x i16> {{%.*}})
-// CHECK-AARCH64: call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> {{%.*}}, <4 x i16> {{%.*}})
   return vqrdmlah_lane_s16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vqrdmlah_lane_s32
+// CHECK-ARM-LABEL: @test_vqrdmlah_lane_s32(
+// CHECK-ARM-NEXT:  entry:
+// CHECK-ARM-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
+// CHECK-ARM-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-ARM-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-ARM-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-ARM-NEXT:    [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[VQRDMULH_V2_I]]) #[[ATTR4]]
+// CHECK-ARM-NEXT:    ret <2 x i32> [[VQADD_V2_I]]
+//
+// CHECK-AARCH64-LABEL: @test_vqrdmlah_lane_s32(
+// CHECK-AARCH64-NEXT:  entry:
+// CHECK-AARCH64-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
+// CHECK-AARCH64-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-AARCH64-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-AARCH64-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT:    [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[VQRDMULH_V2_I]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT:    ret <2 x i32> [[VQADD_V2_I]]
+//
 int32x2_t test_vqrdmlah_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
-// CHECK-ARM: shufflevector <2 x i32> {{%.*}}, <2 x i32> {{%.*}}, <2 x i32> <i32 1, i32 1>
-// CHECK-ARM: call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> {{%.*}}, <2 x i32> {{%.*}})
-// CHECK-ARM: call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> {{%.*}}, <2 x i32> {{%.*}})
 
-// CHECK-AARCH64: shufflevector <2 x i32> {{%.*}}, <2 x i32> {{%.*}}, <2 x i32> <i32 1, i32 1>
-// CHECK-AARCH64: call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> {{%.*}}, <2 x i32> {{%.*}})
-// CHECK-AARCH64: call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> {{%.*}}, <2 x i32> {{%.*}})
   return vqrdmlah_lane_s32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vqrdmlahq_lane_s16
+// CHECK-ARM-LABEL: @test_vqrdmlahq_lane_s16(
+// CHECK-ARM-NEXT:  entry:
+// CHECK-ARM-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
+// CHECK-ARM-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-ARM-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-ARM-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-ARM-NEXT:    [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[VQRDMULHQ_V2_I]]) #[[ATTR4]]
+// CHECK-ARM-NEXT:    ret <8 x i16> [[VQADDQ_V2_I]]
+//
+// CHECK-AARCH64-LABEL: @test_vqrdmlahq_lane_s16(
+// CHECK-AARCH64-NEXT:  entry:
+// CHECK-AARCH64-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
+// CHECK-AARCH64-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-AARCH64-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-AARCH64-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT:    [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[VQRDMULHQ_V2_I]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT:    ret <8 x i16> [[VQADDQ_V2_I]]
+//
 int16x8_t test_vqrdmlahq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
-// CHECK-ARM: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK-ARM: call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> {{%.*}}, <8 x i16> {{%.*}})
-// CHECK-ARM: call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> {{%.*}}, <8 x i16> {{%.*}})
 
-// CHECK-AARCH64: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK-AARCH64: call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> {{%.*}}, <8 x i16> {{%.*}})
-// CHECK-AARCH64: call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> {{%.*}}, <8 x i16> {{%.*}})
   return vqrdmlahq_lane_s16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vqrdmlahq_lane_s32
+// CHECK-ARM-LABEL: @test_vqrdmlahq_lane_s32(
+// CHECK-ARM-NEXT:  entry:
+// CHECK-ARM-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
+// CHECK-ARM-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-ARM-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-ARM-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-ARM-NEXT:    [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQRDMULHQ_V2_I]]) #[[ATTR4]]
+// CHECK-ARM-NEXT:    ret <4 x i32> [[VQADDQ_V2_I]]
+//
+// CHECK-AARCH64-LABEL: @test_vqrdmlahq_lane_s32(
+// CHECK-AARCH64-NEXT:  entry:
+// CHECK-AARCH64-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
+// CHECK-AARCH64-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-AARCH64-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-AARCH64-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT:    [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQRDMULHQ_V2_I]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT:    ret <4 x i32> [[VQADDQ_V2_I]]
+//
 int32x4_t test_vqrdmlahq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) {
-// CHECK-ARM: shufflevector <2 x i32> {{%.*}}, <2 x i32> {{%.*}}, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK-ARM: call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> {{%.*}}, <4 x i32> {{%.*}})
-// CHECK-ARM: call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> {{%.*}}, <4 x i32> {{%.*}})
 
-// CHECK-AARCH64: shufflevector <2 x i32> {{%.*}}, <2 x i32> {{%.*}}, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK-AARCH64: call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> {{%.*}}, <4 x i32> {{%.*}})
-// CHECK-AARCH64: call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> {{%.*}}, <4 x i32> {{%.*}})
   return vqrdmlahq_lane_s32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vqrdmlsh_s16
+// CHECK-ARM-LABEL: @test_vqrdmlsh_s16(
+// CHECK-ARM-NEXT:  entry:
+// CHECK-ARM-NEXT:    [[VQRDMULH_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR4]]
+// CHECK-ARM-NEXT:    [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[VQRDMULH_V2_I_I]]) #[[ATTR4]]
+// CHECK-ARM-NEXT:    ret <4 x i16> [[VQSUB_V2_I]]
+//
+// CHECK-AARCH64-LABEL: @test_vqrdmlsh_s16(
+// CHECK-AARCH64-NEXT:  entry:
+// CHECK-AARCH64-NEXT:    [[VQRDMULH_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT:    [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[VQRDMULH_V2_I_I]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT:    ret <4 x i16> [[VQSUB_V2_I]]
+//
 int16x4_t test_vqrdmlsh_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
-// CHECK-ARM: call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> {{%.*}}, <4 x i16> {{%.*}})
-// CHECK-ARM: call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> {{%.*}}, <4 x i16> {{%.*}})
 
-// CHECK-AARCH64: call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> {{%.*}}, <4 x i16> {{%.*}})
-// CHECK-AARCH64: call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> {{%.*}}, <4 x i16> {{%.*}})
   return vqrdmlsh_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vqrdmlsh_s32
+// CHECK-ARM-LABEL: @test_vqrdmlsh_s32(
+// CHECK-ARM-NEXT:  entry:
+// CHECK-ARM-NEXT:    [[VQRDMULH_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR4]]
+// CHECK-ARM-NEXT:    [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[VQRDMULH_V2_I_I]]) #[[ATTR4]]
+// CHECK-ARM-NEXT:    ret <2 x i32> [[VQSUB_V2_I]]
+//
+// CHECK-AARCH64-LABEL: @test_vqrdmlsh_s32(
+// CHECK-AARCH64-NEXT:  entry:
+// CHECK-AARCH64-NEXT:    [[VQRDMULH_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT:    [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[VQRDMULH_V2_I_I]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT:    ret <2 x i32> [[VQSUB_V2_I]]
+//
 int32x2_t test_vqrdmlsh_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
-// CHECK-ARM: call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> {{%.*}}, <2 x i32> {{%.*}})
-// CHECK-ARM: call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> {{%.*}}, <2 x i32> {{%.*}})
 
-// CHECK-AARCH64: call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> {{%.*}}, <2 x i32> {{%.*}})
-// CHECK-AARCH64: call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> {{%.*}}, <2 x i32> {{%.*}})
   return vqrdmlsh_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vqrdmlshq_s16
+// CHECK-ARM-LABEL: @test_vqrdmlshq_s16(
+// CHECK-ARM-NEXT:  entry:
+// CHECK-ARM-NEXT:    [[VQRDMULHQ_V2_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR4]]
+// CHECK-ARM-NEXT:    [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[VQRDMULHQ_V2_I_I]]) #[[ATTR4]]
+// CHECK-ARM-NEXT:    ret <8 x i16> [[VQSUBQ_V2_I]]
+//
+// CHECK-AARCH64-LABEL: @test_vqrdmlshq_s16(
+// CHECK-AARCH64-NEXT:  entry:
+// CHECK-AARCH64-NEXT:    [[VQRDMULHQ_V2_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT:    [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[VQRDMULHQ_V2_I_I]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT:    ret <8 x i16> [[VQSUBQ_V2_I]]
+//
 int16x8_t test_vqrdmlshq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
-// CHECK-ARM: call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> {{%.*}}, <8 x i16> {{%.*}})
-// CHECK-ARM: call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> {{%.*}}, <8 x i16> {{%.*}})
 
-// CHECK-AARCH64: call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> {{%.*}}, <8 x i16> {{%.*}})
-// CHECK-AARCH64: call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> {{%.*}}, <8 x i16> {{%.*}})
   return vqrdmlshq_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vqrdmlshq_s32
+// CHECK-ARM-LABEL: @test_vqrdmlshq_s32(
+// CHECK-ARM-NEXT:  entry:
+// CHECK-ARM-NEXT:    [[VQRDMULHQ_V2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR4]]
+// CHECK-ARM-NEXT:    [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQRDMULHQ_V2_I_I]]) #[[ATTR4]]
+// CHECK-ARM-NEXT:    ret <4 x i32> [[VQSUBQ_V2_I]]
+//
+// CHECK-AARCH64-LABEL: @test_vqrdmlshq_s32(
+// CHECK-AARCH64-NEXT:  entry:
+// CHECK-AARCH64-NEXT:    [[VQRDMULHQ_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT:    [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQRDMULHQ_V2_I_I]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT:    ret <4 x i32> [[VQSUBQ_V2_I]]
+//
 int32x4_t test_vqrdmlshq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
-// CHECK-ARM: call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> {{%.*}}, <4 x i32> {{%.*}})
-// CHECK-ARM: call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> {{%.*}}, <4 x i32> {{%.*}})
 
-// CHECK-AARCH64: call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> {{%.*}}, <4 x i32> {{%.*}})
-// CHECK-AARCH64: call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> {{%.*}}, <4 x i32> {{%.*}})
   return vqrdmlshq_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vqrdmlsh_lane_s16
+// CHECK-ARM-LABEL: @test_vqrdmlsh_lane_s16(
+// CHECK-ARM-NEXT:  entry:
+// CHECK-ARM-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
+// CHECK-ARM-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-ARM-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-ARM-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-ARM-NEXT:    [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[VQRDMULH_V2_I]]) #[[ATTR4]]
+// CHECK-ARM-NEXT:    ret <4 x i16> [[VQSUB_V2_I]]
+//
+// CHECK-AARCH64-LABEL: @test_vqrdmlsh_lane_s16(
+// CHECK-AARCH64-NEXT:  entry:
+// CHECK-AARCH64-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
+// CHECK-AARCH64-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-AARCH64-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-AARCH64-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT:    [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[VQRDMULH_V2_I]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT:    ret <4 x i16> [[VQSUB_V2_I]]
+//
 int16x4_t test_vqrdmlsh_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
-// CHECK-ARM: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-ARM: call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> {{%.*}}, <4 x i16> {{%.*}})
-// CHECK-ARM: call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> {{%.*}}, <4 x i16> {{%.*}})
 
-// CHECK-AARCH64: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-AARCH64: call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> {{%.*}}, <4 x i16> {{%.*}})
-// CHECK-AARCH64: call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> {{%.*}}, <4 x i16> {{%.*}})
   return vqrdmlsh_lane_s16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vqrdmlsh_lane_s32
+// CHECK-ARM-LABEL: @test_vqrdmlsh_lane_s32(
+// CHECK-ARM-NEXT:  entry:
+// CHECK-ARM-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
+// CHECK-ARM-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-ARM-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-ARM-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-ARM-NEXT:    [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[VQRDMULH_V2_I]]) #[[ATTR4]]
+// CHECK-ARM-NEXT:    ret <2 x i32> [[VQSUB_V2_I]]
+//
+// CHECK-AARCH64-LABEL: @test_vqrdmlsh_lane_s32(
+// CHECK-AARCH64-NEXT:  entry:
+// CHECK-AARCH64-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
+// CHECK-AARCH64-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-AARCH64-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-AARCH64-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT:    [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[VQRDMULH_V2_I]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT:    ret <2 x i32> [[VQSUB_V2_I]]
+//
 int32x2_t test_vqrdmlsh_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
-// CHECK-ARM: shufflevector <2 x i32> {{%.*}}, <2 x i32> {{%.*}}, <2 x i32> <i32 1, i32 1>
-// CHECK-ARM: call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> {{%.*}}, <2 x i32> {{%.*}})
-// CHECK-ARM: call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> {{%.*}}, <2 x i32> {{%.*}})
 
-// CHECK-AARCH64: shufflevector <2 x i32> {{%.*}}, <2 x i32> {{%.*}}, <2 x i32> <i32 1, i32 1>
-// CHECK-AARCH64: call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> {{%.*}}, <2 x i32> {{%.*}})
-// CHECK-AARCH64: call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> {{%.*}}, <2 x i32> {{%.*}})
   return vqrdmlsh_lane_s32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vqrdmlshq_lane_s16
+// CHECK-ARM-LABEL: @test_vqrdmlshq_lane_s16(
+// CHECK-ARM-NEXT:  entry:
+// CHECK-ARM-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
+// CHECK-ARM-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-ARM-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-ARM-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-ARM-NEXT:    [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[VQRDMULHQ_V2_I]]) #[[ATTR4]]
+// CHECK-ARM-NEXT:    ret <8 x i16> [[VQSUBQ_V2_I]]
+//
+// CHECK-AARCH64-LABEL: @test_vqrdmlshq_lane_s16(
+// CHECK-AARCH64-NEXT:  entry:
+// CHECK-AARCH64-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
+// CHECK-AARCH64-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-AARCH64-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-AARCH64-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT:    [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[VQRDMULHQ_V2_I]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT:    ret <8 x i16> [[VQSUBQ_V2_I]]
+//
 int16x8_t test_vqrdmlshq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
-// CHECK-ARM: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK-ARM: call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> {{%.*}}, <8 x i16> {{%.*}})
-// CHECK-ARM: call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> {{%.*}}, <8 x i16> {{%.*}})
 
-// CHECK-AARCH64: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK-AARCH64: call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> {{%.*}}, <8 x i16> {{%.*}})
-// CHECK-AARCH64: call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> {{%.*}}, <8 x i16> {{%.*}})
   return vqrdmlshq_lane_s16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vqrdmlshq_lane_s32
+// CHECK-ARM-LABEL: @test_vqrdmlshq_lane_s32(
+// CHECK-ARM-NEXT:  entry:
+// CHECK-ARM-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
+// CHECK-ARM-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-ARM-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-ARM-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-ARM-NEXT:    [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQRDMULHQ_V2_I]]) #[[ATTR4]]
+// CHECK-ARM-NEXT:    ret <4 x i32> [[VQSUBQ_V2_I]]
+//
+// CHECK-AARCH64-LABEL: @test_vqrdmlshq_lane_s32(
+// CHECK-AARCH64-NEXT:  entry:
+// CHECK-AARCH64-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
+// CHECK-AARCH64-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-AARCH64-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-AARCH64-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT:    [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQRDMULHQ_V2_I]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT:    ret <4 x i32> [[VQSUBQ_V2_I]]
+//
 int32x4_t test_vqrdmlshq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) {
-// CHECK-ARM: shufflevector <2 x i32> {{%.*}}, <2 x i32> {{%.*}}, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK-ARM: call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> {{%.*}}, <4 x i32> {{%.*}})
-// CHECK-ARM: call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> {{%.*}}, <4 x i32> {{%.*}})
 
-// CHECK-AARCH64: shufflevector <2 x i32> {{%.*}}, <2 x i32> {{%.*}}, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK-AARCH64: call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> {{%.*}}, <4 x i32> {{%.*}})
-// CHECK-AARCH64: call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> {{%.*}}, <4 x i32> {{%.*}})
   return vqrdmlshq_lane_s32(a, b, c, 1);
 }

diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll b/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll
index 3e15bd85cd219..3efa931404c15 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll
@@ -1,30 +1,24 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK-V8a
-; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mattr=+rdm -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK-V81a
-; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mcpu=falkor -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK-V81a
-; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mattr=+v8.1a -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK-V81a
-; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mcpu=saphira -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK-V81a
-; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mattr=+v8.1a -aarch64-neon-syntax=apple | FileCheck %s --check-prefix=CHECK-V81a-apple
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mattr=+rdm | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mattr=+v8.1a | FileCheck %s
 
 declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
 declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>)
 declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
 declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
 declare i32 @llvm.aarch64.neon.sqrdmulh.i32(i32, i32)
-declare i16 @llvm.aarch64.neon.sqrdmulh.i16(i16, i16)
 
 declare <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16>, <4 x i16>)
 declare <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16>, <8 x i16>)
 declare <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32>, <2 x i32>)
 declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
 declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32)
-declare i16 @llvm.aarch64.neon.sqadd.i16(i16, i16)
 
 declare <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16>, <4 x i16>)
 declare <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16>, <8 x i16>)
 declare <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32>, <2 x i32>)
 declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
 declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32)
-declare i16 @llvm.aarch64.neon.sqsub.i16(i16, i16)
 
 ;-----------------------------------------------------------------------------
 ; RDMA Vector
@@ -32,81 +26,81 @@ declare i16 @llvm.aarch64.neon.sqsub.i16(i16, i16)
 
 define <4 x i16> @test_sqrdmlah_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
 ; CHECK-LABEL: test_sqrdmlah_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sqrdmlah v0.4h, v1.4h, v2.4h
+; CHECK-NEXT:    ret
    %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs,  <4 x i16> %rhs)
    %retval =  call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc,  <4 x i16> %prod)
-; CHECK-V8a:        sqrdmulh    v1.4h, v1.4h, v2.4h
-; CHECK-V81a:       sqrdmlah    v0.4h, v1.4h, v2.4h
-; CHECK-V81a-apple: sqrdmlah.4h v0,    v1,    v2
    ret <4 x i16> %retval
 }
 
 define <8 x i16> @test_sqrdmlah_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
 ; CHECK-LABEL: test_sqrdmlah_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sqrdmlah v0.8h, v1.8h, v2.8h
+; CHECK-NEXT:    ret
    %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
    %retval =  call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %prod)
-; CHECK-V8a:        sqrdmulh    v1.8h, v1.8h, v2.8h
-; CHECK-V81a:       sqrdmlah    v0.8h, v1.8h, v2.8h
-; CHECK-V81a-apple: sqrdmlah.8h v0, v1, v2
    ret <8 x i16> %retval
 }
 
 define <2 x i32> @test_sqrdmlah_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
 ; CHECK-LABEL: test_sqrdmlah_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sqrdmlah v0.2s, v1.2s, v2.2s
+; CHECK-NEXT:    ret
    %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
    %retval =  call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %prod)
-; CHECK-V8a:        sqrdmulh    v1.2s, v1.2s, v2.2s
-; CHECK-V81a:       sqrdmlah    v0.2s, v1.2s, v2.2s
-; CHECK-V81a-apple: sqrdmlah.2s v0,    v1,    v2
    ret <2 x i32> %retval
 }
 
 define <4 x i32> @test_sqrdmlah_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
 ; CHECK-LABEL: test_sqrdmlah_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sqrdmlah v0.4s, v1.4s, v2.4s
+; CHECK-NEXT:    ret
    %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
    %retval =  call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
-; CHECK-V81:        sqrdmulh    v1.4s, v1.4s, v2.4s
-; CHECK-V81a:       sqrdmlah    v0.4s, v1.4s, v2.4s
-; CHECK-V81a-apple: sqrdmlah.4s v0,    v1,    v2
    ret <4 x i32> %retval
 }
 
 define <4 x i16> @test_sqrdmlsh_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
 ; CHECK-LABEL: test_sqrdmlsh_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sqrdmlsh v0.4h, v1.4h, v2.4h
+; CHECK-NEXT:    ret
    %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs,  <4 x i16> %rhs)
    %retval =  call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %prod)
-; CHECK-V8a:        sqrdmulh    v1.4h, v1.4h, v2.4h
-; CHECK-V81a:       sqrdmlsh    v0.4h, v1.4h, v2.4h
-; CHECK-V81a-apple: sqrdmlsh.4h v0,    v1,    v2
    ret <4 x i16> %retval
 }
 
 define <8 x i16> @test_sqrdmlsh_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
 ; CHECK-LABEL: test_sqrdmlsh_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sqrdmlsh v0.8h, v1.8h, v2.8h
+; CHECK-NEXT:    ret
    %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
    %retval =  call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %prod)
-; CHECK-V8a:        sqrdmulh    v1.8h, v1.8h, v2.8h
-; CHECK-V81a:       sqrdmlsh    v0.8h, v1.8h, v2.8h
-; CHECK-V81a-apple: sqrdmlsh.8h v0,    v1,    v2
    ret <8 x i16> %retval
 }
 
 define <2 x i32> @test_sqrdmlsh_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
 ; CHECK-LABEL: test_sqrdmlsh_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sqrdmlsh v0.2s, v1.2s, v2.2s
+; CHECK-NEXT:    ret
    %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
    %retval =  call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %prod)
-; CHECK-V8a:        sqrdmulh    v1.2s, v1.2s, v2.2s
-; CHECK-V81a:       sqrdmlsh    v0.2s, v1.2s, v2.2s
-; CHECK-V81a-apple: sqrdmlsh.2s v0,    v1,    v2
    ret <2 x i32> %retval
 }
 
 define <4 x i32> @test_sqrdmlsh_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
 ; CHECK-LABEL: test_sqrdmlsh_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sqrdmlsh v0.4s, v1.4s, v2.4s
+; CHECK-NEXT:    ret
    %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
    %retval =  call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod)
-; CHECK-V8a:        sqrdmulh    v1.4s, v1.4s, v2.4s
-; CHECK-V81a:       sqrdmlsh    v0.4s, v1.4s, v2.4s
-; CHECK-V81a-apple: sqrdmlsh.4s v0,    v1,    v2
    ret <4 x i32> %retval
 }
 
@@ -116,97 +110,101 @@ define <4 x i32> @test_sqrdmlsh_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32>
 
 define <4 x i16> @test_sqrdmlah_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
 ; CHECK-LABEL: test_sqrdmlah_lane_s16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    sqrdmlah v0.4h, v1.4h, v2.h[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
   %retval =  call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %prod)
-; CHECK-V8a:        sqrdmulh    v1.4h, v1.4h, v2.h[3]
-; CHECK-V81a:       sqrdmlah    v0.4h, v1.4h, v2.h[3]
-; CHECK-V81a-apple: sqrdmlah.4h v0,    v1,    v2[3]
   ret <4 x i16> %retval
 }
 
 define <8 x i16> @test_sqrdmlahq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) {
 ; CHECK-LABEL: test_sqrdmlahq_lane_s16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqrdmlah v0.8h, v1.8h, v2.h[2]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
   %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
   %retval =  call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %prod)
-; CHECK-V8a:        sqrdmulh    v1.8h, v1.8h, v2.h[2]
-; CHECK-V81a:       sqrdmlah    v0.8h, v1.8h, v2.h[2]
-; CHECK-V81a-apple: sqrdmlah.8h v0,    v1,    v2[2]
   ret <8 x i16> %retval
 }
 
 define <2 x i32> @test_sqrdmlah_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
 ; CHECK-LABEL: test_sqrdmlah_lane_s32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    sqrdmlah v0.2s, v1.2s, v2.s[1]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
   %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
   %retval =  call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %prod)
-; CHECK-V8a:        sqrdmulh    v1.2s, v1.2s, v2.s[1]
-; CHECK-V81a:       sqrdmlah    v0.2s, v1.2s, v2.s[1]
-; CHECK-V81a-apple: sqrdmlah.2s v0,    v1,    v2[1]
   ret <2 x i32> %retval
 }
 
 define <4 x i32> @test_sqrdmlahq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <4 x i32> %v) {
 ; CHECK-LABEL: test_sqrdmlahq_lane_s32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqrdmlah v0.4s, v1.4s, v2.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
   %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
   %retval =  call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
-; CHECK-V8a:        sqrdmulh    v1.4s, v1.4s, v2.s[0]
-; CHECK-V81a:       sqrdmlah    v0.4s, v1.4s, v2.s[0]
-; CHECK-V81a-apple: sqrdmlah.4s v0,    v1,    v2[0]
   ret <4 x i32> %retval
 }
 
 define <4 x i16> @test_sqrdmlsh_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
 ; CHECK-LABEL: test_sqrdmlsh_lane_s16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    sqrdmlsh v0.4h, v1.4h, v2.h[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
   %retval =  call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %prod)
-; CHECK-V8a:        sqrdmulh    v1.4h, v1.4h, v2.h[3]
-; CHECK-V81a:       sqrdmlsh    v0.4h, v1.4h, v2.h[3]
-; CHECK-V81a-apple: sqrdmlsh.4h v0,    v1,    v2[3]
   ret <4 x i16> %retval
 }
 
 define <8 x i16> @test_sqrdmlshq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) {
 ; CHECK-LABEL: test_sqrdmlshq_lane_s16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqrdmlsh v0.8h, v1.8h, v2.h[2]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
   %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
   %retval =  call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %prod)
-; CHECK-V8a:        sqrdmulh    v1.8h, v1.8h, v2.h[2]
-; CHECK-V81a:       sqrdmlsh    v0.8h, v1.8h, v2.h[2]
-; CHECK-V81a-apple: sqrdmlsh.8h v0,    v1,    v2[2]
   ret <8 x i16> %retval
 }
 
 define <2 x i32> @test_sqrdmlsh_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
 ; CHECK-LABEL: test_sqrdmlsh_lane_s32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    sqrdmlsh v0.2s, v1.2s, v2.s[1]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
   %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
   %retval =  call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %prod)
-; CHECK-V8a:        sqrdmulh    v1.2s, v1.2s, v2.s[1]
-; CHECK-V81a:       sqrdmlsh    v0.2s, v1.2s, v2.s[1]
-; CHECK-V81a-apple: sqrdmlsh.2s v0,    v1,    v2[1]
   ret <2 x i32> %retval
 }
 
 define <4 x i32> @test_sqrdmlshq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <4 x i32> %v) {
 ; CHECK-LABEL: test_sqrdmlshq_lane_s32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqrdmlsh v0.4s, v1.4s, v2.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
   %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
   %retval =  call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod)
-; CHECK-V8a:        sqrdmulh    v1.4s, v1.4s, v2.s[0]
-; CHECK-V81a:       sqrdmlsh    v0.4s, v1.4s, v2.s[0]
-; CHECK-V81a-apple: sqrdmlsh.4s v0,    v1,    v2[0]
   ret <4 x i32> %retval
 }
 
@@ -217,109 +215,129 @@ entry:
 
 define i16 @test_sqrdmlah_extracted_lane_s16(i16 %acc,<4 x i16> %x, <4 x i16> %v) {
 ; CHECK-LABEL: test_sqrdmlah_extracted_lane_s16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov s2, w0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqrdmlah v2.4h, v0.4h, v1.h[1]
+; CHECK-NEXT:    umov w0, v2.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 1,i32 1,i32 1,i32 1>
   %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
   %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
   %retval_vec =  call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
   %retval = extractelement <4 x i16> %retval_vec, i64 0
-; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4h, v0.4h, v1.h[1]
-; CHECK-V81a:       sqrdmlah    {{v[2-9]+}}.4h, v0.4h, v1.h[1]
-; CHECK-V81a-apple: sqrdmlah.4h {{v[2-9]+}},    v0,    v1[1]
   ret i16 %retval
 }
 
 define i16 @test_sqrdmlahq_extracted_lane_s16(i16 %acc,<8 x i16> %x, <8 x i16> %v) {
 ; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov s2, w0
+; CHECK-NEXT:    sqrdmlah v2.8h, v0.8h, v1.h[1]
+; CHECK-NEXT:    umov w0, v2.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1,i32 1,i32 1,i32 1, i32 1,i32 1,i32 1,i32 1>
   %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
   %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0
   %retval_vec =  call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
   %retval = extractelement <8 x i16> %retval_vec, i64 0
-; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.8h, v0.8h, v1.h[1]
-; CHECK-V81a:       sqrdmlah    {{v[2-9]+}}.8h, v0.8h, v1.h[1]
-; CHECK-V81a-apple: sqrdmlah.8h {{v[2-9]+}},    v0,    v1[1]
   ret i16 %retval
 }
 
 define i32 @test_sqrdmlah_extracted_lane_s32(i32 %acc,<2 x i32> %x, <2 x i32> %v) {
 ; CHECK-LABEL: test_sqrdmlah_extracted_lane_s32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov s2, w0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqrdmlah v2.2s, v0.2s, v1.s[0]
+; CHECK-NEXT:    fmov w0, s2
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
   %extract = extractelement <2 x i32> %prod, i64 0
   %retval =  call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %extract)
-; CHECK-V8a:        sqrdmulh    v0.2s, v0.2s, v1.s[0]
-; CHECK-V81a:       sqrdmlah    v2.2s, v0.2s, v1.s[0]
-; CHECK-V81a-apple: sqrdmlah.2s v2,    v0,    v1[0]
   ret i32 %retval
 }
 
 define i32 @test_sqrdmlahq_extracted_lane_s32(i32 %acc,<4 x i32> %x, <4 x i32> %v) {
 ; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov s2, w0
+; CHECK-NEXT:    sqrdmlah v2.4s, v0.4s, v1.s[0]
+; CHECK-NEXT:    fmov w0, s2
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
   %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
   %extract = extractelement <4 x i32> %prod, i64 0
   %retval =  call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %extract)
-; CHECK-V8a:        sqrdmulh    v0.4s, v0.4s, v1.s[0]
-; CHECK-V81a:       sqrdmlah    v2.4s, v0.4s, v1.s[0]
-; CHECK-V81a-apple: sqrdmlah.4s v2,    v0,    v1[0]
   ret i32 %retval
 }
 
 define i16 @test_sqrdmlsh_extracted_lane_s16(i16 %acc,<4 x i16> %x, <4 x i16> %v) {
 ; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov s2, w0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqrdmlsh v2.4h, v0.4h, v1.h[1]
+; CHECK-NEXT:    umov w0, v2.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 1,i32 1,i32 1,i32 1>
   %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
   %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
   %retval_vec =  call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
   %retval = extractelement <4 x i16> %retval_vec, i64 0
-; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4h, v0.4h, v1.h[1]
-; CHECK-V81a:       sqrdmlsh    {{v[2-9]+}}.4h, v0.4h, v1.h[1]
-; CHECK-V81a-apple: sqrdmlsh.4h {{v[2-9]+}},    v0,    v1[1]
   ret i16 %retval
 }
 
 define i16 @test_sqrdmlshq_extracted_lane_s16(i16 %acc,<8 x i16> %x, <8 x i16> %v) {
 ; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov s2, w0
+; CHECK-NEXT:    sqrdmlsh v2.8h, v0.8h, v1.h[1]
+; CHECK-NEXT:    umov w0, v2.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1,i32 1,i32 1,i32 1, i32 1,i32 1,i32 1,i32 1>
   %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
   %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0
   %retval_vec =  call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
   %retval = extractelement <8 x i16> %retval_vec, i64 0
-; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.8h, v0.8h, v1.h[1]
-; CHECK-V81a:       sqrdmlsh    {{v[2-9]+}}.8h, v0.8h, v1.h[1]
-; CHECK-V81a-apple: sqrdmlsh.8h {{v[2-9]+}},    v0,    v1[1]
   ret i16 %retval
 }
 
 define i32 @test_sqrdmlsh_extracted_lane_s32(i32 %acc,<2 x i32> %x, <2 x i32> %v) {
 ; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov s2, w0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqrdmlsh v2.2s, v0.2s, v1.s[0]
+; CHECK-NEXT:    fmov w0, s2
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
   %extract = extractelement <2 x i32> %prod, i64 0
   %retval =  call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %extract)
-; CHECK-V8a:        sqrdmulh    v0.2s, v0.2s, v1.s[0]
-; CHECK-V81a:       sqrdmlsh    v2.2s, v0.2s, v1.s[0]
-; CHECK-V81a-apple: sqrdmlsh.2s v2,    v0,    v1[0]
   ret i32 %retval
 }
 
 define i32 @test_sqrdmlshq_extracted_lane_s32(i32 %acc,<4 x i32> %x, <4 x i32> %v) {
 ; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov s2, w0
+; CHECK-NEXT:    sqrdmlsh v2.4s, v0.4s, v1.s[0]
+; CHECK-NEXT:    fmov w0, s2
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
   %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
   %extract = extractelement <4 x i32> %prod, i64 0
   %retval =  call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %extract)
-; CHECK-V8a:        sqrdmulh    v0.4s, v0.4s, v1.s[0]
-; CHECK-V81a:       sqrdmlsh    v2.4s, v0.4s, v1.s[0]
-; CHECK-V81a-apple: sqrdmlsh.4s v2,    v0,    v1[0]
   ret i32 %retval
 }
 
@@ -329,77 +347,102 @@ entry:
 
 define i16 @test_sqrdmlah_v1i16(i16 %acc, i16 %x, i16 %y) {
 ; CHECK-LABEL: test_sqrdmlah_v1i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov s0, w1
+; CHECK-NEXT:    fmov s1, w2
+; CHECK-NEXT:    fmov s2, w0
+; CHECK-NEXT:    sqrdmlah v2.4h, v0.4h, v1.4h
+; CHECK-NEXT:    umov w0, v2.h[0]
+; CHECK-NEXT:    ret
   %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
   %y_vec = insertelement <4 x i16> undef, i16 %y, i64 0
   %prod_vec = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec,  <4 x i16> %y_vec)
   %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
   %retval_vec =  call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec,  <4 x i16> %prod_vec)
   %retval = extractelement <4 x i16> %retval_vec, i64 0
-; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
-; CHECK-V81a:       sqrdmlah    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
-; CHECK-V81a-apple: sqrdmlah.4h {{v[0-9]+}},    {{v[0-9]+}},    {{v[0-9]+}}
   ret i16 %retval
 }
 
 define i32 @test_sqrdmlah_v1i32(i32 %acc, i32 %x, i32 %y) {
 ; CHECK-LABEL: test_sqrdmlah_v1i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov s0, w1
+; CHECK-NEXT:    fmov s1, w2
+; CHECK-NEXT:    fmov s2, w0
+; CHECK-NEXT:    sqrdmlah v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    fmov w0, s2
+; CHECK-NEXT:    ret
   %x_vec = insertelement <4 x i32> undef, i32 %x, i64 0
   %y_vec = insertelement <4 x i32> undef, i32 %y, i64 0
   %prod_vec = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x_vec,  <4 x i32> %y_vec)
   %acc_vec = insertelement <4 x i32> undef, i32 %acc, i64 0
   %retval_vec =  call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc_vec,  <4 x i32> %prod_vec)
   %retval = extractelement <4 x i32> %retval_vec, i64 0
-; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-; CHECK-V81a:       sqrdmlah    {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-; CHECK-V81a-apple: sqrdmlah.4s {{v[0-9]+}},    {{v[0-9]+}},    {{v[0-9]+}}
   ret i32 %retval
 }
 
 
 define i16 @test_sqrdmlsh_v1i16(i16 %acc, i16 %x, i16 %y) {
 ; CHECK-LABEL: test_sqrdmlsh_v1i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov s0, w1
+; CHECK-NEXT:    fmov s1, w2
+; CHECK-NEXT:    fmov s2, w0
+; CHECK-NEXT:    sqrdmlsh v2.4h, v0.4h, v1.4h
+; CHECK-NEXT:    umov w0, v2.h[0]
+; CHECK-NEXT:    ret
   %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
   %y_vec = insertelement <4 x i16> undef, i16 %y, i64 0
   %prod_vec = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec,  <4 x i16> %y_vec)
   %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
   %retval_vec =  call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc_vec,  <4 x i16> %prod_vec)
   %retval = extractelement <4 x i16> %retval_vec, i64 0
-; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
-; CHECK-V81a:       sqrdmlsh    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
-; CHECK-V81a-apple: sqrdmlsh.4h {{v[0-9]+}},    {{v[0-9]+}},    {{v[0-9]+}}
   ret i16 %retval
 }
 
 define i32 @test_sqrdmlsh_v1i32(i32 %acc, i32 %x, i32 %y) {
 ; CHECK-LABEL: test_sqrdmlsh_v1i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov s0, w1
+; CHECK-NEXT:    fmov s1, w2
+; CHECK-NEXT:    fmov s2, w0
+; CHECK-NEXT:    sqrdmlsh v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    fmov w0, s2
+; CHECK-NEXT:    ret
   %x_vec = insertelement <4 x i32> undef, i32 %x, i64 0
   %y_vec = insertelement <4 x i32> undef, i32 %y, i64 0
   %prod_vec = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x_vec,  <4 x i32> %y_vec)
   %acc_vec = insertelement <4 x i32> undef, i32 %acc, i64 0
   %retval_vec =  call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc_vec,  <4 x i32> %prod_vec)
   %retval = extractelement <4 x i32> %retval_vec, i64 0
-; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-; CHECK-V81a:       sqrdmlsh    {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-; CHECK-V81a-apple: sqrdmlsh.4s {{v[0-9]+}},    {{v[0-9]+}},    {{v[0-9]+}}
   ret i32 %retval
 }
+
 define i32 @test_sqrdmlah_i32(i32 %acc, i32 %mhs, i32 %rhs) {
 ; CHECK-LABEL: test_sqrdmlah_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov s0, w1
+; CHECK-NEXT:    fmov s1, w0
+; CHECK-NEXT:    fmov s2, w2
+; CHECK-NEXT:    sqrdmlah s1, s0, s2
+; CHECK-NEXT:    fmov w0, s1
+; CHECK-NEXT:    ret
   %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs,  i32 %rhs)
   %retval =  call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc,  i32 %prod)
-; CHECK-V8a:        sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-; CHECK-V81a:       sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-; CHECK-V81a-apple: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
   ret i32 %retval
 }
 
 define i32 @test_sqrdmlsh_i32(i32 %acc, i32 %mhs, i32 %rhs) {
 ; CHECK-LABEL: test_sqrdmlsh_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov s0, w1
+; CHECK-NEXT:    fmov s1, w0
+; CHECK-NEXT:    fmov s2, w2
+; CHECK-NEXT:    sqrdmlsh s1, s0, s2
+; CHECK-NEXT:    fmov w0, s1
+; CHECK-NEXT:    ret
   %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs,  i32 %rhs)
   %retval =  call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc,  i32 %prod)
-; CHECK-V8a:        sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-; CHECK-V81a:       sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-; CHECK-V81a-apple: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
   ret i32 %retval
 }
 
@@ -410,50 +453,63 @@ define i32 @test_sqrdmlsh_i32(i32 %acc, i32 %mhs, i32 %rhs) {
 
 define i16 @test_sqrdmlah_extract_i16(i16 %acc, i16 %x, <4 x i16> %y_vec) {
 ; CHECK-LABEL: test_sqrdmlah_extract_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov s1, w1
+; CHECK-NEXT:    fmov s2, w0
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    sqrdmlah v2.4h, v1.4h, v0.h[1]
+; CHECK-NEXT:    umov w0, v2.h[0]
+; CHECK-NEXT:    ret
   %shuffle = shufflevector <4 x i16> %y_vec, <4 x i16> undef, <4 x i32> <i32 1,i32 1,i32 1,i32 1>
   %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
   %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec, <4 x i16> %shuffle)
   %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
   %retval_vec =  call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
   %retval = extractelement <4 x i16> %retval_vec, i32 0
-; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v0.h[1]
-; CHECK-V81a:       sqrdmlah    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v0.h[1]
-; CHECK-V81a-apple: sqrdmlah.4h {{v[0-9]+}},    {{v[0-9]+}}, v0[1]
   ret i16 %retval
 }
 
 define i32 @test_sqrdmlah_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) {
 ; CHECK-LABEL: test_sqrdmlah_extract_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov s1, w1
+; CHECK-NEXT:    fmov s2, w0
+; CHECK-NEXT:    sqrdmlah s2, s1, v0.s[3]
+; CHECK-NEXT:    fmov w0, s2
+; CHECK-NEXT:    ret
   %extract = extractelement <4 x i32> %rhs, i32 3
   %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs,  i32 %extract)
   %retval =  call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc,  i32 %prod)
-; CHECK-V8a:        sqrdmulh   {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
-; CHECK-V81a:       sqrdmlah   {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
-; CHECK-V81a-apple: sqrdmlah.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3]
   ret i32 %retval
 }
 
 define i16 @test_sqrdmlshq_extract_i16(i16 %acc, i16 %x, <8 x i16> %y_vec) {
 ; CHECK-LABEL: test_sqrdmlshq_extract_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov s1, w1
+; CHECK-NEXT:    fmov s2, w0
+; CHECK-NEXT:    sqrdmlsh v2.8h, v1.8h, v0.h[1]
+; CHECK-NEXT:    umov w0, v2.h[0]
+; CHECK-NEXT:    ret
   %shuffle = shufflevector <8 x i16> %y_vec, <8 x i16> undef, <8 x i32> <i32 1,i32 1,i32 1,i32 1,i32 1,i32 1,i32 1,i32 1>
   %x_vec = insertelement <8 x i16> undef, i16 %x, i64 0
   %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x_vec, <8 x i16> %shuffle)
   %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0
   %retval_vec =  call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
   %retval = extractelement <8 x i16> %retval_vec, i32 0
-; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v0.h[1]
-; CHECK-V81a:       sqrdmlsh    {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v0.h[1]
-; CHECK-V81a-apple: sqrdmlsh.8h {{v[0-9]+}},    {{v[0-9]+}}, v0[1]
   ret i16 %retval
 }
 
 define i32 @test_sqrdmlsh_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) {
 ; CHECK-LABEL: test_sqrdmlsh_extract_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov s1, w1
+; CHECK-NEXT:    fmov s2, w0
+; CHECK-NEXT:    sqrdmlsh s2, s1, v0.s[3]
+; CHECK-NEXT:    fmov w0, s2
+; CHECK-NEXT:    ret
   %extract = extractelement <4 x i32> %rhs, i32 3
   %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs,  i32 %extract)
   %retval =  call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc,  i32 %prod)
-; CHECK-V8a:        sqrdmulh   {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
-; CHECK-V81a:       sqrdmlsh   {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
-; CHECK-V81a-apple: sqrdmlsh.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3]
   ret i32 %retval
 }
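
The scalar and extracted-lane tests above all build the same two-step pattern: a signed saturating rounding doubling multiply returning the high half, followed by a saturating add or subtract, which llc is expected to fuse into a single sqrdmlah/sqrdmlsh. As a rough, self-contained reference model of the 32-bit case (illustrative only, not part of the patch; the ref_* names are invented):

#include <stdint.h>

// SQRDMULH model: high half of 2*a*b with rounding; the only input pair
// that saturates is a == b == INT32_MIN.
static int32_t ref_sqrdmulh_s32(int32_t a, int32_t b) {
  if (a == INT32_MIN && b == INT32_MIN)
    return INT32_MAX;
  int64_t dbl = 2 * (int64_t)a * (int64_t)b + (1LL << 31); // double + round
  return (int32_t)(dbl >> 32);                             // keep high half
}

// SQADD model: signed saturating add.
static int32_t ref_sqadd_s32(int32_t a, int32_t b) {
  int64_t sum = (int64_t)a + (int64_t)b;
  if (sum > INT32_MAX) return INT32_MAX;
  if (sum < INT32_MIN) return INT32_MIN;
  return (int32_t)sum;
}

// The sqrdmulh + sqadd pair the IR above builds, selected as a single
// sqrdmlah when +rdm/+v8.1a is available.
static int32_t ref_sqrdmlah_s32(int32_t acc, int32_t x, int32_t y) {
  return ref_sqadd_s32(acc, ref_sqrdmulh_s32(x, y));
}

The sqrdmlsh variants are identical except that the final step is a saturating subtract (sqsub) instead of the saturating add.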

diff  --git a/llvm/test/CodeGen/ARM/neon-v8.1a.ll b/llvm/test/CodeGen/ARM/neon-v8.1a.ll
index 95d2085800810..ca6c24d905e2c 100644
--- a/llvm/test/CodeGen/ARM/neon-v8.1a.ll
+++ b/llvm/test/CodeGen/ARM/neon-v8.1a.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=armv8 -mattr=+v8.1a | FileCheck %s
 
 ;-----------------------------------------------------------------------------
@@ -18,149 +19,185 @@ declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16>, <8 x i16>)
 declare <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32>, <2 x i32>)
 declare <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32>, <4 x i32>)
 
-define <4 x i16> @test_vqrdmlah_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
-; CHECK-LABEL: test_vqrdmlah_v4i16:
+define arm_aapcs_vfpcc <4 x i16> @test_vqrdmulah_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
+; CHECK-LABEL: test_vqrdmulah_v4i16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vqrdmlah.s16 d0, d1, d2
+; CHECK-NEXT:    bx lr
    %prod = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %mhs,  <4 x i16> %rhs)
    %retval =  call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %acc,  <4 x i16> %prod)
-; CHECK: vqrdmlah.s16 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
    ret <4 x i16> %retval
 }
 
-define <8 x i16> @test_vqrdmlah_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
-; CHECK-LABEL: test_vqrdmlah_v8i16:
+define arm_aapcs_vfpcc <8 x i16> @test_vqrdmulah_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
+; CHECK-LABEL: test_vqrdmulah_v8i16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vqrdmlah.s16 q0, q1, q2
+; CHECK-NEXT:    bx lr
    %prod = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
    %retval =  call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %acc, <8 x i16> %prod)
-; CHECK: vqrdmlah.s16 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
    ret <8 x i16> %retval
 }
 
-define <2 x i32> @test_vqrdmlah_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
-; CHECK-LABEL: test_vqrdmlah_v2i32:
+define arm_aapcs_vfpcc <2 x i32> @test_vqrdmulah_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
+; CHECK-LABEL: test_vqrdmulah_v2i32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vqrdmlah.s32 d0, d1, d2
+; CHECK-NEXT:    bx lr
    %prod = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
    %retval =  call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %acc, <2 x i32> %prod)
-; CHECK: vqrdmlah.s32 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
    ret <2 x i32> %retval
 }
 
-define <4 x i32> @test_vqrdmlah_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
-; CHECK-LABEL: test_vqrdmlah_v4i32:
+define arm_aapcs_vfpcc <4 x i32> @test_vqrdmulah_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
+; CHECK-LABEL: test_vqrdmulah_v4i32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vqrdmlah.s32 q0, q1, q2
+; CHECK-NEXT:    bx lr
    %prod = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
    %retval =  call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %acc, <4 x i32> %prod)
-; CHECK: vqrdmlah.s32 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
    ret <4 x i32> %retval
 }
 
-define <4 x i16> @test_vqrdmlsh_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
-; CHECK-LABEL: test_vqrdmlsh_v4i16:
+define arm_aapcs_vfpcc <4 x i16> @test_vqrdmulsh_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
+; CHECK-LABEL: test_vqrdmulsh_v4i16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vqrdmlsh.s16 d0, d1, d2
+; CHECK-NEXT:    bx lr
    %prod = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %mhs,  <4 x i16> %rhs)
    %retval =  call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %acc, <4 x i16> %prod)
-; CHECK: vqrdmlsh.s16 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
    ret <4 x i16> %retval
 }
 
-define <8 x i16> @test_vqrdmlsh_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
-; CHECK-LABEL: test_vqrdmlsh_v8i16:
+define arm_aapcs_vfpcc <8 x i16> @test_vqrdmulsh_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
+; CHECK-LABEL: test_vqrdmulsh_v8i16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vqrdmlsh.s16 q0, q1, q2
+; CHECK-NEXT:    bx lr
    %prod = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
    %retval =  call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %acc, <8 x i16> %prod)
-; CHECK: vqrdmlsh.s16 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
    ret <8 x i16> %retval
 }
 
-define <2 x i32> @test_vqrdmlsh_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
-; CHECK-LABEL: test_vqrdmlsh_v2i32:
+define arm_aapcs_vfpcc <2 x i32> @test_vqrdmulsh_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
+; CHECK-LABEL: test_vqrdmulsh_v2i32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vqrdmlsh.s32 d0, d1, d2
+; CHECK-NEXT:    bx lr
    %prod = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
    %retval =  call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %acc, <2 x i32> %prod)
-; CHECK: vqrdmlsh.s32 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
    ret <2 x i32> %retval
 }
 
-define <4 x i32> @test_vqrdmlsh_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
-; CHECK-LABEL: test_vqrdmlsh_v4i32:
+define arm_aapcs_vfpcc <4 x i32> @test_vqrdmulsh_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
+; CHECK-LABEL: test_vqrdmulsh_v4i32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vqrdmlsh.s32 q0, q1, q2
+; CHECK-NEXT:    bx lr
    %prod = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
    %retval =  call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %acc, <4 x i32> %prod)
-; CHECK: vqrdmlsh.s32 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
    ret <4 x i32> %retval
 }
 
 ;-----------------------------------------------------------------------------
 ; RDMA Scalar
 
-define <4 x i16> @test_vqrdmlah_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
-; CHECK-LABEL: test_vqrdmlah_lane_s16:
+define arm_aapcs_vfpcc <4 x i16> @test_vqrdmulah_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
+; CHECK-LABEL: test_vqrdmulah_lane_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vqrdmlah.s16 d0, d1, d2[3]
+; CHECK-NEXT:    bx lr
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %prod = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
   %retval =  call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %acc, <4 x i16> %prod)
-; CHECK: vqrdmlah.s16 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}[3]
   ret <4 x i16> %retval
 }
 
-define <8 x i16> @test_vqrdmlahq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <4 x i16> %v) {
-; CHECK-LABEL: test_vqrdmlahq_lane_s16:
+define arm_aapcs_vfpcc <8 x i16> @test_vqrdmulahq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <4 x i16> %v) {
+; CHECK-LABEL: test_vqrdmulahq_lane_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    @ kill: def $d4 killed $d4 def $q2
+; CHECK-NEXT:    vqrdmlah.s16 q0, q1, d4[2]
+; CHECK-NEXT:    bx lr
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
   %prod = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
   %retval =  call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %acc, <8 x i16> %prod)
-; CHECK: vqrdmlah.s16 {{q[0-9]+}}, {{q[0-9]+}}, {{d[0-9]+}}[2]
   ret <8 x i16> %retval
 }
 
-define <2 x i32> @test_vqrdmlah_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
-; CHECK-LABEL: test_vqrdmlah_lane_s32:
+define arm_aapcs_vfpcc <2 x i32> @test_vqrdmulah_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
+; CHECK-LABEL: test_vqrdmulah_lane_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vqrdmlah.s32 d0, d1, d2[1]
+; CHECK-NEXT:    bx lr
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
   %prod = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
   %retval =  call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %acc, <2 x i32> %prod)
-; CHECK: vqrdmlah.s32 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}[1]
   ret <2 x i32> %retval
 }
 
-define <4 x i32> @test_vqrdmlahq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <2 x i32> %v) {
-; CHECK-LABEL: test_vqrdmlahq_lane_s32:
+define arm_aapcs_vfpcc <4 x i32> @test_vqrdmulahq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <2 x i32> %v) {
+; CHECK-LABEL: test_vqrdmulahq_lane_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    @ kill: def $d4 killed $d4 def $q2
+; CHECK-NEXT:    vqrdmlah.s32 q0, q1, d4[0]
+; CHECK-NEXT:    bx lr
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
   %prod = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
   %retval =  call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %acc, <4 x i32> %prod)
-; CHECK: vqrdmlah.s32 {{q[0-9]+}}, {{q[0-9]+}}, {{d[0-9]+}}[0]
   ret <4 x i32> %retval
 }
 
-define <4 x i16> @test_vqrdmlsh_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
-; CHECK-LABEL: test_vqrdmlsh_lane_s16:
+define arm_aapcs_vfpcc <4 x i16> @test_vqrdmulsh_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
+; CHECK-LABEL: test_vqrdmulsh_lane_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vqrdmlsh.s16 d0, d1, d2[3]
+; CHECK-NEXT:    bx lr
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %prod = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
   %retval =  call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %acc, <4 x i16> %prod)
-; CHECK: vqrdmlsh.s16 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}[3]
   ret <4 x i16> %retval
 }
 
-define <8 x i16> @test_vqrdmlshq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <4 x i16> %v) {
-; CHECK-LABEL: test_vqrdmlshq_lane_s16:
+define arm_aapcs_vfpcc <8 x i16> @test_vqrdmulshq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <4 x i16> %v) {
+; CHECK-LABEL: test_vqrdmulshq_lane_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    @ kill: def $d4 killed $d4 def $q2
+; CHECK-NEXT:    vqrdmlsh.s16 q0, q1, d4[2]
+; CHECK-NEXT:    bx lr
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
   %prod = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
   %retval =  call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %acc, <8 x i16> %prod)
-; CHECK: vqrdmlsh.s16 {{q[0-9]+}}, {{q[0-9]+}}, {{d[0-9]+}}[2]
   ret <8 x i16> %retval
 }
 
-define <2 x i32> @test_vqrdmlsh_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
-; CHECK-LABEL: test_vqrdmlsh_lane_s32:
+define arm_aapcs_vfpcc <2 x i32> @test_vqrdmulsh_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
+; CHECK-LABEL: test_vqrdmulsh_lane_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vqrdmlsh.s32 d0, d1, d2[1]
+; CHECK-NEXT:    bx lr
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
   %prod = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
   %retval =  call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %acc, <2 x i32> %prod)
-; CHECK: vqrdmlsh.s32  {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}[1]
   ret <2 x i32> %retval
 }
 
-define <4 x i32> @test_vqrdmlshq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <2 x i32> %v) {
-; CHECK-LABEL: test_vqrdmlshq_lane_s32:
+define arm_aapcs_vfpcc <4 x i32> @test_vqrdmulshq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <2 x i32> %v) {
+; CHECK-LABEL: test_vqrdmulshq_lane_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    @ kill: def $d4 killed $d4 def $q2
+; CHECK-NEXT:    vqrdmlsh.s32 q0, q1, d4[0]
+; CHECK-NEXT:    bx lr
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
   %prod = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
   %retval =  call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %acc, <4 x i32> %prod)
-; CHECK: vqrdmlsh.s32 {{q[0-9]+}}, {{q[0-9]+}}, {{d[0-9]+}}[0]
   ret <4 x i32> %retval
 }
