[clang] 301011f - Rerun ./utils/update_cc_test.py on a bunch of tests

Nicolai Hähnle via cfe-commits cfe-commits at lists.llvm.org
Mon Jul 18 23:53:13 PDT 2022


Author: Nicolai Hähnle
Date: 2022-07-19T08:53:05+02:00
New Revision: 301011fa6078b4f16bd3fc6158d9c6fddad7e118

URL: https://github.com/llvm/llvm-project/commit/301011fa6078b4f16bd3fc6158d9c6fddad7e118
DIFF: https://github.com/llvm/llvm-project/commit/301011fa6078b4f16bd3fc6158d9c6fddad7e118.diff

LOG: Rerun ./utils/update_cc_test.py on a bunch of tests

Due to update script changes, this reduces the size of a later
"real" diff.

Added: 
    

Modified: 
    clang/test/CodeGen/PowerPC/builtins-ppc-quadword-noi128.c
    clang/test/CodeGen/aarch64-neon-2velem.c
    clang/test/CodeGen/aarch64-neon-fp16fml.c
    clang/test/CodeGen/arm_acle.c
    clang/test/CodeGen/memcpy-inline-builtin.c
    clang/test/OpenMP/master_taskloop_simd_in_reduction_codegen.cpp
    clang/test/OpenMP/parallel_master_reduction_task_codegen.cpp
    clang/test/OpenMP/sections_reduction_task_codegen.cpp
    clang/test/OpenMP/target_parallel_reduction_task_codegen.cpp
    clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_task_codegen.cpp
    clang/test/OpenMP/taskloop_simd_in_reduction_codegen.cpp

Removed: 
    


################################################################################
diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-quadword-noi128.c b/clang/test/CodeGen/PowerPC/builtins-ppc-quadword-noi128.c
index 89e4da9878e5..11fb7bc5e7a7 100644
--- a/clang/test/CodeGen/PowerPC/builtins-ppc-quadword-noi128.c
+++ b/clang/test/CodeGen/PowerPC/builtins-ppc-quadword-noi128.c
@@ -78,20 +78,20 @@ vector unsigned char test_sube(vector unsigned char a, vector unsigned char b,
 // CHECK-LE-NEXT:    [[TMP0:%.*]] = bitcast <16 x i8> [[B:%.*]] to <1 x i128>
 // CHECK-LE-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[C:%.*]] to <1 x i128>
 // CHECK-LE-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[A:%.*]] to <1 x i128>
-// CHECK-LE-NEXT:    [[VADDUQM_I_NEG:%.*]] = add <1 x i128> [[TMP0]], [[TMP1]]
-// CHECK-LE-NEXT:    [[VSUBUQM_I:%.*]] = sub <1 x i128> [[TMP2]], [[VADDUQM_I_NEG]]
-// CHECK-LE-NEXT:    [[TMP3:%.*]] = bitcast <1 x i128> [[VSUBUQM_I]] to <16 x i8>
-// CHECK-LE-NEXT:    ret <16 x i8> [[TMP3]]
+// CHECK-LE-NEXT:    [[TMP3:%.*]] = add <1 x i128> [[TMP0]], [[TMP1]]
+// CHECK-LE-NEXT:    [[VSUBUQM_I:%.*]] = sub <1 x i128> [[TMP2]], [[TMP3]]
+// CHECK-LE-NEXT:    [[TMP4:%.*]] = bitcast <1 x i128> [[VSUBUQM_I]] to <16 x i8>
+// CHECK-LE-NEXT:    ret <16 x i8> [[TMP4]]
 //
 // CHECK-AIX-LABEL: @test_sub(
 // CHECK-AIX-NEXT:  entry:
 // CHECK-AIX-NEXT:    [[TMP0:%.*]] = bitcast <16 x i8> [[B:%.*]] to <1 x i128>
 // CHECK-AIX-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[C:%.*]] to <1 x i128>
 // CHECK-AIX-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[A:%.*]] to <1 x i128>
-// CHECK-AIX-NEXT:    [[VADDUQM_I_NEG:%.*]] = add <1 x i128> [[TMP0]], [[TMP1]]
-// CHECK-AIX-NEXT:    [[VSUBUQM_I:%.*]] = sub <1 x i128> [[TMP2]], [[VADDUQM_I_NEG]]
-// CHECK-AIX-NEXT:    [[TMP3:%.*]] = bitcast <1 x i128> [[VSUBUQM_I]] to <16 x i8>
-// CHECK-AIX-NEXT:    ret <16 x i8> [[TMP3]]
+// CHECK-AIX-NEXT:    [[TMP3:%.*]] = add <1 x i128> [[TMP0]], [[TMP1]]
+// CHECK-AIX-NEXT:    [[VSUBUQM_I:%.*]] = sub <1 x i128> [[TMP2]], [[TMP3]]
+// CHECK-AIX-NEXT:    [[TMP4:%.*]] = bitcast <1 x i128> [[VSUBUQM_I]] to <16 x i8>
+// CHECK-AIX-NEXT:    ret <16 x i8> [[TMP4]]
 //
 vector unsigned char test_sub(vector unsigned char a, vector unsigned char b,
                               vector unsigned char c) {

diff --git a/clang/test/CodeGen/aarch64-neon-2velem.c b/clang/test/CodeGen/aarch64-neon-2velem.c
index 2a7275c9b9a2..21a8c4cb6611 100644
--- a/clang/test/CodeGen/aarch64-neon-2velem.c
+++ b/clang/test/CodeGen/aarch64-neon-2velem.c
@@ -653,7 +653,7 @@ float64_t test_vfmsd_laneq_f64(float64_t a, float64_t b, float64x2_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4:[0-9]+]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 //
@@ -668,7 +668,7 @@ int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
 //
@@ -683,7 +683,7 @@ int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 //
@@ -698,7 +698,7 @@ int32x4_t test_vmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
 //
@@ -714,7 +714,7 @@ int64x2_t test_vmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 //
@@ -730,7 +730,7 @@ int32x4_t test_vmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
 //
@@ -746,7 +746,7 @@ int64x2_t test_vmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 //
@@ -762,7 +762,7 @@ int32x4_t test_vmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
 //
@@ -777,7 +777,7 @@ int64x2_t test_vmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 //
@@ -792,7 +792,7 @@ int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
 //
@@ -807,7 +807,7 @@ int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 //
@@ -822,7 +822,7 @@ int32x4_t test_vmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
 //
@@ -838,7 +838,7 @@ int64x2_t test_vmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 //
@@ -854,7 +854,7 @@ int32x4_t test_vmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
 //
@@ -870,7 +870,7 @@ int64x2_t test_vmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 //
@@ -886,7 +886,7 @@ int32x4_t test_vmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
 //
@@ -901,7 +901,7 @@ int64x2_t test_vmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 //
@@ -916,7 +916,7 @@ int32x4_t test_vmlal_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
 //
@@ -931,7 +931,7 @@ int64x2_t test_vmlal_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 //
@@ -946,7 +946,7 @@ int32x4_t test_vmlal_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
 //
@@ -962,7 +962,7 @@ int64x2_t test_vmlal_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 //
@@ -978,7 +978,7 @@ int32x4_t test_vmlal_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
 //
@@ -994,7 +994,7 @@ int64x2_t test_vmlal_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 //
@@ -1010,7 +1010,7 @@ int32x4_t test_vmlal_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
 //
@@ -1025,7 +1025,7 @@ int64x2_t test_vmlal_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 //
@@ -1040,7 +1040,7 @@ int32x4_t test_vmlsl_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
 //
@@ -1055,7 +1055,7 @@ int64x2_t test_vmlsl_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 //
@@ -1070,7 +1070,7 @@ int32x4_t test_vmlsl_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
 //
@@ -1086,7 +1086,7 @@ int64x2_t test_vmlsl_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 //
@@ -1102,7 +1102,7 @@ int32x4_t test_vmlsl_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
 //
@@ -1118,7 +1118,7 @@ int64x2_t test_vmlsl_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 //
@@ -1134,7 +1134,7 @@ int32x4_t test_vmlsl_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
 //
@@ -1149,7 +1149,7 @@ int64x2_t test_vmlsl_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
 //
 int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) {
@@ -1163,7 +1163,7 @@ int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
 //
 int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) {
@@ -1177,7 +1177,7 @@ int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
 //
 uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) {
@@ -1191,7 +1191,7 @@ uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
 //
 uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) {
@@ -1206,7 +1206,7 @@ uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
 //
 int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) {
@@ -1221,7 +1221,7 @@ int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
 //
 int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) {
@@ -1236,7 +1236,7 @@ int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
 //
 uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) {
@@ -1251,7 +1251,7 @@ uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
 //
 uint64x2_t test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) {
@@ -1265,7 +1265,7 @@ uint64x2_t test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
 //
 int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) {
@@ -1279,7 +1279,7 @@ int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
 //
 int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) {
@@ -1293,7 +1293,7 @@ int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
 //
 uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) {
@@ -1307,7 +1307,7 @@ uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
 //
 uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) {
@@ -1322,7 +1322,7 @@ uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
 //
 int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
@@ -1337,7 +1337,7 @@ int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
 //
 int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
@@ -1352,7 +1352,7 @@ int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
 //
 uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) {
@@ -1367,7 +1367,7 @@ uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
 //
 uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v) {
@@ -1382,8 +1382,8 @@ uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v) {
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
-// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I]]
 //
 int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
@@ -1398,8 +1398,8 @@ int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
-// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I]]
 //
 int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
@@ -1415,8 +1415,8 @@ int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
-// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I]]
 //
 int32x4_t test_vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
@@ -1432,8 +1432,8 @@ int32x4_t test_vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
-// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I]]
 //
 int64x2_t test_vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
@@ -1448,8 +1448,8 @@ int64x2_t test_vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
-// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I]]
 //
 int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
@@ -1464,8 +1464,8 @@ int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
-// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I]]
 //
 int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
@@ -1481,8 +1481,8 @@ int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
-// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I]]
 //
 int32x4_t test_vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
@@ -1498,8 +1498,8 @@ int32x4_t test_vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
-// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I]]
 //
 int64x2_t test_vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
@@ -1513,7 +1513,7 @@ int64x2_t test_vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
 // CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I]]
 //
@@ -1528,7 +1528,7 @@ int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
 // CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I]]
 //
@@ -1543,7 +1543,7 @@ int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
 // CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I]]
 //
@@ -1558,7 +1558,7 @@ int32x4_t test_vqdmull_laneq_s16(int16x4_t a, int16x8_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
 // CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I]]
 //
@@ -1574,7 +1574,7 @@ int64x2_t test_vqdmull_laneq_s32(int32x2_t a, int32x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
 // CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I]]
 //
@@ -1590,7 +1590,7 @@ int32x4_t test_vqdmull_high_lane_s16(int16x8_t a, int16x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
 // CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I]]
 //
@@ -1606,7 +1606,7 @@ int64x2_t test_vqdmull_high_lane_s32(int32x4_t a, int32x2_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
 // CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I]]
 //
@@ -1622,7 +1622,7 @@ int32x4_t test_vqdmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
 // CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I]]
 //
@@ -1844,7 +1844,7 @@ float64x2_t test_vmulq_laneq_f64(float64x2_t a, float64x2_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x float> [[VMULX2_I]]
 //
 float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) {
@@ -1858,7 +1858,7 @@ float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
-// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x float> [[VMULX2_I]]
 //
 float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) {
@@ -1872,7 +1872,7 @@ float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
-// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x double> [[VMULX2_I]]
 //
 float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) {
@@ -1886,7 +1886,7 @@ float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> <i32 3, i32 3>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x float> [[VMULX2_I]]
 //
 float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) {
@@ -1900,7 +1900,7 @@ float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
-// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x float> [[VMULX2_I]]
 //
 float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) {
@@ -1914,7 +1914,7 @@ float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> <i32 1, i32 1>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
-// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x double> [[VMULX2_I]]
 //
 float64x2_t test_vmulxq_laneq_f64(float64x2_t a, float64x2_t v) {
@@ -2493,7 +2493,7 @@ float64x2_t test_vfmsq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v)
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 //
@@ -2508,7 +2508,7 @@ int32x4_t test_vmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
 //
@@ -2523,7 +2523,7 @@ int64x2_t test_vmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 //
@@ -2538,7 +2538,7 @@ int32x4_t test_vmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
 //
@@ -2554,7 +2554,7 @@ int64x2_t test_vmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 //
@@ -2570,7 +2570,7 @@ int32x4_t test_vmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
 //
@@ -2586,7 +2586,7 @@ int64x2_t test_vmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 //
@@ -2602,7 +2602,7 @@ int32x4_t test_vmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
 //
@@ -2617,7 +2617,7 @@ int64x2_t test_vmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 //
@@ -2632,7 +2632,7 @@ int32x4_t test_vmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
 //
@@ -2647,7 +2647,7 @@ int64x2_t test_vmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 //
@@ -2662,7 +2662,7 @@ int32x4_t test_vmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
 //
@@ -2678,7 +2678,7 @@ int64x2_t test_vmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 //
@@ -2694,7 +2694,7 @@ int32x4_t test_vmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
 //
@@ -2710,7 +2710,7 @@ int64x2_t test_vmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 //
@@ -2726,7 +2726,7 @@ int32x4_t test_vmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
 //
@@ -2741,7 +2741,7 @@ int64x2_t test_vmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 //
@@ -2756,7 +2756,7 @@ int32x4_t test_vmlal_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
 //
@@ -2771,7 +2771,7 @@ int64x2_t test_vmlal_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 //
@@ -2786,7 +2786,7 @@ int32x4_t test_vmlal_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
 //
@@ -2802,7 +2802,7 @@ int64x2_t test_vmlal_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 //
@@ -2818,7 +2818,7 @@ int32x4_t test_vmlal_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
 //
@@ -2834,7 +2834,7 @@ int64x2_t test_vmlal_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 //
@@ -2850,7 +2850,7 @@ int32x4_t test_vmlal_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
 //
@@ -2865,7 +2865,7 @@ int64x2_t test_vmlal_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 //
@@ -2880,7 +2880,7 @@ int32x4_t test_vmlsl_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
 //
@@ -2895,7 +2895,7 @@ int64x2_t test_vmlsl_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 //
@@ -2910,7 +2910,7 @@ int32x4_t test_vmlsl_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
 //
@@ -2926,7 +2926,7 @@ int64x2_t test_vmlsl_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 //
@@ -2942,7 +2942,7 @@ int32x4_t test_vmlsl_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
 //
@@ -2958,7 +2958,7 @@ int64x2_t test_vmlsl_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 //
@@ -2974,7 +2974,7 @@ int32x4_t test_vmlsl_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
 //
@@ -2989,7 +2989,7 @@ int64x2_t test_vmlsl_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
 //
 int32x4_t test_vmull_lane_s16_0(int16x4_t a, int16x4_t v) {
@@ -3003,7 +3003,7 @@ int32x4_t test_vmull_lane_s16_0(int16x4_t a, int16x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
 //
 int64x2_t test_vmull_lane_s32_0(int32x2_t a, int32x2_t v) {
@@ -3017,7 +3017,7 @@ int64x2_t test_vmull_lane_s32_0(int32x2_t a, int32x2_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
 //
 uint32x4_t test_vmull_lane_u16_0(uint16x4_t a, uint16x4_t v) {
@@ -3031,7 +3031,7 @@ uint32x4_t test_vmull_lane_u16_0(uint16x4_t a, uint16x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
 //
 uint64x2_t test_vmull_lane_u32_0(uint32x2_t a, uint32x2_t v) {
@@ -3046,7 +3046,7 @@ uint64x2_t test_vmull_lane_u32_0(uint32x2_t a, uint32x2_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
 //
 int32x4_t test_vmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
@@ -3061,7 +3061,7 @@ int32x4_t test_vmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
 //
 int64x2_t test_vmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
@@ -3076,7 +3076,7 @@ int64x2_t test_vmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
 //
 uint32x4_t test_vmull_high_lane_u16_0(uint16x8_t a, uint16x4_t v) {
@@ -3091,7 +3091,7 @@ uint32x4_t test_vmull_high_lane_u16_0(uint16x8_t a, uint16x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
 //
 uint64x2_t test_vmull_high_lane_u32_0(uint32x4_t a, uint32x2_t v) {
@@ -3105,7 +3105,7 @@ uint64x2_t test_vmull_high_lane_u32_0(uint32x4_t a, uint32x2_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
 //
 int32x4_t test_vmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
@@ -3119,7 +3119,7 @@ int32x4_t test_vmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
 //
 int64x2_t test_vmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
@@ -3133,7 +3133,7 @@ int64x2_t test_vmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
 //
 uint32x4_t test_vmull_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
@@ -3147,7 +3147,7 @@ uint32x4_t test_vmull_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
 //
 uint64x2_t test_vmull_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
@@ -3162,7 +3162,7 @@ uint64x2_t test_vmull_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
 //
 int32x4_t test_vmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
@@ -3177,7 +3177,7 @@ int32x4_t test_vmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
 //
 int64x2_t test_vmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
@@ -3192,7 +3192,7 @@ int64x2_t test_vmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
 //
 uint32x4_t test_vmull_high_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
@@ -3207,7 +3207,7 @@ uint32x4_t test_vmull_high_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
 //
 uint64x2_t test_vmull_high_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
@@ -3222,8 +3222,8 @@ uint64x2_t test_vmull_high_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
-// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I]]
 //
 int32x4_t test_vqdmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
@@ -3238,8 +3238,8 @@ int32x4_t test_vqdmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
-// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I]]
 //
 int64x2_t test_vqdmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
@@ -3255,8 +3255,8 @@ int64x2_t test_vqdmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
-// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I]]
 //
 int32x4_t test_vqdmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
@@ -3272,8 +3272,8 @@ int32x4_t test_vqdmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
-// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I]]
 //
 int64x2_t test_vqdmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
@@ -3288,8 +3288,8 @@ int64x2_t test_vqdmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
-// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I]]
 //
 int32x4_t test_vqdmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
@@ -3304,8 +3304,8 @@ int32x4_t test_vqdmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
-// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I]]
 //
 int64x2_t test_vqdmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
@@ -3321,8 +3321,8 @@ int64x2_t test_vqdmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
-// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I]]
 //
 int32x4_t test_vqdmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
@@ -3338,8 +3338,8 @@ int32x4_t test_vqdmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
-// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I]]
 //
 int64x2_t test_vqdmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
@@ -3353,7 +3353,7 @@ int64x2_t test_vqdmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
 // CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I]]
 //
@@ -3368,7 +3368,7 @@ int32x4_t test_vqdmull_lane_s16_0(int16x4_t a, int16x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
 // CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I]]
 //
@@ -3383,7 +3383,7 @@ int64x2_t test_vqdmull_lane_s32_0(int32x2_t a, int32x2_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
 // CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I]]
 //
@@ -3398,7 +3398,7 @@ int32x4_t test_vqdmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
 // CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I]]
 //
@@ -3414,7 +3414,7 @@ int64x2_t test_vqdmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
 // CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I]]
 //
@@ -3430,7 +3430,7 @@ int32x4_t test_vqdmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
 // CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I]]
 //
@@ -3446,7 +3446,7 @@ int64x2_t test_vqdmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
+// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
 // CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I]]
 //
@@ -3462,7 +3462,7 @@ int32x4_t test_vqdmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
+// CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
 // CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I]]
 //
@@ -3656,7 +3656,7 @@ float64x2_t test_vmulq_laneq_f64_0(float64x2_t a, float64x2_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x float> [[VMULX2_I]]
 //
 float32x2_t test_vmulx_lane_f32_0(float32x2_t a, float32x2_t v) {
@@ -3670,7 +3670,7 @@ float32x2_t test_vmulx_lane_f32_0(float32x2_t a, float32x2_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
-// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x float> [[VMULX2_I]]
 //
 float32x4_t test_vmulxq_lane_f32_0(float32x4_t a, float32x2_t v) {
@@ -3684,7 +3684,7 @@ float32x4_t test_vmulxq_lane_f32_0(float32x4_t a, float32x2_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
-// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x double> [[VMULX2_I]]
 //
 float64x2_t test_vmulxq_lane_f64_0(float64x2_t a, float64x1_t v) {
@@ -3698,7 +3698,7 @@ float64x2_t test_vmulxq_lane_f64_0(float64x2_t a, float64x1_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x float> [[VMULX2_I]]
 //
 float32x2_t test_vmulx_laneq_f32_0(float32x2_t a, float32x4_t v) {
@@ -3712,7 +3712,7 @@ float32x2_t test_vmulx_laneq_f32_0(float32x2_t a, float32x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
-// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x float> [[VMULX2_I]]
 //
 float32x4_t test_vmulxq_laneq_f32_0(float32x4_t a, float32x4_t v) {
@@ -3726,7 +3726,7 @@ float32x4_t test_vmulxq_laneq_f32_0(float32x4_t a, float32x4_t v) {
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
-// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #4
+// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x double> [[VMULX2_I]]
 //
 float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) {
@@ -3736,14 +3736,14 @@ float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) {
 // CHECK-LABEL: @test_vmull_high_n_s16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 [[B]], i32 1
-// CHECK-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 [[B]], i32 2
-// CHECK-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 [[B]], i32 3
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
+// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #4
-// CHECK-NEXT:    ret <4 x i32> [[VMULL2_I_I_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x i32> [[VMULL2_I_I]]
 //
 int32x4_t test_vmull_high_n_s16(int16x8_t a, int16_t b) {
   return vmull_high_n_s16(a, b);
@@ -3752,12 +3752,12 @@ int32x4_t test_vmull_high_n_s16(int16x8_t a, int16_t b) {
 // CHECK-LABEL: @test_vmull_high_n_s32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 [[B]], i32 1
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #4
-// CHECK-NEXT:    ret <2 x i64> [[VMULL2_I_I_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <2 x i64> [[VMULL2_I_I]]
 //
 int64x2_t test_vmull_high_n_s32(int32x4_t a, int32_t b) {
   return vmull_high_n_s32(a, b);
@@ -3766,14 +3766,14 @@ int64x2_t test_vmull_high_n_s32(int32x4_t a, int32_t b) {
 // CHECK-LABEL: @test_vmull_high_n_u16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 [[B]], i32 1
-// CHECK-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 [[B]], i32 2
-// CHECK-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 [[B]], i32 3
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
+// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #4
-// CHECK-NEXT:    ret <4 x i32> [[VMULL2_I_I_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x i32> [[VMULL2_I_I]]
 //
 uint32x4_t test_vmull_high_n_u16(uint16x8_t a, uint16_t b) {
   return vmull_high_n_u16(a, b);
@@ -3782,12 +3782,12 @@ uint32x4_t test_vmull_high_n_u16(uint16x8_t a, uint16_t b) {
 // CHECK-LABEL: @test_vmull_high_n_u32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 [[B]], i32 1
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #4
-// CHECK-NEXT:    ret <2 x i64> [[VMULL2_I_I_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <2 x i64> [[VMULL2_I_I]]
 //
 uint64x2_t test_vmull_high_n_u32(uint32x4_t a, uint32_t b) {
   return vmull_high_n_u32(a, b);
@@ -3796,15 +3796,15 @@ uint64x2_t test_vmull_high_n_u32(uint32x4_t a, uint32_t b) {
 // CHECK-LABEL: @test_vqdmull_high_n_s16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 [[B]], i32 1
-// CHECK-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 [[B]], i32 2
-// CHECK-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 [[B]], i32 3
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
+// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULL_V2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #4
-// CHECK-NEXT:    [[VQDMULL_V3_I_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I_I_I]] to <16 x i8>
-// CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I_I_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMULL_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I_I]] to <16 x i8>
+// CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I_I]]
 //
 int32x4_t test_vqdmull_high_n_s16(int16x8_t a, int16_t b) {
   return vqdmull_high_n_s16(a, b);
@@ -3813,13 +3813,13 @@ int32x4_t test_vqdmull_high_n_s16(int16x8_t a, int16_t b) {
 // CHECK-LABEL: @test_vqdmull_high_n_s32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 [[B]], i32 1
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULL_V2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #4
-// CHECK-NEXT:    [[VQDMULL_V3_I_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I_I_I]] to <16 x i8>
-// CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I_I_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMULL_V3_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I_I]] to <16 x i8>
+// CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I_I]]
 //
 int64x2_t test_vqdmull_high_n_s32(int32x4_t a, int32_t b) {
   return vqdmull_high_n_s32(a, b);
@@ -3828,15 +3828,15 @@ int64x2_t test_vqdmull_high_n_s32(int32x4_t a, int32_t b) {
 // CHECK-LABEL: @test_vmlal_high_n_s16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 [[C]], i32 1
-// CHECK-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 [[C]], i32 2
-// CHECK-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 [[C]], i32 3
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
+// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #4
-// CHECK-NEXT:    [[ADD_I_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I_I]]
-// CHECK-NEXT:    ret <4 x i32> [[ADD_I_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]]
+// CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
+// CHECK-NEXT:    ret <4 x i32> [[ADD_I]]
 //
 int32x4_t test_vmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
   return vmlal_high_n_s16(a, b, c);
@@ -3845,13 +3845,13 @@ int32x4_t test_vmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
 // CHECK-LABEL: @test_vmlal_high_n_s32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 [[C]], i32 1
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #4
-// CHECK-NEXT:    [[ADD_I_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I_I]]
-// CHECK-NEXT:    ret <2 x i64> [[ADD_I_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]]
+// CHECK-NEXT:    [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
+// CHECK-NEXT:    ret <2 x i64> [[ADD_I]]
 //
 int64x2_t test_vmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
   return vmlal_high_n_s32(a, b, c);
@@ -3860,15 +3860,15 @@ int64x2_t test_vmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
 // CHECK-LABEL: @test_vmlal_high_n_u16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 [[C]], i32 1
-// CHECK-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 [[C]], i32 2
-// CHECK-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 [[C]], i32 3
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
+// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #4
-// CHECK-NEXT:    [[ADD_I_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I_I]]
-// CHECK-NEXT:    ret <4 x i32> [[ADD_I_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]]
+// CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
+// CHECK-NEXT:    ret <4 x i32> [[ADD_I]]
 //
 uint32x4_t test_vmlal_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
   return vmlal_high_n_u16(a, b, c);
@@ -3877,13 +3877,13 @@ uint32x4_t test_vmlal_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
 // CHECK-LABEL: @test_vmlal_high_n_u32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 [[C]], i32 1
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #4
-// CHECK-NEXT:    [[ADD_I_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I_I]]
-// CHECK-NEXT:    ret <2 x i64> [[ADD_I_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]]
+// CHECK-NEXT:    [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
+// CHECK-NEXT:    ret <2 x i64> [[ADD_I]]
 //
 uint64x2_t test_vmlal_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
   return vmlal_high_n_u32(a, b, c);
@@ -3892,16 +3892,16 @@ uint64x2_t test_vmlal_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
 // CHECK-LABEL: @test_vqdmlal_high_n_s16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 [[C]], i32 1
-// CHECK-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 [[C]], i32 2
-// CHECK-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 [[C]], i32 3
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
+// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #4
-// CHECK-NEXT:    [[VQDMLAL_V3_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I_I]]) #4
-// CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I_I_I]]
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I_I]]
 //
 int32x4_t test_vqdmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
   return vqdmlal_high_n_s16(a, b, c);
@@ -3910,14 +3910,14 @@ int32x4_t test_vqdmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
 // CHECK-LABEL: @test_vqdmlal_high_n_s32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 [[C]], i32 1
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #4
-// CHECK-NEXT:    [[VQDMLAL_V3_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I_I]]) #4
-// CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I_I_I]]
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I_I]]
 //
 int64x2_t test_vqdmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
   return vqdmlal_high_n_s32(a, b, c);
@@ -3926,15 +3926,15 @@ int64x2_t test_vqdmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
 // CHECK-LABEL: @test_vmlsl_high_n_s16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 [[C]], i32 1
-// CHECK-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 [[C]], i32 2
-// CHECK-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 [[C]], i32 3
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
+// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #4
-// CHECK-NEXT:    [[SUB_I_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I_I]]
-// CHECK-NEXT:    ret <4 x i32> [[SUB_I_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]]
+// CHECK-NEXT:    [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
+// CHECK-NEXT:    ret <4 x i32> [[SUB_I]]
 //
 int32x4_t test_vmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
   return vmlsl_high_n_s16(a, b, c);
@@ -3943,13 +3943,13 @@ int32x4_t test_vmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
 // CHECK-LABEL: @test_vmlsl_high_n_s32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 [[C]], i32 1
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #4
-// CHECK-NEXT:    [[SUB_I_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I_I]]
-// CHECK-NEXT:    ret <2 x i64> [[SUB_I_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]]
+// CHECK-NEXT:    [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
+// CHECK-NEXT:    ret <2 x i64> [[SUB_I]]
 //
 int64x2_t test_vmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
   return vmlsl_high_n_s32(a, b, c);
@@ -3958,15 +3958,15 @@ int64x2_t test_vmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
 // CHECK-LABEL: @test_vmlsl_high_n_u16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 [[C]], i32 1
-// CHECK-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 [[C]], i32 2
-// CHECK-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 [[C]], i32 3
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
+// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #4
-// CHECK-NEXT:    [[SUB_I_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I_I]]
-// CHECK-NEXT:    ret <4 x i32> [[SUB_I_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]]
+// CHECK-NEXT:    [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
+// CHECK-NEXT:    ret <4 x i32> [[SUB_I]]
 //
 uint32x4_t test_vmlsl_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
   return vmlsl_high_n_u16(a, b, c);
@@ -3975,13 +3975,13 @@ uint32x4_t test_vmlsl_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
 // CHECK-LABEL: @test_vmlsl_high_n_u32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 [[C]], i32 1
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #4
-// CHECK-NEXT:    [[SUB_I_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I_I]]
-// CHECK-NEXT:    ret <2 x i64> [[SUB_I_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]]
+// CHECK-NEXT:    [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
+// CHECK-NEXT:    ret <2 x i64> [[SUB_I]]
 //
 uint64x2_t test_vmlsl_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
   return vmlsl_high_n_u32(a, b, c);
@@ -3990,16 +3990,16 @@ uint64x2_t test_vmlsl_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
 // CHECK-LABEL: @test_vqdmlsl_high_n_s16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 [[C]], i32 1
-// CHECK-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 [[C]], i32 2
-// CHECK-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 [[C]], i32 3
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
+// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #4
-// CHECK-NEXT:    [[VQDMLSL_V3_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I_I]]) #4
-// CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I_I_I]]
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I_I]]
 //
 int32x4_t test_vqdmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
   return vqdmlsl_high_n_s16(a, b, c);
@@ -4008,14 +4008,14 @@ int32x4_t test_vqdmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
 // CHECK-LABEL: @test_vqdmlsl_high_n_s32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 [[C]], i32 1
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #4
-// CHECK-NEXT:    [[VQDMLSL_V3_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I_I]]) #4
-// CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I_I_I]]
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I_I]]
 //
 int64x2_t test_vqdmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
   return vqdmlsl_high_n_s32(a, b, c);
@@ -4063,7 +4063,7 @@ float64x2_t test_vmulq_n_f64(float64x2_t a, float64_t b) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[B]], <2 x float> [[VECINIT1_I]], <2 x float> [[A]]) #4
+// CHECK-NEXT:    [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[B]], <2 x float> [[VECINIT1_I]], <2 x float> [[A]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x float> [[TMP3]]
 //
 float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
@@ -4076,7 +4076,7 @@ float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x double> [[VECINIT_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[B]], <1 x double> [[VECINIT_I]], <1 x double> [[A]]) #4
+// CHECK-NEXT:    [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[B]], <1 x double> [[VECINIT_I]], <1 x double> [[A]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <1 x double> [[TMP3]]
 //
 float64x1_t test_vfma_n_f64(float64x1_t a, float64x1_t b, float64_t n) {
@@ -4092,7 +4092,7 @@ float64x1_t test_vfma_n_f64(float64x1_t a, float64x1_t b, float64_t n) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
-// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B]], <4 x float> [[VECINIT3_I]], <4 x float> [[A]]) #4
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B]], <4 x float> [[VECINIT3_I]], <4 x float> [[A]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x float> [[TMP3]]
 //
 float32x4_t test_vfmaq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
@@ -4107,7 +4107,7 @@ float32x4_t test_vfmaq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[FNEG_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FNEG_I]], <2 x float> [[VECINIT1_I]], <2 x float> [[A]]) #4
+// CHECK-NEXT:    [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FNEG_I]], <2 x float> [[VECINIT1_I]], <2 x float> [[A]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x float> [[TMP3]]
 //
 float32x2_t test_vfms_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
@@ -4121,7 +4121,7 @@ float32x2_t test_vfms_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[FNEG_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x double> [[VECINIT_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FNEG_I]], <1 x double> [[VECINIT_I]], <1 x double> [[A]]) #4
+// CHECK-NEXT:    [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FNEG_I]], <1 x double> [[VECINIT_I]], <1 x double> [[A]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <1 x double> [[TMP3]]
 //
 float64x1_t test_vfms_n_f64(float64x1_t a, float64x1_t b, float64_t n) {
@@ -4138,7 +4138,7 @@ float64x1_t test_vfms_n_f64(float64x1_t a, float64x1_t b, float64_t n) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[FNEG_I]] to <16 x i8>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
-// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FNEG_I]], <4 x float> [[VECINIT3_I]], <4 x float> [[A]]) #4
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FNEG_I]], <4 x float> [[VECINIT3_I]], <4 x float> [[A]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x float> [[TMP3]]
 //
 float32x4_t test_vfmsq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
@@ -4261,7 +4261,7 @@ uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) {
 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]]) #4
+// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I_I]]
 //
 int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) {
@@ -4274,7 +4274,7 @@ int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) {
 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]]) #4
+// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I_I]]
 //
 int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) {
@@ -4289,7 +4289,7 @@ int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) {
 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]]) #4
+// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I_I]]
 //
 uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) {
@@ -4302,7 +4302,7 @@ uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) {
 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]]) #4
+// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I_I]]
 //
 uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) {
@@ -4317,7 +4317,7 @@ uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) {
 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]]) #4
+// CHECK-NEXT:    [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]]
 // CHECK-NEXT:    [[VQDMULL_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I_I]] to <16 x i8>
 // CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I_I]]
 //
@@ -4331,7 +4331,7 @@ int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) {
 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]]) #4
+// CHECK-NEXT:    [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]]
 // CHECK-NEXT:    [[VQDMULL_V3_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I_I]] to <16 x i8>
 // CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I_I]]
 //
@@ -4347,9 +4347,9 @@ int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) {
 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULH_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]]) #4
-// CHECK-NEXT:    [[VQDMULH_V3_I_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I_I]] to <8 x i8>
-// CHECK-NEXT:    ret <4 x i16> [[VQDMULH_V2_I_I]]
+// CHECK-NEXT:    [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK-NEXT:    ret <4 x i16> [[VQDMULH_V2_I]]
 //
 int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) {
   return vqdmulh_n_s16(a, b);
@@ -4367,9 +4367,9 @@ int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) {
 // CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
-// CHECK-NEXT:    [[VQDMULHQ_V2_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[VECINIT7_I]]) #4
-// CHECK-NEXT:    [[VQDMULHQ_V3_I_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I_I]] to <16 x i8>
-// CHECK-NEXT:    ret <8 x i16> [[VQDMULHQ_V2_I_I]]
+// CHECK-NEXT:    [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[VECINIT7_I]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK-NEXT:    ret <8 x i16> [[VQDMULHQ_V2_I]]
 //
 int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) {
   return vqdmulhq_n_s16(a, b);
@@ -4381,9 +4381,9 @@ int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) {
 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULH_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]]) #4
-// CHECK-NEXT:    [[VQDMULH_V3_I_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I_I]] to <8 x i8>
-// CHECK-NEXT:    ret <2 x i32> [[VQDMULH_V2_I_I]]
+// CHECK-NEXT:    [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK-NEXT:    ret <2 x i32> [[VQDMULH_V2_I]]
 //
 int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
   return vqdmulh_n_s32(a, b);
@@ -4397,9 +4397,9 @@ int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
-// CHECK-NEXT:    [[VQDMULHQ_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[VECINIT3_I]]) #4
-// CHECK-NEXT:    [[VQDMULHQ_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I_I]] to <16 x i8>
-// CHECK-NEXT:    ret <4 x i32> [[VQDMULHQ_V2_I_I]]
+// CHECK-NEXT:    [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[VECINIT3_I]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK-NEXT:    ret <4 x i32> [[VQDMULHQ_V2_I]]
 //
 int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) {
   return vqdmulhq_n_s32(a, b);
@@ -4413,9 +4413,9 @@ int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) {
 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT:    [[VQRDMULH_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]]) #4
-// CHECK-NEXT:    [[VQRDMULH_V3_I_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I_I]] to <8 x i8>
-// CHECK-NEXT:    ret <4 x i16> [[VQRDMULH_V2_I_I]]
+// CHECK-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK-NEXT:    ret <4 x i16> [[VQRDMULH_V2_I]]
 //
 int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) {
   return vqrdmulh_n_s16(a, b);
@@ -4433,9 +4433,9 @@ int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) {
 // CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
-// CHECK-NEXT:    [[VQRDMULHQ_V2_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[VECINIT7_I]]) #4
-// CHECK-NEXT:    [[VQRDMULHQ_V3_I_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I_I]] to <16 x i8>
-// CHECK-NEXT:    ret <8 x i16> [[VQRDMULHQ_V2_I_I]]
+// CHECK-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[VECINIT7_I]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK-NEXT:    ret <8 x i16> [[VQRDMULHQ_V2_I]]
 //
 int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) {
   return vqrdmulhq_n_s16(a, b);
@@ -4447,9 +4447,9 @@ int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) {
 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT:    [[VQRDMULH_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]]) #4
-// CHECK-NEXT:    [[VQRDMULH_V3_I_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I_I]] to <8 x i8>
-// CHECK-NEXT:    ret <2 x i32> [[VQRDMULH_V2_I_I]]
+// CHECK-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK-NEXT:    ret <2 x i32> [[VQRDMULH_V2_I]]
 //
 int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) {
   return vqrdmulh_n_s32(a, b);
@@ -4463,9 +4463,9 @@ int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) {
 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
-// CHECK-NEXT:    [[VQRDMULHQ_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[VECINIT3_I]]) #4
-// CHECK-NEXT:    [[VQRDMULHQ_V3_I_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I_I]] to <16 x i8>
-// CHECK-NEXT:    ret <4 x i32> [[VQRDMULHQ_V2_I_I]]
+// CHECK-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[VECINIT3_I]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK-NEXT:    ret <4 x i32> [[VQRDMULHQ_V2_I]]
 //
 int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) {
   return vqrdmulhq_n_s32(a, b);
@@ -4595,7 +4595,7 @@ uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #4
+// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]]
 // CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD_I]]
 //
@@ -4609,7 +4609,7 @@ int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #4
+// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]]
 // CHECK-NEXT:    [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
 // CHECK-NEXT:    ret <2 x i64> [[ADD_I]]
 //
@@ -4625,7 +4625,7 @@ int64x2_t test_vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #4
+// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]]
 // CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD_I]]
 //
@@ -4639,7 +4639,7 @@ uint32x4_t test_vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #4
+// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]]
 // CHECK-NEXT:    [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
 // CHECK-NEXT:    ret <2 x i64> [[ADD_I]]
 //
@@ -4656,8 +4656,8 @@ uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #4
-// CHECK-NEXT:    [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]]) #4
+// CHECK-NEXT:    [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I_I]]
 //
 int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
@@ -4671,8 +4671,8 @@ int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #4
-// CHECK-NEXT:    [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]]) #4
+// CHECK-NEXT:    [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I_I]]
 //
 int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
@@ -4803,7 +4803,7 @@ uint32x4_t test_vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #4
+// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]]
 // CHECK-NEXT:    [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB_I]]
 //
@@ -4817,7 +4817,7 @@ int32x4_t test_vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #4
+// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]]
 // CHECK-NEXT:    [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
 // CHECK-NEXT:    ret <2 x i64> [[SUB_I]]
 //
@@ -4833,7 +4833,7 @@ int64x2_t test_vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #4
+// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]]
 // CHECK-NEXT:    [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB_I]]
 //
@@ -4847,7 +4847,7 @@ uint32x4_t test_vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #4
+// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]]
 // CHECK-NEXT:    [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
 // CHECK-NEXT:    ret <2 x i64> [[SUB_I]]
 //
@@ -4864,8 +4864,8 @@ uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #4
-// CHECK-NEXT:    [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]]) #4
+// CHECK-NEXT:    [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I_I]]
 //
 int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
@@ -4879,8 +4879,8 @@ int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #4
-// CHECK-NEXT:    [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]]) #4
+// CHECK-NEXT:    [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I_I]]
 //
 int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
@@ -4999,8 +4999,8 @@ uint32x4_t test_vmlaq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
-// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I]]
 //
 int32x4_t test_vqdmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
@@ -5015,8 +5015,8 @@ int32x4_t test_vqdmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
-// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I]]
 //
 int64x2_t test_vqdmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
@@ -5032,8 +5032,8 @@ int64x2_t test_vqdmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
-// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I]]
 //
 int32x4_t test_vqdmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
@@ -5049,8 +5049,8 @@ int32x4_t test_vqdmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
-// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I]]
 //
 int64x2_t test_vqdmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
@@ -5169,8 +5169,8 @@ uint32x4_t test_vmlsq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
-// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I]]
 //
 int32x4_t test_vqdmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
@@ -5185,8 +5185,8 @@ int32x4_t test_vqdmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
-// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I]]
 //
 int64x2_t test_vqdmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
@@ -5202,8 +5202,8 @@ int64x2_t test_vqdmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
-// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I]]
 //
 int32x4_t test_vqdmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
@@ -5219,8 +5219,8 @@ int32x4_t test_vqdmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
-// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I]]
 //
 int64x2_t test_vqdmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
@@ -5443,8 +5443,8 @@ uint32x4_t test_vmlaq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
-// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I]]
 //
 int32x4_t test_vqdmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
@@ -5459,8 +5459,8 @@ int32x4_t test_vqdmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
-// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I]]
 //
 int64x2_t test_vqdmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
@@ -5476,8 +5476,8 @@ int64x2_t test_vqdmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
-// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I]]
 //
 int32x4_t test_vqdmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
@@ -5493,8 +5493,8 @@ int32x4_t test_vqdmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
-// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I]]
 //
 int64x2_t test_vqdmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
@@ -5613,8 +5613,8 @@ uint32x4_t test_vmlsq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
-// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I]]
 //
 int32x4_t test_vqdmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
@@ -5629,8 +5629,8 @@ int32x4_t test_vqdmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
-// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I]]
 //
 int64x2_t test_vqdmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
@@ -5646,8 +5646,8 @@ int64x2_t test_vqdmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
-// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I]]
 //
 int32x4_t test_vqdmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
@@ -5663,8 +5663,8 @@ int32x4_t test_vqdmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
-// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]]
 // CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I]]
 //
 int64x2_t test_vqdmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {

diff --git a/clang/test/CodeGen/aarch64-neon-fp16fml.c b/clang/test/CodeGen/aarch64-neon-fp16fml.c
index 9645caadad16..50bb629f7d47 100644
--- a/clang/test/CodeGen/aarch64-neon-fp16fml.c
+++ b/clang/test/CodeGen/aarch64-neon-fp16fml.c
@@ -15,7 +15,7 @@
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]]) #3
+// CHECK-NEXT:    [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]]) #[[ATTR3:[0-9]+]]
 // CHECK-NEXT:    ret <2 x float> [[VFMLAL_LOW3_I]]
 //
 float32x2_t test_vfmlal_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
@@ -27,7 +27,7 @@ float32x2_t test_vfmlal_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]]) #3
+// CHECK-NEXT:    [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]]) #[[ATTR3]]
 // CHECK-NEXT:    ret <2 x float> [[VFMLSL_LOW3_I]]
 //
 float32x2_t test_vfmlsl_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
@@ -39,7 +39,7 @@ float32x2_t test_vfmlsl_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]]) #3
+// CHECK-NEXT:    [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]]) #[[ATTR3]]
 // CHECK-NEXT:    ret <2 x float> [[VFMLAL_HIGH3_I]]
 //
 float32x2_t test_vfmlal_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
@@ -51,7 +51,7 @@ float32x2_t test_vfmlal_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]]) #3
+// CHECK-NEXT:    [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]]) #[[ATTR3]]
 // CHECK-NEXT:    ret <2 x float> [[VFMLSL_HIGH3_I]]
 //
 float32x2_t test_vfmlsl_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
@@ -63,7 +63,7 @@ float32x2_t test_vfmlsl_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]]) #3
+// CHECK-NEXT:    [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]]) #[[ATTR3]]
 // CHECK-NEXT:    ret <4 x float> [[VFMLAL_LOW3_I]]
 //
 float32x4_t test_vfmlalq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
@@ -75,7 +75,7 @@ float32x4_t test_vfmlalq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]]) #3
+// CHECK-NEXT:    [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]]) #[[ATTR3]]
 // CHECK-NEXT:    ret <4 x float> [[VFMLSL_LOW3_I]]
 //
 float32x4_t test_vfmlslq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
@@ -87,7 +87,7 @@ float32x4_t test_vfmlslq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]]) #3
+// CHECK-NEXT:    [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]]) #[[ATTR3]]
 // CHECK-NEXT:    ret <4 x float> [[VFMLAL_HIGH3_I]]
 //
 float32x4_t test_vfmlalq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
@@ -99,7 +99,7 @@ float32x4_t test_vfmlalq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]]) #3
+// CHECK-NEXT:    [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]]) #[[ATTR3]]
 // CHECK-NEXT:    ret <4 x float> [[VFMLSL_HIGH3_I]]
 //
 float32x4_t test_vfmlslq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
@@ -110,50 +110,50 @@ float32x4_t test_vfmlslq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
 
 // CHECK-LABEL: @test_vfmlal_lane_low_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__REINT_716:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_716:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_7164:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_7165:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71614:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_71615:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71624:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_71625:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    store <4 x half> [[C:%.*]], <4 x half>* [[__REINT_716]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_716]] to <4 x i16>*
+// CHECK-NEXT:    [[__REINT_851:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_851:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_8514:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_8515:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85114:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_85115:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85124:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_85125:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    store <4 x half> [[C:%.*]], <4 x half>* [[__REINT_851]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_851]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8
 // CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 0
-// CHECK-NEXT:    store i16 [[VGET_LANE]], i16* [[__REINT1_716]], align 2
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[__REINT1_716]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE]], i16* [[__REINT1_851]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[__REINT1_851]] to half*
 // CHECK-NEXT:    [[TMP3:%.*]] = load half, half* [[TMP2]], align 2
 // CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP3]], i32 0
-// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_7164]], align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x half>* [[__REINT_7164]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_8514]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x half>* [[__REINT_8514]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[TMP4]], align 8
 // CHECK-NEXT:    [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 0
-// CHECK-NEXT:    store i16 [[VGET_LANE8]], i16* [[__REINT1_7165]], align 2
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[__REINT1_7165]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE8]], i16* [[__REINT1_8515]], align 2
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[__REINT1_8515]] to half*
 // CHECK-NEXT:    [[TMP7:%.*]] = load half, half* [[TMP6]], align 2
 // CHECK-NEXT:    [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP7]], i32 1
-// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_71614]], align 8
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x half>* [[__REINT_71614]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_85114]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x half>* [[__REINT_85114]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[TMP8]], align 8
 // CHECK-NEXT:    [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP9]], i32 0
-// CHECK-NEXT:    store i16 [[VGET_LANE18]], i16* [[__REINT1_71615]], align 2
-// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[__REINT1_71615]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE18]], i16* [[__REINT1_85115]], align 2
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[__REINT1_85115]] to half*
 // CHECK-NEXT:    [[TMP11:%.*]] = load half, half* [[TMP10]], align 2
 // CHECK-NEXT:    [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP11]], i32 2
-// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_71624]], align 8
-// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x half>* [[__REINT_71624]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_85124]], align 8
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x half>* [[__REINT_85124]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i16>, <4 x i16>* [[TMP12]], align 8
 // CHECK-NEXT:    [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP13]], i32 0
-// CHECK-NEXT:    store i16 [[VGET_LANE28]], i16* [[__REINT1_71625]], align 2
-// CHECK-NEXT:    [[TMP14:%.*]] = bitcast i16* [[__REINT1_71625]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE28]], i16* [[__REINT1_85125]], align 2
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast i16* [[__REINT1_85125]] to half*
 // CHECK-NEXT:    [[TMP15:%.*]] = load half, half* [[TMP14]], align 2
 // CHECK-NEXT:    [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP15]], i32 3
 // CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
-// CHECK-NEXT:    [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #3
+// CHECK-NEXT:    [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #[[ATTR3]]
 // CHECK-NEXT:    ret <2 x float> [[VFMLAL_LOW3_I]]
 //
 float32x2_t test_vfmlal_lane_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
@@ -162,50 +162,50 @@ float32x2_t test_vfmlal_lane_low_f16(float32x2_t a, float16x4_t b, float16x4_t c
 
 // CHECK-LABEL: @test_vfmlal_lane_high_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__REINT_716:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_716:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_7164:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_7165:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71614:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_71615:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71624:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_71625:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    store <4 x half> [[C:%.*]], <4 x half>* [[__REINT_716]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_716]] to <4 x i16>*
+// CHECK-NEXT:    [[__REINT_851:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_851:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_8514:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_8515:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85114:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_85115:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85124:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_85125:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    store <4 x half> [[C:%.*]], <4 x half>* [[__REINT_851]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_851]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8
 // CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 1
-// CHECK-NEXT:    store i16 [[VGET_LANE]], i16* [[__REINT1_716]], align 2
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[__REINT1_716]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE]], i16* [[__REINT1_851]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[__REINT1_851]] to half*
 // CHECK-NEXT:    [[TMP3:%.*]] = load half, half* [[TMP2]], align 2
 // CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP3]], i32 0
-// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_7164]], align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x half>* [[__REINT_7164]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_8514]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x half>* [[__REINT_8514]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[TMP4]], align 8
 // CHECK-NEXT:    [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 1
-// CHECK-NEXT:    store i16 [[VGET_LANE8]], i16* [[__REINT1_7165]], align 2
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[__REINT1_7165]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE8]], i16* [[__REINT1_8515]], align 2
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[__REINT1_8515]] to half*
 // CHECK-NEXT:    [[TMP7:%.*]] = load half, half* [[TMP6]], align 2
 // CHECK-NEXT:    [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP7]], i32 1
-// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_71614]], align 8
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x half>* [[__REINT_71614]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_85114]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x half>* [[__REINT_85114]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[TMP8]], align 8
 // CHECK-NEXT:    [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP9]], i32 1
-// CHECK-NEXT:    store i16 [[VGET_LANE18]], i16* [[__REINT1_71615]], align 2
-// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[__REINT1_71615]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE18]], i16* [[__REINT1_85115]], align 2
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[__REINT1_85115]] to half*
 // CHECK-NEXT:    [[TMP11:%.*]] = load half, half* [[TMP10]], align 2
 // CHECK-NEXT:    [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP11]], i32 2
-// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_71624]], align 8
-// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x half>* [[__REINT_71624]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_85124]], align 8
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x half>* [[__REINT_85124]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i16>, <4 x i16>* [[TMP12]], align 8
 // CHECK-NEXT:    [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP13]], i32 1
-// CHECK-NEXT:    store i16 [[VGET_LANE28]], i16* [[__REINT1_71625]], align 2
-// CHECK-NEXT:    [[TMP14:%.*]] = bitcast i16* [[__REINT1_71625]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE28]], i16* [[__REINT1_85125]], align 2
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast i16* [[__REINT1_85125]] to half*
 // CHECK-NEXT:    [[TMP15:%.*]] = load half, half* [[TMP14]], align 2
 // CHECK-NEXT:    [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP15]], i32 3
 // CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
-// CHECK-NEXT:    [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #3
+// CHECK-NEXT:    [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #[[ATTR3]]
 // CHECK-NEXT:    ret <2 x float> [[VFMLAL_HIGH3_I]]
 //
 float32x2_t test_vfmlal_lane_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
@@ -214,90 +214,90 @@ float32x2_t test_vfmlal_lane_high_f16(float32x2_t a, float16x4_t b, float16x4_t
 
 // CHECK-LABEL: @test_vfmlalq_lane_low_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__REINT_716:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_716:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_7164:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_7165:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71614:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_71615:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71624:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_71625:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71634:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_71635:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71644:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_71645:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71654:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_71655:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71664:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_71665:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    store <4 x half> [[C:%.*]], <4 x half>* [[__REINT_716]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_716]] to <4 x i16>*
+// CHECK-NEXT:    [[__REINT_851:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_851:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_8514:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_8515:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85114:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_85115:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85124:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_85125:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85134:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_85135:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85144:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_85145:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85154:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_85155:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85164:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_85165:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    store <4 x half> [[C:%.*]], <4 x half>* [[__REINT_851]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_851]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8
 // CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 2
-// CHECK-NEXT:    store i16 [[VGET_LANE]], i16* [[__REINT1_716]], align 2
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[__REINT1_716]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE]], i16* [[__REINT1_851]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[__REINT1_851]] to half*
 // CHECK-NEXT:    [[TMP3:%.*]] = load half, half* [[TMP2]], align 2
 // CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP3]], i32 0
-// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_7164]], align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x half>* [[__REINT_7164]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_8514]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x half>* [[__REINT_8514]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[TMP4]], align 8
 // CHECK-NEXT:    [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
-// CHECK-NEXT:    store i16 [[VGET_LANE8]], i16* [[__REINT1_7165]], align 2
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[__REINT1_7165]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE8]], i16* [[__REINT1_8515]], align 2
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[__REINT1_8515]] to half*
 // CHECK-NEXT:    [[TMP7:%.*]] = load half, half* [[TMP6]], align 2
 // CHECK-NEXT:    [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP7]], i32 1
-// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_71614]], align 8
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x half>* [[__REINT_71614]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_85114]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x half>* [[__REINT_85114]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[TMP8]], align 8
 // CHECK-NEXT:    [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP9]], i32 2
-// CHECK-NEXT:    store i16 [[VGET_LANE18]], i16* [[__REINT1_71615]], align 2
-// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[__REINT1_71615]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE18]], i16* [[__REINT1_85115]], align 2
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[__REINT1_85115]] to half*
 // CHECK-NEXT:    [[TMP11:%.*]] = load half, half* [[TMP10]], align 2
 // CHECK-NEXT:    [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP11]], i32 2
-// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_71624]], align 8
-// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x half>* [[__REINT_71624]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_85124]], align 8
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x half>* [[__REINT_85124]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i16>, <4 x i16>* [[TMP12]], align 8
 // CHECK-NEXT:    [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP13]], i32 2
-// CHECK-NEXT:    store i16 [[VGET_LANE28]], i16* [[__REINT1_71625]], align 2
-// CHECK-NEXT:    [[TMP14:%.*]] = bitcast i16* [[__REINT1_71625]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE28]], i16* [[__REINT1_85125]], align 2
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast i16* [[__REINT1_85125]] to half*
 // CHECK-NEXT:    [[TMP15:%.*]] = load half, half* [[TMP14]], align 2
 // CHECK-NEXT:    [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP15]], i32 3
-// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_71634]], align 8
-// CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x half>* [[__REINT_71634]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_85134]], align 8
+// CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x half>* [[__REINT_85134]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP17:%.*]] = load <4 x i16>, <4 x i16>* [[TMP16]], align 8
 // CHECK-NEXT:    [[VGET_LANE38:%.*]] = extractelement <4 x i16> [[TMP17]], i32 2
-// CHECK-NEXT:    store i16 [[VGET_LANE38]], i16* [[__REINT1_71635]], align 2
-// CHECK-NEXT:    [[TMP18:%.*]] = bitcast i16* [[__REINT1_71635]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE38]], i16* [[__REINT1_85135]], align 2
+// CHECK-NEXT:    [[TMP18:%.*]] = bitcast i16* [[__REINT1_85135]] to half*
 // CHECK-NEXT:    [[TMP19:%.*]] = load half, half* [[TMP18]], align 2
 // CHECK-NEXT:    [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP19]], i32 4
-// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_71644]], align 8
-// CHECK-NEXT:    [[TMP20:%.*]] = bitcast <4 x half>* [[__REINT_71644]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_85144]], align 8
+// CHECK-NEXT:    [[TMP20:%.*]] = bitcast <4 x half>* [[__REINT_85144]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i16>, <4 x i16>* [[TMP20]], align 8
 // CHECK-NEXT:    [[VGET_LANE48:%.*]] = extractelement <4 x i16> [[TMP21]], i32 2
-// CHECK-NEXT:    store i16 [[VGET_LANE48]], i16* [[__REINT1_71645]], align 2
-// CHECK-NEXT:    [[TMP22:%.*]] = bitcast i16* [[__REINT1_71645]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE48]], i16* [[__REINT1_85145]], align 2
+// CHECK-NEXT:    [[TMP22:%.*]] = bitcast i16* [[__REINT1_85145]] to half*
 // CHECK-NEXT:    [[TMP23:%.*]] = load half, half* [[TMP22]], align 2
 // CHECK-NEXT:    [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP23]], i32 5
-// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_71654]], align 8
-// CHECK-NEXT:    [[TMP24:%.*]] = bitcast <4 x half>* [[__REINT_71654]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_85154]], align 8
+// CHECK-NEXT:    [[TMP24:%.*]] = bitcast <4 x half>* [[__REINT_85154]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP25:%.*]] = load <4 x i16>, <4 x i16>* [[TMP24]], align 8
 // CHECK-NEXT:    [[VGET_LANE58:%.*]] = extractelement <4 x i16> [[TMP25]], i32 2
-// CHECK-NEXT:    store i16 [[VGET_LANE58]], i16* [[__REINT1_71655]], align 2
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast i16* [[__REINT1_71655]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE58]], i16* [[__REINT1_85155]], align 2
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast i16* [[__REINT1_85155]] to half*
 // CHECK-NEXT:    [[TMP27:%.*]] = load half, half* [[TMP26]], align 2
 // CHECK-NEXT:    [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP27]], i32 6
-// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_71664]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = bitcast <4 x half>* [[__REINT_71664]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_85164]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = bitcast <4 x half>* [[__REINT_85164]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP29:%.*]] = load <4 x i16>, <4 x i16>* [[TMP28]], align 8
 // CHECK-NEXT:    [[VGET_LANE68:%.*]] = extractelement <4 x i16> [[TMP29]], i32 2
-// CHECK-NEXT:    store i16 [[VGET_LANE68]], i16* [[__REINT1_71665]], align 2
-// CHECK-NEXT:    [[TMP30:%.*]] = bitcast i16* [[__REINT1_71665]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE68]], i16* [[__REINT1_85165]], align 2
+// CHECK-NEXT:    [[TMP30:%.*]] = bitcast i16* [[__REINT1_85165]] to half*
 // CHECK-NEXT:    [[TMP31:%.*]] = load half, half* [[TMP30]], align 2
 // CHECK-NEXT:    [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP31]], i32 7
 // CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
-// CHECK-NEXT:    [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #3
+// CHECK-NEXT:    [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #[[ATTR3]]
 // CHECK-NEXT:    ret <4 x float> [[VFMLAL_LOW3_I]]
 //
 float32x4_t test_vfmlalq_lane_low_f16(float32x4_t a, float16x8_t b, float16x4_t c) {
@@ -306,90 +306,90 @@ float32x4_t test_vfmlalq_lane_low_f16(float32x4_t a, float16x8_t b, float16x4_t
 
 // CHECK-LABEL: @test_vfmlalq_lane_high_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__REINT_716:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_716:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_7164:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_7165:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71614:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_71615:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71624:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_71625:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71634:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_71635:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71644:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_71645:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71654:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_71655:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71664:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_71665:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    store <4 x half> [[C:%.*]], <4 x half>* [[__REINT_716]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_716]] to <4 x i16>*
+// CHECK-NEXT:    [[__REINT_851:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_851:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_8514:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_8515:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85114:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_85115:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85124:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_85125:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85134:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_85135:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85144:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_85145:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85154:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_85155:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85164:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_85165:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    store <4 x half> [[C:%.*]], <4 x half>* [[__REINT_851]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_851]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8
 // CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
-// CHECK-NEXT:    store i16 [[VGET_LANE]], i16* [[__REINT1_716]], align 2
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[__REINT1_716]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE]], i16* [[__REINT1_851]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[__REINT1_851]] to half*
 // CHECK-NEXT:    [[TMP3:%.*]] = load half, half* [[TMP2]], align 2
 // CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP3]], i32 0
-// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_7164]], align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x half>* [[__REINT_7164]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_8514]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x half>* [[__REINT_8514]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[TMP4]], align 8
 // CHECK-NEXT:    [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
-// CHECK-NEXT:    store i16 [[VGET_LANE8]], i16* [[__REINT1_7165]], align 2
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[__REINT1_7165]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE8]], i16* [[__REINT1_8515]], align 2
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[__REINT1_8515]] to half*
 // CHECK-NEXT:    [[TMP7:%.*]] = load half, half* [[TMP6]], align 2
 // CHECK-NEXT:    [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP7]], i32 1
-// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_71614]], align 8
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x half>* [[__REINT_71614]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_85114]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x half>* [[__REINT_85114]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[TMP8]], align 8
 // CHECK-NEXT:    [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP9]], i32 3
-// CHECK-NEXT:    store i16 [[VGET_LANE18]], i16* [[__REINT1_71615]], align 2
-// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[__REINT1_71615]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE18]], i16* [[__REINT1_85115]], align 2
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[__REINT1_85115]] to half*
 // CHECK-NEXT:    [[TMP11:%.*]] = load half, half* [[TMP10]], align 2
 // CHECK-NEXT:    [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP11]], i32 2
-// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_71624]], align 8
-// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x half>* [[__REINT_71624]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_85124]], align 8
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x half>* [[__REINT_85124]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i16>, <4 x i16>* [[TMP12]], align 8
 // CHECK-NEXT:    [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP13]], i32 3
-// CHECK-NEXT:    store i16 [[VGET_LANE28]], i16* [[__REINT1_71625]], align 2
-// CHECK-NEXT:    [[TMP14:%.*]] = bitcast i16* [[__REINT1_71625]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE28]], i16* [[__REINT1_85125]], align 2
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast i16* [[__REINT1_85125]] to half*
 // CHECK-NEXT:    [[TMP15:%.*]] = load half, half* [[TMP14]], align 2
 // CHECK-NEXT:    [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP15]], i32 3
-// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_71634]], align 8
-// CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x half>* [[__REINT_71634]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_85134]], align 8
+// CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x half>* [[__REINT_85134]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP17:%.*]] = load <4 x i16>, <4 x i16>* [[TMP16]], align 8
 // CHECK-NEXT:    [[VGET_LANE38:%.*]] = extractelement <4 x i16> [[TMP17]], i32 3
-// CHECK-NEXT:    store i16 [[VGET_LANE38]], i16* [[__REINT1_71635]], align 2
-// CHECK-NEXT:    [[TMP18:%.*]] = bitcast i16* [[__REINT1_71635]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE38]], i16* [[__REINT1_85135]], align 2
+// CHECK-NEXT:    [[TMP18:%.*]] = bitcast i16* [[__REINT1_85135]] to half*
 // CHECK-NEXT:    [[TMP19:%.*]] = load half, half* [[TMP18]], align 2
 // CHECK-NEXT:    [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP19]], i32 4
-// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_71644]], align 8
-// CHECK-NEXT:    [[TMP20:%.*]] = bitcast <4 x half>* [[__REINT_71644]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_85144]], align 8
+// CHECK-NEXT:    [[TMP20:%.*]] = bitcast <4 x half>* [[__REINT_85144]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i16>, <4 x i16>* [[TMP20]], align 8
 // CHECK-NEXT:    [[VGET_LANE48:%.*]] = extractelement <4 x i16> [[TMP21]], i32 3
-// CHECK-NEXT:    store i16 [[VGET_LANE48]], i16* [[__REINT1_71645]], align 2
-// CHECK-NEXT:    [[TMP22:%.*]] = bitcast i16* [[__REINT1_71645]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE48]], i16* [[__REINT1_85145]], align 2
+// CHECK-NEXT:    [[TMP22:%.*]] = bitcast i16* [[__REINT1_85145]] to half*
 // CHECK-NEXT:    [[TMP23:%.*]] = load half, half* [[TMP22]], align 2
 // CHECK-NEXT:    [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP23]], i32 5
-// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_71654]], align 8
-// CHECK-NEXT:    [[TMP24:%.*]] = bitcast <4 x half>* [[__REINT_71654]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_85154]], align 8
+// CHECK-NEXT:    [[TMP24:%.*]] = bitcast <4 x half>* [[__REINT_85154]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP25:%.*]] = load <4 x i16>, <4 x i16>* [[TMP24]], align 8
 // CHECK-NEXT:    [[VGET_LANE58:%.*]] = extractelement <4 x i16> [[TMP25]], i32 3
-// CHECK-NEXT:    store i16 [[VGET_LANE58]], i16* [[__REINT1_71655]], align 2
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast i16* [[__REINT1_71655]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE58]], i16* [[__REINT1_85155]], align 2
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast i16* [[__REINT1_85155]] to half*
 // CHECK-NEXT:    [[TMP27:%.*]] = load half, half* [[TMP26]], align 2
 // CHECK-NEXT:    [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP27]], i32 6
-// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_71664]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = bitcast <4 x half>* [[__REINT_71664]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_85164]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = bitcast <4 x half>* [[__REINT_85164]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP29:%.*]] = load <4 x i16>, <4 x i16>* [[TMP28]], align 8
 // CHECK-NEXT:    [[VGET_LANE68:%.*]] = extractelement <4 x i16> [[TMP29]], i32 3
-// CHECK-NEXT:    store i16 [[VGET_LANE68]], i16* [[__REINT1_71665]], align 2
-// CHECK-NEXT:    [[TMP30:%.*]] = bitcast i16* [[__REINT1_71665]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE68]], i16* [[__REINT1_85165]], align 2
+// CHECK-NEXT:    [[TMP30:%.*]] = bitcast i16* [[__REINT1_85165]] to half*
 // CHECK-NEXT:    [[TMP31:%.*]] = load half, half* [[TMP30]], align 2
 // CHECK-NEXT:    [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP31]], i32 7
 // CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
-// CHECK-NEXT:    [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #3
+// CHECK-NEXT:    [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #[[ATTR3]]
 // CHECK-NEXT:    ret <4 x float> [[VFMLAL_HIGH3_I]]
 //
 float32x4_t test_vfmlalq_lane_high_f16(float32x4_t a, float16x8_t b, float16x4_t c) {
@@ -398,50 +398,50 @@ float32x4_t test_vfmlalq_lane_high_f16(float32x4_t a, float16x8_t b, float16x4_t
 
 // CHECK-LABEL: @test_vfmlal_laneq_low_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__REINT_719:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_719:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_7194:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_7195:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71914:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_71915:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71924:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_71925:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    store <8 x half> [[C:%.*]], <8 x half>* [[__REINT_719]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_719]] to <8 x i16>*
+// CHECK-NEXT:    [[__REINT_854:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_854:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_8544:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_8545:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85414:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_85415:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85424:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_85425:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    store <8 x half> [[C:%.*]], <8 x half>* [[__REINT_854]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_854]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4
-// CHECK-NEXT:    store i16 [[VGETQ_LANE]], i16* [[__REINT1_719]], align 2
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[__REINT1_719]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE]], i16* [[__REINT1_854]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[__REINT1_854]] to half*
 // CHECK-NEXT:    [[TMP3:%.*]] = load half, half* [[TMP2]], align 2
 // CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP3]], i32 0
-// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_7194]], align 16
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x half>* [[__REINT_7194]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_8544]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x half>* [[__REINT_8544]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[TMP4]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP5]], i32 4
-// CHECK-NEXT:    store i16 [[VGETQ_LANE8]], i16* [[__REINT1_7195]], align 2
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[__REINT1_7195]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE8]], i16* [[__REINT1_8545]], align 2
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[__REINT1_8545]] to half*
 // CHECK-NEXT:    [[TMP7:%.*]] = load half, half* [[TMP6]], align 2
 // CHECK-NEXT:    [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP7]], i32 1
-// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_71914]], align 16
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x half>* [[__REINT_71914]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_85414]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x half>* [[__REINT_85414]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[TMP8]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP9]], i32 4
-// CHECK-NEXT:    store i16 [[VGETQ_LANE18]], i16* [[__REINT1_71915]], align 2
-// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[__REINT1_71915]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE18]], i16* [[__REINT1_85415]], align 2
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[__REINT1_85415]] to half*
 // CHECK-NEXT:    [[TMP11:%.*]] = load half, half* [[TMP10]], align 2
 // CHECK-NEXT:    [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP11]], i32 2
-// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_71924]], align 16
-// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x half>* [[__REINT_71924]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_85424]], align 16
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x half>* [[__REINT_85424]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i16>, <8 x i16>* [[TMP12]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP13]], i32 4
-// CHECK-NEXT:    store i16 [[VGETQ_LANE28]], i16* [[__REINT1_71925]], align 2
-// CHECK-NEXT:    [[TMP14:%.*]] = bitcast i16* [[__REINT1_71925]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE28]], i16* [[__REINT1_85425]], align 2
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast i16* [[__REINT1_85425]] to half*
 // CHECK-NEXT:    [[TMP15:%.*]] = load half, half* [[TMP14]], align 2
 // CHECK-NEXT:    [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP15]], i32 3
 // CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
-// CHECK-NEXT:    [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #3
+// CHECK-NEXT:    [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #[[ATTR3]]
 // CHECK-NEXT:    ret <2 x float> [[VFMLAL_LOW3_I]]
 //
 float32x2_t test_vfmlal_laneq_low_f16(float32x2_t a, float16x4_t b, float16x8_t c) {
@@ -450,50 +450,50 @@ float32x2_t test_vfmlal_laneq_low_f16(float32x2_t a, float16x4_t b, float16x8_t
 
 // CHECK-LABEL: @test_vfmlal_laneq_high_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__REINT_719:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_719:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_7194:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_7195:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71914:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_71915:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71924:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_71925:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    store <8 x half> [[C:%.*]], <8 x half>* [[__REINT_719]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_719]] to <8 x i16>*
+// CHECK-NEXT:    [[__REINT_854:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_854:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_8544:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_8545:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85414:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_85415:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85424:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_85425:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    store <8 x half> [[C:%.*]], <8 x half>* [[__REINT_854]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_854]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5
-// CHECK-NEXT:    store i16 [[VGETQ_LANE]], i16* [[__REINT1_719]], align 2
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[__REINT1_719]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE]], i16* [[__REINT1_854]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[__REINT1_854]] to half*
 // CHECK-NEXT:    [[TMP3:%.*]] = load half, half* [[TMP2]], align 2
 // CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP3]], i32 0
-// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_7194]], align 16
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x half>* [[__REINT_7194]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_8544]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x half>* [[__REINT_8544]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[TMP4]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP5]], i32 5
-// CHECK-NEXT:    store i16 [[VGETQ_LANE8]], i16* [[__REINT1_7195]], align 2
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[__REINT1_7195]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE8]], i16* [[__REINT1_8545]], align 2
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[__REINT1_8545]] to half*
 // CHECK-NEXT:    [[TMP7:%.*]] = load half, half* [[TMP6]], align 2
 // CHECK-NEXT:    [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP7]], i32 1
-// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_71914]], align 16
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x half>* [[__REINT_71914]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_85414]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x half>* [[__REINT_85414]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[TMP8]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP9]], i32 5
-// CHECK-NEXT:    store i16 [[VGETQ_LANE18]], i16* [[__REINT1_71915]], align 2
-// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[__REINT1_71915]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE18]], i16* [[__REINT1_85415]], align 2
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[__REINT1_85415]] to half*
 // CHECK-NEXT:    [[TMP11:%.*]] = load half, half* [[TMP10]], align 2
 // CHECK-NEXT:    [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP11]], i32 2
-// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_71924]], align 16
-// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x half>* [[__REINT_71924]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_85424]], align 16
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x half>* [[__REINT_85424]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i16>, <8 x i16>* [[TMP12]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP13]], i32 5
-// CHECK-NEXT:    store i16 [[VGETQ_LANE28]], i16* [[__REINT1_71925]], align 2
-// CHECK-NEXT:    [[TMP14:%.*]] = bitcast i16* [[__REINT1_71925]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE28]], i16* [[__REINT1_85425]], align 2
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast i16* [[__REINT1_85425]] to half*
 // CHECK-NEXT:    [[TMP15:%.*]] = load half, half* [[TMP14]], align 2
 // CHECK-NEXT:    [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP15]], i32 3
 // CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
-// CHECK-NEXT:    [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #3
+// CHECK-NEXT:    [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #[[ATTR3]]
 // CHECK-NEXT:    ret <2 x float> [[VFMLAL_HIGH3_I]]
 //
 float32x2_t test_vfmlal_laneq_high_f16(float32x2_t a, float16x4_t b, float16x8_t c) {
@@ -502,90 +502,90 @@ float32x2_t test_vfmlal_laneq_high_f16(float32x2_t a, float16x4_t b, float16x8_t
 
 // CHECK-LABEL: @test_vfmlalq_laneq_low_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__REINT_719:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_719:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_7194:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_7195:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71914:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_71915:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71924:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_71925:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71934:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_71935:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71944:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_71945:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71954:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_71955:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71964:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_71965:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    store <8 x half> [[C:%.*]], <8 x half>* [[__REINT_719]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_719]] to <8 x i16>*
+// CHECK-NEXT:    [[__REINT_854:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_854:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_8544:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_8545:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85414:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_85415:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85424:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_85425:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85434:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_85435:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85444:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_85445:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85454:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_85455:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85464:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_85465:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    store <8 x half> [[C:%.*]], <8 x half>* [[__REINT_854]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_854]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6
-// CHECK-NEXT:    store i16 [[VGETQ_LANE]], i16* [[__REINT1_719]], align 2
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[__REINT1_719]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE]], i16* [[__REINT1_854]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[__REINT1_854]] to half*
 // CHECK-NEXT:    [[TMP3:%.*]] = load half, half* [[TMP2]], align 2
 // CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP3]], i32 0
-// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_7194]], align 16
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x half>* [[__REINT_7194]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_8544]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x half>* [[__REINT_8544]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[TMP4]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP5]], i32 6
-// CHECK-NEXT:    store i16 [[VGETQ_LANE8]], i16* [[__REINT1_7195]], align 2
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[__REINT1_7195]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE8]], i16* [[__REINT1_8545]], align 2
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[__REINT1_8545]] to half*
 // CHECK-NEXT:    [[TMP7:%.*]] = load half, half* [[TMP6]], align 2
 // CHECK-NEXT:    [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP7]], i32 1
-// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_71914]], align 16
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x half>* [[__REINT_71914]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_85414]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x half>* [[__REINT_85414]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[TMP8]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP9]], i32 6
-// CHECK-NEXT:    store i16 [[VGETQ_LANE18]], i16* [[__REINT1_71915]], align 2
-// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[__REINT1_71915]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE18]], i16* [[__REINT1_85415]], align 2
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[__REINT1_85415]] to half*
 // CHECK-NEXT:    [[TMP11:%.*]] = load half, half* [[TMP10]], align 2
 // CHECK-NEXT:    [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP11]], i32 2
-// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_71924]], align 16
-// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x half>* [[__REINT_71924]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_85424]], align 16
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x half>* [[__REINT_85424]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i16>, <8 x i16>* [[TMP12]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP13]], i32 6
-// CHECK-NEXT:    store i16 [[VGETQ_LANE28]], i16* [[__REINT1_71925]], align 2
-// CHECK-NEXT:    [[TMP14:%.*]] = bitcast i16* [[__REINT1_71925]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE28]], i16* [[__REINT1_85425]], align 2
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast i16* [[__REINT1_85425]] to half*
 // CHECK-NEXT:    [[TMP15:%.*]] = load half, half* [[TMP14]], align 2
 // CHECK-NEXT:    [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP15]], i32 3
-// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_71934]], align 16
-// CHECK-NEXT:    [[TMP16:%.*]] = bitcast <8 x half>* [[__REINT_71934]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_85434]], align 16
+// CHECK-NEXT:    [[TMP16:%.*]] = bitcast <8 x half>* [[__REINT_85434]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP17:%.*]] = load <8 x i16>, <8 x i16>* [[TMP16]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE38:%.*]] = extractelement <8 x i16> [[TMP17]], i32 6
-// CHECK-NEXT:    store i16 [[VGETQ_LANE38]], i16* [[__REINT1_71935]], align 2
-// CHECK-NEXT:    [[TMP18:%.*]] = bitcast i16* [[__REINT1_71935]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE38]], i16* [[__REINT1_85435]], align 2
+// CHECK-NEXT:    [[TMP18:%.*]] = bitcast i16* [[__REINT1_85435]] to half*
 // CHECK-NEXT:    [[TMP19:%.*]] = load half, half* [[TMP18]], align 2
 // CHECK-NEXT:    [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP19]], i32 4
-// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_71944]], align 16
-// CHECK-NEXT:    [[TMP20:%.*]] = bitcast <8 x half>* [[__REINT_71944]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_85444]], align 16
+// CHECK-NEXT:    [[TMP20:%.*]] = bitcast <8 x half>* [[__REINT_85444]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i16>, <8 x i16>* [[TMP20]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE48:%.*]] = extractelement <8 x i16> [[TMP21]], i32 6
-// CHECK-NEXT:    store i16 [[VGETQ_LANE48]], i16* [[__REINT1_71945]], align 2
-// CHECK-NEXT:    [[TMP22:%.*]] = bitcast i16* [[__REINT1_71945]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE48]], i16* [[__REINT1_85445]], align 2
+// CHECK-NEXT:    [[TMP22:%.*]] = bitcast i16* [[__REINT1_85445]] to half*
 // CHECK-NEXT:    [[TMP23:%.*]] = load half, half* [[TMP22]], align 2
 // CHECK-NEXT:    [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP23]], i32 5
-// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_71954]], align 16
-// CHECK-NEXT:    [[TMP24:%.*]] = bitcast <8 x half>* [[__REINT_71954]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_85454]], align 16
+// CHECK-NEXT:    [[TMP24:%.*]] = bitcast <8 x half>* [[__REINT_85454]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i16>, <8 x i16>* [[TMP24]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE58:%.*]] = extractelement <8 x i16> [[TMP25]], i32 6
-// CHECK-NEXT:    store i16 [[VGETQ_LANE58]], i16* [[__REINT1_71955]], align 2
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast i16* [[__REINT1_71955]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE58]], i16* [[__REINT1_85455]], align 2
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast i16* [[__REINT1_85455]] to half*
 // CHECK-NEXT:    [[TMP27:%.*]] = load half, half* [[TMP26]], align 2
 // CHECK-NEXT:    [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP27]], i32 6
-// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_71964]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = bitcast <8 x half>* [[__REINT_71964]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_85464]], align 16
+// CHECK-NEXT:    [[TMP28:%.*]] = bitcast <8 x half>* [[__REINT_85464]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP29:%.*]] = load <8 x i16>, <8 x i16>* [[TMP28]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE68:%.*]] = extractelement <8 x i16> [[TMP29]], i32 6
-// CHECK-NEXT:    store i16 [[VGETQ_LANE68]], i16* [[__REINT1_71965]], align 2
-// CHECK-NEXT:    [[TMP30:%.*]] = bitcast i16* [[__REINT1_71965]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE68]], i16* [[__REINT1_85465]], align 2
+// CHECK-NEXT:    [[TMP30:%.*]] = bitcast i16* [[__REINT1_85465]] to half*
 // CHECK-NEXT:    [[TMP31:%.*]] = load half, half* [[TMP30]], align 2
 // CHECK-NEXT:    [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP31]], i32 7
 // CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
-// CHECK-NEXT:    [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #3
+// CHECK-NEXT:    [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #[[ATTR3]]
 // CHECK-NEXT:    ret <4 x float> [[VFMLAL_LOW3_I]]
 //
 float32x4_t test_vfmlalq_laneq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
@@ -594,90 +594,90 @@ float32x4_t test_vfmlalq_laneq_low_f16(float32x4_t a, float16x8_t b, float16x8_t
 
 // CHECK-LABEL: @test_vfmlalq_laneq_high_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__REINT_719:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_719:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_7194:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_7195:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71914:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_71915:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71924:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_71925:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71934:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_71935:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71944:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_71945:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71954:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_71955:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71964:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_71965:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    store <8 x half> [[C:%.*]], <8 x half>* [[__REINT_719]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_719]] to <8 x i16>*
+// CHECK-NEXT:    [[__REINT_854:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_854:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_8544:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_8545:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85414:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_85415:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85424:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_85425:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85434:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_85435:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85444:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_85445:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85454:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_85455:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85464:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_85465:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    store <8 x half> [[C:%.*]], <8 x half>* [[__REINT_854]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_854]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
-// CHECK-NEXT:    store i16 [[VGETQ_LANE]], i16* [[__REINT1_719]], align 2
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[__REINT1_719]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE]], i16* [[__REINT1_854]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[__REINT1_854]] to half*
 // CHECK-NEXT:    [[TMP3:%.*]] = load half, half* [[TMP2]], align 2
 // CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP3]], i32 0
-// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_7194]], align 16
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x half>* [[__REINT_7194]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_8544]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x half>* [[__REINT_8544]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[TMP4]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP5]], i32 7
-// CHECK-NEXT:    store i16 [[VGETQ_LANE8]], i16* [[__REINT1_7195]], align 2
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[__REINT1_7195]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE8]], i16* [[__REINT1_8545]], align 2
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[__REINT1_8545]] to half*
 // CHECK-NEXT:    [[TMP7:%.*]] = load half, half* [[TMP6]], align 2
 // CHECK-NEXT:    [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP7]], i32 1
-// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_71914]], align 16
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x half>* [[__REINT_71914]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_85414]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x half>* [[__REINT_85414]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[TMP8]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP9]], i32 7
-// CHECK-NEXT:    store i16 [[VGETQ_LANE18]], i16* [[__REINT1_71915]], align 2
-// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[__REINT1_71915]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE18]], i16* [[__REINT1_85415]], align 2
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[__REINT1_85415]] to half*
 // CHECK-NEXT:    [[TMP11:%.*]] = load half, half* [[TMP10]], align 2
 // CHECK-NEXT:    [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP11]], i32 2
-// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_71924]], align 16
-// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x half>* [[__REINT_71924]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_85424]], align 16
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x half>* [[__REINT_85424]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i16>, <8 x i16>* [[TMP12]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP13]], i32 7
-// CHECK-NEXT:    store i16 [[VGETQ_LANE28]], i16* [[__REINT1_71925]], align 2
-// CHECK-NEXT:    [[TMP14:%.*]] = bitcast i16* [[__REINT1_71925]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE28]], i16* [[__REINT1_85425]], align 2
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast i16* [[__REINT1_85425]] to half*
 // CHECK-NEXT:    [[TMP15:%.*]] = load half, half* [[TMP14]], align 2
 // CHECK-NEXT:    [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP15]], i32 3
-// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_71934]], align 16
-// CHECK-NEXT:    [[TMP16:%.*]] = bitcast <8 x half>* [[__REINT_71934]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_85434]], align 16
+// CHECK-NEXT:    [[TMP16:%.*]] = bitcast <8 x half>* [[__REINT_85434]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP17:%.*]] = load <8 x i16>, <8 x i16>* [[TMP16]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE38:%.*]] = extractelement <8 x i16> [[TMP17]], i32 7
-// CHECK-NEXT:    store i16 [[VGETQ_LANE38]], i16* [[__REINT1_71935]], align 2
-// CHECK-NEXT:    [[TMP18:%.*]] = bitcast i16* [[__REINT1_71935]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE38]], i16* [[__REINT1_85435]], align 2
+// CHECK-NEXT:    [[TMP18:%.*]] = bitcast i16* [[__REINT1_85435]] to half*
 // CHECK-NEXT:    [[TMP19:%.*]] = load half, half* [[TMP18]], align 2
 // CHECK-NEXT:    [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP19]], i32 4
-// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_71944]], align 16
-// CHECK-NEXT:    [[TMP20:%.*]] = bitcast <8 x half>* [[__REINT_71944]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_85444]], align 16
+// CHECK-NEXT:    [[TMP20:%.*]] = bitcast <8 x half>* [[__REINT_85444]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i16>, <8 x i16>* [[TMP20]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE48:%.*]] = extractelement <8 x i16> [[TMP21]], i32 7
-// CHECK-NEXT:    store i16 [[VGETQ_LANE48]], i16* [[__REINT1_71945]], align 2
-// CHECK-NEXT:    [[TMP22:%.*]] = bitcast i16* [[__REINT1_71945]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE48]], i16* [[__REINT1_85445]], align 2
+// CHECK-NEXT:    [[TMP22:%.*]] = bitcast i16* [[__REINT1_85445]] to half*
 // CHECK-NEXT:    [[TMP23:%.*]] = load half, half* [[TMP22]], align 2
 // CHECK-NEXT:    [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP23]], i32 5
-// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_71954]], align 16
-// CHECK-NEXT:    [[TMP24:%.*]] = bitcast <8 x half>* [[__REINT_71954]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_85454]], align 16
+// CHECK-NEXT:    [[TMP24:%.*]] = bitcast <8 x half>* [[__REINT_85454]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i16>, <8 x i16>* [[TMP24]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE58:%.*]] = extractelement <8 x i16> [[TMP25]], i32 7
-// CHECK-NEXT:    store i16 [[VGETQ_LANE58]], i16* [[__REINT1_71955]], align 2
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast i16* [[__REINT1_71955]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE58]], i16* [[__REINT1_85455]], align 2
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast i16* [[__REINT1_85455]] to half*
 // CHECK-NEXT:    [[TMP27:%.*]] = load half, half* [[TMP26]], align 2
 // CHECK-NEXT:    [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP27]], i32 6
-// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_71964]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = bitcast <8 x half>* [[__REINT_71964]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_85464]], align 16
+// CHECK-NEXT:    [[TMP28:%.*]] = bitcast <8 x half>* [[__REINT_85464]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP29:%.*]] = load <8 x i16>, <8 x i16>* [[TMP28]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE68:%.*]] = extractelement <8 x i16> [[TMP29]], i32 7
-// CHECK-NEXT:    store i16 [[VGETQ_LANE68]], i16* [[__REINT1_71965]], align 2
-// CHECK-NEXT:    [[TMP30:%.*]] = bitcast i16* [[__REINT1_71965]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE68]], i16* [[__REINT1_85465]], align 2
+// CHECK-NEXT:    [[TMP30:%.*]] = bitcast i16* [[__REINT1_85465]] to half*
 // CHECK-NEXT:    [[TMP31:%.*]] = load half, half* [[TMP30]], align 2
 // CHECK-NEXT:    [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP31]], i32 7
 // CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
-// CHECK-NEXT:    [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #3
+// CHECK-NEXT:    [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #[[ATTR3]]
 // CHECK-NEXT:    ret <4 x float> [[VFMLAL_HIGH3_I]]
 //
 float32x4_t test_vfmlalq_laneq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
@@ -686,50 +686,50 @@ float32x4_t test_vfmlalq_laneq_high_f16(float32x4_t a, float16x8_t b, float16x8_
 
 // CHECK-LABEL: @test_vfmlsl_lane_low_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__REINT_716:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_716:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_7164:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_7165:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71614:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_71615:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71624:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_71625:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    store <4 x half> [[C:%.*]], <4 x half>* [[__REINT_716]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_716]] to <4 x i16>*
+// CHECK-NEXT:    [[__REINT_851:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_851:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_8514:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_8515:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85114:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_85115:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85124:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_85125:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    store <4 x half> [[C:%.*]], <4 x half>* [[__REINT_851]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_851]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8
 // CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 0
-// CHECK-NEXT:    store i16 [[VGET_LANE]], i16* [[__REINT1_716]], align 2
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[__REINT1_716]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE]], i16* [[__REINT1_851]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[__REINT1_851]] to half*
 // CHECK-NEXT:    [[TMP3:%.*]] = load half, half* [[TMP2]], align 2
 // CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP3]], i32 0
-// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_7164]], align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x half>* [[__REINT_7164]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_8514]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x half>* [[__REINT_8514]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[TMP4]], align 8
 // CHECK-NEXT:    [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 0
-// CHECK-NEXT:    store i16 [[VGET_LANE8]], i16* [[__REINT1_7165]], align 2
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[__REINT1_7165]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE8]], i16* [[__REINT1_8515]], align 2
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[__REINT1_8515]] to half*
 // CHECK-NEXT:    [[TMP7:%.*]] = load half, half* [[TMP6]], align 2
 // CHECK-NEXT:    [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP7]], i32 1
-// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_71614]], align 8
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x half>* [[__REINT_71614]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_85114]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x half>* [[__REINT_85114]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[TMP8]], align 8
 // CHECK-NEXT:    [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP9]], i32 0
-// CHECK-NEXT:    store i16 [[VGET_LANE18]], i16* [[__REINT1_71615]], align 2
-// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[__REINT1_71615]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE18]], i16* [[__REINT1_85115]], align 2
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[__REINT1_85115]] to half*
 // CHECK-NEXT:    [[TMP11:%.*]] = load half, half* [[TMP10]], align 2
 // CHECK-NEXT:    [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP11]], i32 2
-// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_71624]], align 8
-// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x half>* [[__REINT_71624]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_85124]], align 8
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x half>* [[__REINT_85124]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i16>, <4 x i16>* [[TMP12]], align 8
 // CHECK-NEXT:    [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP13]], i32 0
-// CHECK-NEXT:    store i16 [[VGET_LANE28]], i16* [[__REINT1_71625]], align 2
-// CHECK-NEXT:    [[TMP14:%.*]] = bitcast i16* [[__REINT1_71625]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE28]], i16* [[__REINT1_85125]], align 2
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast i16* [[__REINT1_85125]] to half*
 // CHECK-NEXT:    [[TMP15:%.*]] = load half, half* [[TMP14]], align 2
 // CHECK-NEXT:    [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP15]], i32 3
 // CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
-// CHECK-NEXT:    [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #3
+// CHECK-NEXT:    [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #[[ATTR3]]
 // CHECK-NEXT:    ret <2 x float> [[VFMLSL_LOW3_I]]
 //
 float32x2_t test_vfmlsl_lane_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
@@ -738,50 +738,50 @@ float32x2_t test_vfmlsl_lane_low_f16(float32x2_t a, float16x4_t b, float16x4_t c
 
 // CHECK-LABEL: @test_vfmlsl_lane_high_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__REINT_716:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_716:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_7164:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_7165:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71614:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_71615:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71624:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_71625:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    store <4 x half> [[C:%.*]], <4 x half>* [[__REINT_716]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_716]] to <4 x i16>*
+// CHECK-NEXT:    [[__REINT_851:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_851:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_8514:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_8515:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85114:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_85115:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85124:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_85125:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    store <4 x half> [[C:%.*]], <4 x half>* [[__REINT_851]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_851]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8
 // CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 1
-// CHECK-NEXT:    store i16 [[VGET_LANE]], i16* [[__REINT1_716]], align 2
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[__REINT1_716]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE]], i16* [[__REINT1_851]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[__REINT1_851]] to half*
 // CHECK-NEXT:    [[TMP3:%.*]] = load half, half* [[TMP2]], align 2
 // CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP3]], i32 0
-// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_7164]], align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x half>* [[__REINT_7164]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_8514]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x half>* [[__REINT_8514]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[TMP4]], align 8
 // CHECK-NEXT:    [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 1
-// CHECK-NEXT:    store i16 [[VGET_LANE8]], i16* [[__REINT1_7165]], align 2
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[__REINT1_7165]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE8]], i16* [[__REINT1_8515]], align 2
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[__REINT1_8515]] to half*
 // CHECK-NEXT:    [[TMP7:%.*]] = load half, half* [[TMP6]], align 2
 // CHECK-NEXT:    [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP7]], i32 1
-// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_71614]], align 8
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x half>* [[__REINT_71614]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_85114]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x half>* [[__REINT_85114]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[TMP8]], align 8
 // CHECK-NEXT:    [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP9]], i32 1
-// CHECK-NEXT:    store i16 [[VGET_LANE18]], i16* [[__REINT1_71615]], align 2
-// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[__REINT1_71615]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE18]], i16* [[__REINT1_85115]], align 2
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[__REINT1_85115]] to half*
 // CHECK-NEXT:    [[TMP11:%.*]] = load half, half* [[TMP10]], align 2
 // CHECK-NEXT:    [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP11]], i32 2
-// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_71624]], align 8
-// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x half>* [[__REINT_71624]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_85124]], align 8
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x half>* [[__REINT_85124]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i16>, <4 x i16>* [[TMP12]], align 8
 // CHECK-NEXT:    [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP13]], i32 1
-// CHECK-NEXT:    store i16 [[VGET_LANE28]], i16* [[__REINT1_71625]], align 2
-// CHECK-NEXT:    [[TMP14:%.*]] = bitcast i16* [[__REINT1_71625]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE28]], i16* [[__REINT1_85125]], align 2
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast i16* [[__REINT1_85125]] to half*
 // CHECK-NEXT:    [[TMP15:%.*]] = load half, half* [[TMP14]], align 2
 // CHECK-NEXT:    [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP15]], i32 3
 // CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
-// CHECK-NEXT:    [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #3
+// CHECK-NEXT:    [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #[[ATTR3]]
 // CHECK-NEXT:    ret <2 x float> [[VFMLSL_HIGH3_I]]
 //
 float32x2_t test_vfmlsl_lane_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
@@ -790,90 +790,90 @@ float32x2_t test_vfmlsl_lane_high_f16(float32x2_t a, float16x4_t b, float16x4_t
 
 // CHECK-LABEL: @test_vfmlslq_lane_low_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__REINT_716:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_716:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_7164:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_7165:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71614:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_71615:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71624:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_71625:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71634:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_71635:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71644:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_71645:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71654:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_71655:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71664:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_71665:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    store <4 x half> [[C:%.*]], <4 x half>* [[__REINT_716]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_716]] to <4 x i16>*
+// CHECK-NEXT:    [[__REINT_851:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_851:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_8514:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_8515:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85114:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_85115:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85124:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_85125:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85134:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_85135:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85144:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_85145:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85154:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_85155:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85164:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_85165:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    store <4 x half> [[C:%.*]], <4 x half>* [[__REINT_851]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_851]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8
 // CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 2
-// CHECK-NEXT:    store i16 [[VGET_LANE]], i16* [[__REINT1_716]], align 2
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[__REINT1_716]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE]], i16* [[__REINT1_851]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[__REINT1_851]] to half*
 // CHECK-NEXT:    [[TMP3:%.*]] = load half, half* [[TMP2]], align 2
 // CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP3]], i32 0
-// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_7164]], align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x half>* [[__REINT_7164]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_8514]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x half>* [[__REINT_8514]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[TMP4]], align 8
 // CHECK-NEXT:    [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
-// CHECK-NEXT:    store i16 [[VGET_LANE8]], i16* [[__REINT1_7165]], align 2
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[__REINT1_7165]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE8]], i16* [[__REINT1_8515]], align 2
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[__REINT1_8515]] to half*
 // CHECK-NEXT:    [[TMP7:%.*]] = load half, half* [[TMP6]], align 2
 // CHECK-NEXT:    [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP7]], i32 1
-// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_71614]], align 8
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x half>* [[__REINT_71614]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_85114]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x half>* [[__REINT_85114]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[TMP8]], align 8
 // CHECK-NEXT:    [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP9]], i32 2
-// CHECK-NEXT:    store i16 [[VGET_LANE18]], i16* [[__REINT1_71615]], align 2
-// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[__REINT1_71615]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE18]], i16* [[__REINT1_85115]], align 2
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[__REINT1_85115]] to half*
 // CHECK-NEXT:    [[TMP11:%.*]] = load half, half* [[TMP10]], align 2
 // CHECK-NEXT:    [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP11]], i32 2
-// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_71624]], align 8
-// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x half>* [[__REINT_71624]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_85124]], align 8
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x half>* [[__REINT_85124]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i16>, <4 x i16>* [[TMP12]], align 8
 // CHECK-NEXT:    [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP13]], i32 2
-// CHECK-NEXT:    store i16 [[VGET_LANE28]], i16* [[__REINT1_71625]], align 2
-// CHECK-NEXT:    [[TMP14:%.*]] = bitcast i16* [[__REINT1_71625]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE28]], i16* [[__REINT1_85125]], align 2
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast i16* [[__REINT1_85125]] to half*
 // CHECK-NEXT:    [[TMP15:%.*]] = load half, half* [[TMP14]], align 2
 // CHECK-NEXT:    [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP15]], i32 3
-// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_71634]], align 8
-// CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x half>* [[__REINT_71634]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_85134]], align 8
+// CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x half>* [[__REINT_85134]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP17:%.*]] = load <4 x i16>, <4 x i16>* [[TMP16]], align 8
 // CHECK-NEXT:    [[VGET_LANE38:%.*]] = extractelement <4 x i16> [[TMP17]], i32 2
-// CHECK-NEXT:    store i16 [[VGET_LANE38]], i16* [[__REINT1_71635]], align 2
-// CHECK-NEXT:    [[TMP18:%.*]] = bitcast i16* [[__REINT1_71635]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE38]], i16* [[__REINT1_85135]], align 2
+// CHECK-NEXT:    [[TMP18:%.*]] = bitcast i16* [[__REINT1_85135]] to half*
 // CHECK-NEXT:    [[TMP19:%.*]] = load half, half* [[TMP18]], align 2
 // CHECK-NEXT:    [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP19]], i32 4
-// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_71644]], align 8
-// CHECK-NEXT:    [[TMP20:%.*]] = bitcast <4 x half>* [[__REINT_71644]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_85144]], align 8
+// CHECK-NEXT:    [[TMP20:%.*]] = bitcast <4 x half>* [[__REINT_85144]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i16>, <4 x i16>* [[TMP20]], align 8
 // CHECK-NEXT:    [[VGET_LANE48:%.*]] = extractelement <4 x i16> [[TMP21]], i32 2
-// CHECK-NEXT:    store i16 [[VGET_LANE48]], i16* [[__REINT1_71645]], align 2
-// CHECK-NEXT:    [[TMP22:%.*]] = bitcast i16* [[__REINT1_71645]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE48]], i16* [[__REINT1_85145]], align 2
+// CHECK-NEXT:    [[TMP22:%.*]] = bitcast i16* [[__REINT1_85145]] to half*
 // CHECK-NEXT:    [[TMP23:%.*]] = load half, half* [[TMP22]], align 2
 // CHECK-NEXT:    [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP23]], i32 5
-// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_71654]], align 8
-// CHECK-NEXT:    [[TMP24:%.*]] = bitcast <4 x half>* [[__REINT_71654]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_85154]], align 8
+// CHECK-NEXT:    [[TMP24:%.*]] = bitcast <4 x half>* [[__REINT_85154]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP25:%.*]] = load <4 x i16>, <4 x i16>* [[TMP24]], align 8
 // CHECK-NEXT:    [[VGET_LANE58:%.*]] = extractelement <4 x i16> [[TMP25]], i32 2
-// CHECK-NEXT:    store i16 [[VGET_LANE58]], i16* [[__REINT1_71655]], align 2
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast i16* [[__REINT1_71655]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE58]], i16* [[__REINT1_85155]], align 2
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast i16* [[__REINT1_85155]] to half*
 // CHECK-NEXT:    [[TMP27:%.*]] = load half, half* [[TMP26]], align 2
 // CHECK-NEXT:    [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP27]], i32 6
-// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_71664]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = bitcast <4 x half>* [[__REINT_71664]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_85164]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = bitcast <4 x half>* [[__REINT_85164]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP29:%.*]] = load <4 x i16>, <4 x i16>* [[TMP28]], align 8
 // CHECK-NEXT:    [[VGET_LANE68:%.*]] = extractelement <4 x i16> [[TMP29]], i32 2
-// CHECK-NEXT:    store i16 [[VGET_LANE68]], i16* [[__REINT1_71665]], align 2
-// CHECK-NEXT:    [[TMP30:%.*]] = bitcast i16* [[__REINT1_71665]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE68]], i16* [[__REINT1_85165]], align 2
+// CHECK-NEXT:    [[TMP30:%.*]] = bitcast i16* [[__REINT1_85165]] to half*
 // CHECK-NEXT:    [[TMP31:%.*]] = load half, half* [[TMP30]], align 2
 // CHECK-NEXT:    [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP31]], i32 7
 // CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
-// CHECK-NEXT:    [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #3
+// CHECK-NEXT:    [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #[[ATTR3]]
 // CHECK-NEXT:    ret <4 x float> [[VFMLSL_LOW3_I]]
 //
 float32x4_t test_vfmlslq_lane_low_f16(float32x4_t a, float16x8_t b, float16x4_t c) {
@@ -882,90 +882,90 @@ float32x4_t test_vfmlslq_lane_low_f16(float32x4_t a, float16x8_t b, float16x4_t
 
 // CHECK-LABEL: @test_vfmlslq_lane_high_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__REINT_716:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_716:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_7164:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_7165:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71614:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_71615:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71624:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_71625:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71634:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_71635:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71644:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_71645:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71654:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_71655:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71664:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[__REINT1_71665:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    store <4 x half> [[C:%.*]], <4 x half>* [[__REINT_716]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_716]] to <4 x i16>*
+// CHECK-NEXT:    [[__REINT_851:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_851:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_8514:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_8515:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85114:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_85115:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85124:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_85125:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85134:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_85135:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85144:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_85145:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85154:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_85155:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85164:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_85165:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    store <4 x half> [[C:%.*]], <4 x half>* [[__REINT_851]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_851]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8
 // CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
-// CHECK-NEXT:    store i16 [[VGET_LANE]], i16* [[__REINT1_716]], align 2
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[__REINT1_716]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE]], i16* [[__REINT1_851]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[__REINT1_851]] to half*
 // CHECK-NEXT:    [[TMP3:%.*]] = load half, half* [[TMP2]], align 2
 // CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP3]], i32 0
-// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_7164]], align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x half>* [[__REINT_7164]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_8514]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x half>* [[__REINT_8514]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[TMP4]], align 8
 // CHECK-NEXT:    [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
-// CHECK-NEXT:    store i16 [[VGET_LANE8]], i16* [[__REINT1_7165]], align 2
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[__REINT1_7165]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE8]], i16* [[__REINT1_8515]], align 2
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[__REINT1_8515]] to half*
 // CHECK-NEXT:    [[TMP7:%.*]] = load half, half* [[TMP6]], align 2
 // CHECK-NEXT:    [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP7]], i32 1
-// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_71614]], align 8
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x half>* [[__REINT_71614]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_85114]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x half>* [[__REINT_85114]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[TMP8]], align 8
 // CHECK-NEXT:    [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP9]], i32 3
-// CHECK-NEXT:    store i16 [[VGET_LANE18]], i16* [[__REINT1_71615]], align 2
-// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[__REINT1_71615]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE18]], i16* [[__REINT1_85115]], align 2
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[__REINT1_85115]] to half*
 // CHECK-NEXT:    [[TMP11:%.*]] = load half, half* [[TMP10]], align 2
 // CHECK-NEXT:    [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP11]], i32 2
-// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_71624]], align 8
-// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x half>* [[__REINT_71624]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_85124]], align 8
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x half>* [[__REINT_85124]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i16>, <4 x i16>* [[TMP12]], align 8
 // CHECK-NEXT:    [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP13]], i32 3
-// CHECK-NEXT:    store i16 [[VGET_LANE28]], i16* [[__REINT1_71625]], align 2
-// CHECK-NEXT:    [[TMP14:%.*]] = bitcast i16* [[__REINT1_71625]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE28]], i16* [[__REINT1_85125]], align 2
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast i16* [[__REINT1_85125]] to half*
 // CHECK-NEXT:    [[TMP15:%.*]] = load half, half* [[TMP14]], align 2
 // CHECK-NEXT:    [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP15]], i32 3
-// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_71634]], align 8
-// CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x half>* [[__REINT_71634]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_85134]], align 8
+// CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x half>* [[__REINT_85134]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP17:%.*]] = load <4 x i16>, <4 x i16>* [[TMP16]], align 8
 // CHECK-NEXT:    [[VGET_LANE38:%.*]] = extractelement <4 x i16> [[TMP17]], i32 3
-// CHECK-NEXT:    store i16 [[VGET_LANE38]], i16* [[__REINT1_71635]], align 2
-// CHECK-NEXT:    [[TMP18:%.*]] = bitcast i16* [[__REINT1_71635]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE38]], i16* [[__REINT1_85135]], align 2
+// CHECK-NEXT:    [[TMP18:%.*]] = bitcast i16* [[__REINT1_85135]] to half*
 // CHECK-NEXT:    [[TMP19:%.*]] = load half, half* [[TMP18]], align 2
 // CHECK-NEXT:    [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP19]], i32 4
-// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_71644]], align 8
-// CHECK-NEXT:    [[TMP20:%.*]] = bitcast <4 x half>* [[__REINT_71644]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_85144]], align 8
+// CHECK-NEXT:    [[TMP20:%.*]] = bitcast <4 x half>* [[__REINT_85144]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i16>, <4 x i16>* [[TMP20]], align 8
 // CHECK-NEXT:    [[VGET_LANE48:%.*]] = extractelement <4 x i16> [[TMP21]], i32 3
-// CHECK-NEXT:    store i16 [[VGET_LANE48]], i16* [[__REINT1_71645]], align 2
-// CHECK-NEXT:    [[TMP22:%.*]] = bitcast i16* [[__REINT1_71645]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE48]], i16* [[__REINT1_85145]], align 2
+// CHECK-NEXT:    [[TMP22:%.*]] = bitcast i16* [[__REINT1_85145]] to half*
 // CHECK-NEXT:    [[TMP23:%.*]] = load half, half* [[TMP22]], align 2
 // CHECK-NEXT:    [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP23]], i32 5
-// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_71654]], align 8
-// CHECK-NEXT:    [[TMP24:%.*]] = bitcast <4 x half>* [[__REINT_71654]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_85154]], align 8
+// CHECK-NEXT:    [[TMP24:%.*]] = bitcast <4 x half>* [[__REINT_85154]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP25:%.*]] = load <4 x i16>, <4 x i16>* [[TMP24]], align 8
 // CHECK-NEXT:    [[VGET_LANE58:%.*]] = extractelement <4 x i16> [[TMP25]], i32 3
-// CHECK-NEXT:    store i16 [[VGET_LANE58]], i16* [[__REINT1_71655]], align 2
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast i16* [[__REINT1_71655]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE58]], i16* [[__REINT1_85155]], align 2
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast i16* [[__REINT1_85155]] to half*
 // CHECK-NEXT:    [[TMP27:%.*]] = load half, half* [[TMP26]], align 2
 // CHECK-NEXT:    [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP27]], i32 6
-// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_71664]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = bitcast <4 x half>* [[__REINT_71664]] to <4 x i16>*
+// CHECK-NEXT:    store <4 x half> [[C]], <4 x half>* [[__REINT_85164]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = bitcast <4 x half>* [[__REINT_85164]] to <4 x i16>*
 // CHECK-NEXT:    [[TMP29:%.*]] = load <4 x i16>, <4 x i16>* [[TMP28]], align 8
 // CHECK-NEXT:    [[VGET_LANE68:%.*]] = extractelement <4 x i16> [[TMP29]], i32 3
-// CHECK-NEXT:    store i16 [[VGET_LANE68]], i16* [[__REINT1_71665]], align 2
-// CHECK-NEXT:    [[TMP30:%.*]] = bitcast i16* [[__REINT1_71665]] to half*
+// CHECK-NEXT:    store i16 [[VGET_LANE68]], i16* [[__REINT1_85165]], align 2
+// CHECK-NEXT:    [[TMP30:%.*]] = bitcast i16* [[__REINT1_85165]] to half*
 // CHECK-NEXT:    [[TMP31:%.*]] = load half, half* [[TMP30]], align 2
 // CHECK-NEXT:    [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP31]], i32 7
 // CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
-// CHECK-NEXT:    [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #3
+// CHECK-NEXT:    [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #[[ATTR3]]
 // CHECK-NEXT:    ret <4 x float> [[VFMLSL_HIGH3_I]]
 //
 float32x4_t test_vfmlslq_lane_high_f16(float32x4_t a, float16x8_t b, float16x4_t c) {
@@ -974,50 +974,50 @@ float32x4_t test_vfmlslq_lane_high_f16(float32x4_t a, float16x8_t b, float16x4_t
 
 // CHECK-LABEL: @test_vfmlsl_laneq_low_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__REINT_719:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_719:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_7194:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_7195:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71914:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_71915:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71924:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_71925:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    store <8 x half> [[C:%.*]], <8 x half>* [[__REINT_719]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_719]] to <8 x i16>*
+// CHECK-NEXT:    [[__REINT_854:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_854:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_8544:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_8545:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85414:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_85415:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85424:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_85425:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    store <8 x half> [[C:%.*]], <8 x half>* [[__REINT_854]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_854]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4
-// CHECK-NEXT:    store i16 [[VGETQ_LANE]], i16* [[__REINT1_719]], align 2
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[__REINT1_719]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE]], i16* [[__REINT1_854]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[__REINT1_854]] to half*
 // CHECK-NEXT:    [[TMP3:%.*]] = load half, half* [[TMP2]], align 2
 // CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP3]], i32 0
-// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_7194]], align 16
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x half>* [[__REINT_7194]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_8544]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x half>* [[__REINT_8544]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[TMP4]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP5]], i32 4
-// CHECK-NEXT:    store i16 [[VGETQ_LANE8]], i16* [[__REINT1_7195]], align 2
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[__REINT1_7195]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE8]], i16* [[__REINT1_8545]], align 2
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[__REINT1_8545]] to half*
 // CHECK-NEXT:    [[TMP7:%.*]] = load half, half* [[TMP6]], align 2
 // CHECK-NEXT:    [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP7]], i32 1
-// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_71914]], align 16
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x half>* [[__REINT_71914]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_85414]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x half>* [[__REINT_85414]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[TMP8]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP9]], i32 4
-// CHECK-NEXT:    store i16 [[VGETQ_LANE18]], i16* [[__REINT1_71915]], align 2
-// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[__REINT1_71915]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE18]], i16* [[__REINT1_85415]], align 2
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[__REINT1_85415]] to half*
 // CHECK-NEXT:    [[TMP11:%.*]] = load half, half* [[TMP10]], align 2
 // CHECK-NEXT:    [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP11]], i32 2
-// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_71924]], align 16
-// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x half>* [[__REINT_71924]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_85424]], align 16
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x half>* [[__REINT_85424]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i16>, <8 x i16>* [[TMP12]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP13]], i32 4
-// CHECK-NEXT:    store i16 [[VGETQ_LANE28]], i16* [[__REINT1_71925]], align 2
-// CHECK-NEXT:    [[TMP14:%.*]] = bitcast i16* [[__REINT1_71925]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE28]], i16* [[__REINT1_85425]], align 2
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast i16* [[__REINT1_85425]] to half*
 // CHECK-NEXT:    [[TMP15:%.*]] = load half, half* [[TMP14]], align 2
 // CHECK-NEXT:    [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP15]], i32 3
 // CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
-// CHECK-NEXT:    [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #3
+// CHECK-NEXT:    [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #[[ATTR3]]
 // CHECK-NEXT:    ret <2 x float> [[VFMLSL_LOW3_I]]
 //
 float32x2_t test_vfmlsl_laneq_low_f16(float32x2_t a, float16x4_t b, float16x8_t c) {
@@ -1026,50 +1026,50 @@ float32x2_t test_vfmlsl_laneq_low_f16(float32x2_t a, float16x4_t b, float16x8_t
 
 // CHECK-LABEL: @test_vfmlsl_laneq_high_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__REINT_719:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_719:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_7194:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_7195:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71914:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_71915:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71924:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_71925:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    store <8 x half> [[C:%.*]], <8 x half>* [[__REINT_719]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_719]] to <8 x i16>*
+// CHECK-NEXT:    [[__REINT_854:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_854:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_8544:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_8545:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85414:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_85415:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85424:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_85425:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    store <8 x half> [[C:%.*]], <8 x half>* [[__REINT_854]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_854]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5
-// CHECK-NEXT:    store i16 [[VGETQ_LANE]], i16* [[__REINT1_719]], align 2
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[__REINT1_719]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE]], i16* [[__REINT1_854]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[__REINT1_854]] to half*
 // CHECK-NEXT:    [[TMP3:%.*]] = load half, half* [[TMP2]], align 2
 // CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP3]], i32 0
-// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_7194]], align 16
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x half>* [[__REINT_7194]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_8544]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x half>* [[__REINT_8544]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[TMP4]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP5]], i32 5
-// CHECK-NEXT:    store i16 [[VGETQ_LANE8]], i16* [[__REINT1_7195]], align 2
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[__REINT1_7195]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE8]], i16* [[__REINT1_8545]], align 2
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[__REINT1_8545]] to half*
 // CHECK-NEXT:    [[TMP7:%.*]] = load half, half* [[TMP6]], align 2
 // CHECK-NEXT:    [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP7]], i32 1
-// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_71914]], align 16
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x half>* [[__REINT_71914]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_85414]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x half>* [[__REINT_85414]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[TMP8]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP9]], i32 5
-// CHECK-NEXT:    store i16 [[VGETQ_LANE18]], i16* [[__REINT1_71915]], align 2
-// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[__REINT1_71915]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE18]], i16* [[__REINT1_85415]], align 2
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[__REINT1_85415]] to half*
 // CHECK-NEXT:    [[TMP11:%.*]] = load half, half* [[TMP10]], align 2
 // CHECK-NEXT:    [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP11]], i32 2
-// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_71924]], align 16
-// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x half>* [[__REINT_71924]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_85424]], align 16
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x half>* [[__REINT_85424]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i16>, <8 x i16>* [[TMP12]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP13]], i32 5
-// CHECK-NEXT:    store i16 [[VGETQ_LANE28]], i16* [[__REINT1_71925]], align 2
-// CHECK-NEXT:    [[TMP14:%.*]] = bitcast i16* [[__REINT1_71925]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE28]], i16* [[__REINT1_85425]], align 2
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast i16* [[__REINT1_85425]] to half*
 // CHECK-NEXT:    [[TMP15:%.*]] = load half, half* [[TMP14]], align 2
 // CHECK-NEXT:    [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP15]], i32 3
 // CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT:    [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
-// CHECK-NEXT:    [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #3
+// CHECK-NEXT:    [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #[[ATTR3]]
 // CHECK-NEXT:    ret <2 x float> [[VFMLSL_HIGH3_I]]
 //
 float32x2_t test_vfmlsl_laneq_high_f16(float32x2_t a, float16x4_t b, float16x8_t c) {
@@ -1078,90 +1078,90 @@ float32x2_t test_vfmlsl_laneq_high_f16(float32x2_t a, float16x4_t b, float16x8_t
 
 // CHECK-LABEL: @test_vfmlslq_laneq_low_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__REINT_719:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_719:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_7194:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_7195:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71914:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_71915:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71924:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_71925:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71934:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_71935:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71944:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_71945:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71954:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_71955:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71964:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_71965:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    store <8 x half> [[C:%.*]], <8 x half>* [[__REINT_719]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_719]] to <8 x i16>*
+// CHECK-NEXT:    [[__REINT_854:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_854:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_8544:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_8545:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85414:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_85415:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85424:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_85425:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85434:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_85435:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85444:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_85445:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85454:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_85455:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85464:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_85465:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    store <8 x half> [[C:%.*]], <8 x half>* [[__REINT_854]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_854]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6
-// CHECK-NEXT:    store i16 [[VGETQ_LANE]], i16* [[__REINT1_719]], align 2
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[__REINT1_719]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE]], i16* [[__REINT1_854]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[__REINT1_854]] to half*
 // CHECK-NEXT:    [[TMP3:%.*]] = load half, half* [[TMP2]], align 2
 // CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP3]], i32 0
-// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_7194]], align 16
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x half>* [[__REINT_7194]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_8544]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x half>* [[__REINT_8544]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[TMP4]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP5]], i32 6
-// CHECK-NEXT:    store i16 [[VGETQ_LANE8]], i16* [[__REINT1_7195]], align 2
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[__REINT1_7195]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE8]], i16* [[__REINT1_8545]], align 2
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[__REINT1_8545]] to half*
 // CHECK-NEXT:    [[TMP7:%.*]] = load half, half* [[TMP6]], align 2
 // CHECK-NEXT:    [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP7]], i32 1
-// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_71914]], align 16
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x half>* [[__REINT_71914]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_85414]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x half>* [[__REINT_85414]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[TMP8]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP9]], i32 6
-// CHECK-NEXT:    store i16 [[VGETQ_LANE18]], i16* [[__REINT1_71915]], align 2
-// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[__REINT1_71915]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE18]], i16* [[__REINT1_85415]], align 2
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[__REINT1_85415]] to half*
 // CHECK-NEXT:    [[TMP11:%.*]] = load half, half* [[TMP10]], align 2
 // CHECK-NEXT:    [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP11]], i32 2
-// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_71924]], align 16
-// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x half>* [[__REINT_71924]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_85424]], align 16
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x half>* [[__REINT_85424]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i16>, <8 x i16>* [[TMP12]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP13]], i32 6
-// CHECK-NEXT:    store i16 [[VGETQ_LANE28]], i16* [[__REINT1_71925]], align 2
-// CHECK-NEXT:    [[TMP14:%.*]] = bitcast i16* [[__REINT1_71925]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE28]], i16* [[__REINT1_85425]], align 2
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast i16* [[__REINT1_85425]] to half*
 // CHECK-NEXT:    [[TMP15:%.*]] = load half, half* [[TMP14]], align 2
 // CHECK-NEXT:    [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP15]], i32 3
-// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_71934]], align 16
-// CHECK-NEXT:    [[TMP16:%.*]] = bitcast <8 x half>* [[__REINT_71934]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_85434]], align 16
+// CHECK-NEXT:    [[TMP16:%.*]] = bitcast <8 x half>* [[__REINT_85434]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP17:%.*]] = load <8 x i16>, <8 x i16>* [[TMP16]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE38:%.*]] = extractelement <8 x i16> [[TMP17]], i32 6
-// CHECK-NEXT:    store i16 [[VGETQ_LANE38]], i16* [[__REINT1_71935]], align 2
-// CHECK-NEXT:    [[TMP18:%.*]] = bitcast i16* [[__REINT1_71935]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE38]], i16* [[__REINT1_85435]], align 2
+// CHECK-NEXT:    [[TMP18:%.*]] = bitcast i16* [[__REINT1_85435]] to half*
 // CHECK-NEXT:    [[TMP19:%.*]] = load half, half* [[TMP18]], align 2
 // CHECK-NEXT:    [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP19]], i32 4
-// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_71944]], align 16
-// CHECK-NEXT:    [[TMP20:%.*]] = bitcast <8 x half>* [[__REINT_71944]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_85444]], align 16
+// CHECK-NEXT:    [[TMP20:%.*]] = bitcast <8 x half>* [[__REINT_85444]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i16>, <8 x i16>* [[TMP20]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE48:%.*]] = extractelement <8 x i16> [[TMP21]], i32 6
-// CHECK-NEXT:    store i16 [[VGETQ_LANE48]], i16* [[__REINT1_71945]], align 2
-// CHECK-NEXT:    [[TMP22:%.*]] = bitcast i16* [[__REINT1_71945]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE48]], i16* [[__REINT1_85445]], align 2
+// CHECK-NEXT:    [[TMP22:%.*]] = bitcast i16* [[__REINT1_85445]] to half*
 // CHECK-NEXT:    [[TMP23:%.*]] = load half, half* [[TMP22]], align 2
 // CHECK-NEXT:    [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP23]], i32 5
-// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_71954]], align 16
-// CHECK-NEXT:    [[TMP24:%.*]] = bitcast <8 x half>* [[__REINT_71954]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_85454]], align 16
+// CHECK-NEXT:    [[TMP24:%.*]] = bitcast <8 x half>* [[__REINT_85454]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i16>, <8 x i16>* [[TMP24]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE58:%.*]] = extractelement <8 x i16> [[TMP25]], i32 6
-// CHECK-NEXT:    store i16 [[VGETQ_LANE58]], i16* [[__REINT1_71955]], align 2
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast i16* [[__REINT1_71955]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE58]], i16* [[__REINT1_85455]], align 2
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast i16* [[__REINT1_85455]] to half*
 // CHECK-NEXT:    [[TMP27:%.*]] = load half, half* [[TMP26]], align 2
 // CHECK-NEXT:    [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP27]], i32 6
-// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_71964]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = bitcast <8 x half>* [[__REINT_71964]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_85464]], align 16
+// CHECK-NEXT:    [[TMP28:%.*]] = bitcast <8 x half>* [[__REINT_85464]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP29:%.*]] = load <8 x i16>, <8 x i16>* [[TMP28]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE68:%.*]] = extractelement <8 x i16> [[TMP29]], i32 6
-// CHECK-NEXT:    store i16 [[VGETQ_LANE68]], i16* [[__REINT1_71965]], align 2
-// CHECK-NEXT:    [[TMP30:%.*]] = bitcast i16* [[__REINT1_71965]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE68]], i16* [[__REINT1_85465]], align 2
+// CHECK-NEXT:    [[TMP30:%.*]] = bitcast i16* [[__REINT1_85465]] to half*
 // CHECK-NEXT:    [[TMP31:%.*]] = load half, half* [[TMP30]], align 2
 // CHECK-NEXT:    [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP31]], i32 7
 // CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
-// CHECK-NEXT:    [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #3
+// CHECK-NEXT:    [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #[[ATTR3]]
 // CHECK-NEXT:    ret <4 x float> [[VFMLSL_LOW3_I]]
 //
 float32x4_t test_vfmlslq_laneq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
@@ -1170,90 +1170,90 @@ float32x4_t test_vfmlslq_laneq_low_f16(float32x4_t a, float16x8_t b, float16x8_t
 
 // CHECK-LABEL: @test_vfmlslq_laneq_high_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__REINT_719:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_719:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_7194:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_7195:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71914:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_71915:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71924:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_71925:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71934:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_71935:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71944:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_71945:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71954:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_71955:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[__REINT_71964:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[__REINT1_71965:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    store <8 x half> [[C:%.*]], <8 x half>* [[__REINT_719]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_719]] to <8 x i16>*
+// CHECK-NEXT:    [[__REINT_854:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_854:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_8544:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_8545:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85414:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_85415:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85424:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_85425:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85434:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_85435:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85444:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_85445:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85454:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_85455:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__REINT_85464:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_85465:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    store <8 x half> [[C:%.*]], <8 x half>* [[__REINT_854]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_854]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
-// CHECK-NEXT:    store i16 [[VGETQ_LANE]], i16* [[__REINT1_719]], align 2
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[__REINT1_719]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE]], i16* [[__REINT1_854]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[__REINT1_854]] to half*
 // CHECK-NEXT:    [[TMP3:%.*]] = load half, half* [[TMP2]], align 2
 // CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP3]], i32 0
-// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_7194]], align 16
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x half>* [[__REINT_7194]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_8544]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x half>* [[__REINT_8544]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[TMP4]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP5]], i32 7
-// CHECK-NEXT:    store i16 [[VGETQ_LANE8]], i16* [[__REINT1_7195]], align 2
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[__REINT1_7195]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE8]], i16* [[__REINT1_8545]], align 2
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[__REINT1_8545]] to half*
 // CHECK-NEXT:    [[TMP7:%.*]] = load half, half* [[TMP6]], align 2
 // CHECK-NEXT:    [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP7]], i32 1
-// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_71914]], align 16
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x half>* [[__REINT_71914]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_85414]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x half>* [[__REINT_85414]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[TMP8]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP9]], i32 7
-// CHECK-NEXT:    store i16 [[VGETQ_LANE18]], i16* [[__REINT1_71915]], align 2
-// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[__REINT1_71915]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE18]], i16* [[__REINT1_85415]], align 2
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[__REINT1_85415]] to half*
 // CHECK-NEXT:    [[TMP11:%.*]] = load half, half* [[TMP10]], align 2
 // CHECK-NEXT:    [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP11]], i32 2
-// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_71924]], align 16
-// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x half>* [[__REINT_71924]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_85424]], align 16
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x half>* [[__REINT_85424]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i16>, <8 x i16>* [[TMP12]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP13]], i32 7
-// CHECK-NEXT:    store i16 [[VGETQ_LANE28]], i16* [[__REINT1_71925]], align 2
-// CHECK-NEXT:    [[TMP14:%.*]] = bitcast i16* [[__REINT1_71925]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE28]], i16* [[__REINT1_85425]], align 2
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast i16* [[__REINT1_85425]] to half*
 // CHECK-NEXT:    [[TMP15:%.*]] = load half, half* [[TMP14]], align 2
 // CHECK-NEXT:    [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP15]], i32 3
-// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_71934]], align 16
-// CHECK-NEXT:    [[TMP16:%.*]] = bitcast <8 x half>* [[__REINT_71934]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_85434]], align 16
+// CHECK-NEXT:    [[TMP16:%.*]] = bitcast <8 x half>* [[__REINT_85434]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP17:%.*]] = load <8 x i16>, <8 x i16>* [[TMP16]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE38:%.*]] = extractelement <8 x i16> [[TMP17]], i32 7
-// CHECK-NEXT:    store i16 [[VGETQ_LANE38]], i16* [[__REINT1_71935]], align 2
-// CHECK-NEXT:    [[TMP18:%.*]] = bitcast i16* [[__REINT1_71935]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE38]], i16* [[__REINT1_85435]], align 2
+// CHECK-NEXT:    [[TMP18:%.*]] = bitcast i16* [[__REINT1_85435]] to half*
 // CHECK-NEXT:    [[TMP19:%.*]] = load half, half* [[TMP18]], align 2
 // CHECK-NEXT:    [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP19]], i32 4
-// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_71944]], align 16
-// CHECK-NEXT:    [[TMP20:%.*]] = bitcast <8 x half>* [[__REINT_71944]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_85444]], align 16
+// CHECK-NEXT:    [[TMP20:%.*]] = bitcast <8 x half>* [[__REINT_85444]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i16>, <8 x i16>* [[TMP20]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE48:%.*]] = extractelement <8 x i16> [[TMP21]], i32 7
-// CHECK-NEXT:    store i16 [[VGETQ_LANE48]], i16* [[__REINT1_71945]], align 2
-// CHECK-NEXT:    [[TMP22:%.*]] = bitcast i16* [[__REINT1_71945]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE48]], i16* [[__REINT1_85445]], align 2
+// CHECK-NEXT:    [[TMP22:%.*]] = bitcast i16* [[__REINT1_85445]] to half*
 // CHECK-NEXT:    [[TMP23:%.*]] = load half, half* [[TMP22]], align 2
 // CHECK-NEXT:    [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP23]], i32 5
-// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_71954]], align 16
-// CHECK-NEXT:    [[TMP24:%.*]] = bitcast <8 x half>* [[__REINT_71954]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_85454]], align 16
+// CHECK-NEXT:    [[TMP24:%.*]] = bitcast <8 x half>* [[__REINT_85454]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i16>, <8 x i16>* [[TMP24]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE58:%.*]] = extractelement <8 x i16> [[TMP25]], i32 7
-// CHECK-NEXT:    store i16 [[VGETQ_LANE58]], i16* [[__REINT1_71955]], align 2
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast i16* [[__REINT1_71955]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE58]], i16* [[__REINT1_85455]], align 2
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast i16* [[__REINT1_85455]] to half*
 // CHECK-NEXT:    [[TMP27:%.*]] = load half, half* [[TMP26]], align 2
 // CHECK-NEXT:    [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP27]], i32 6
-// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_71964]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = bitcast <8 x half>* [[__REINT_71964]] to <8 x i16>*
+// CHECK-NEXT:    store <8 x half> [[C]], <8 x half>* [[__REINT_85464]], align 16
+// CHECK-NEXT:    [[TMP28:%.*]] = bitcast <8 x half>* [[__REINT_85464]] to <8 x i16>*
 // CHECK-NEXT:    [[TMP29:%.*]] = load <8 x i16>, <8 x i16>* [[TMP28]], align 16
 // CHECK-NEXT:    [[VGETQ_LANE68:%.*]] = extractelement <8 x i16> [[TMP29]], i32 7
-// CHECK-NEXT:    store i16 [[VGETQ_LANE68]], i16* [[__REINT1_71965]], align 2
-// CHECK-NEXT:    [[TMP30:%.*]] = bitcast i16* [[__REINT1_71965]] to half*
+// CHECK-NEXT:    store i16 [[VGETQ_LANE68]], i16* [[__REINT1_85465]], align 2
+// CHECK-NEXT:    [[TMP30:%.*]] = bitcast i16* [[__REINT1_85465]] to half*
 // CHECK-NEXT:    [[TMP31:%.*]] = load half, half* [[TMP30]], align 2
 // CHECK-NEXT:    [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP31]], i32 7
 // CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
 // CHECK-NEXT:    [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
-// CHECK-NEXT:    [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #3
+// CHECK-NEXT:    [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #[[ATTR3]]
 // CHECK-NEXT:    ret <4 x float> [[VFMLSL_HIGH3_I]]
 //
 float32x4_t test_vfmlslq_laneq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) {

diff --git a/clang/test/CodeGen/arm_acle.c b/clang/test/CodeGen/arm_acle.c
index 7fbaf36aec3d..9a731742e946 100644
--- a/clang/test/CodeGen/arm_acle.c
+++ b/clang/test/CodeGen/arm_acle.c
@@ -56,12 +56,12 @@ void test_isb(void) {
 /* 8.4 Hints */
 // AArch32-LABEL: @test_yield(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    call void @llvm.arm.hint(i32 1) [[ATTR1:#.*]]
+// AArch32-NEXT:    call void @llvm.arm.hint(i32 1) #[[ATTR1:[0-9]+]]
 // AArch32-NEXT:    ret void
 //
 // AArch64-LABEL: @test_yield(
 // AArch64-NEXT:  entry:
-// AArch64-NEXT:    call void @llvm.aarch64.hint(i32 1) [[ATTR3:#.*]]
+// AArch64-NEXT:    call void @llvm.aarch64.hint(i32 1) #[[ATTR3:[0-9]+]]
 // AArch64-NEXT:    ret void
 //
 void test_yield(void) {
@@ -70,12 +70,12 @@ void test_yield(void) {
 
 // AArch32-LABEL: @test_wfe(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    call void @llvm.arm.hint(i32 2) [[ATTR1]]
+// AArch32-NEXT:    call void @llvm.arm.hint(i32 2) #[[ATTR1]]
 // AArch32-NEXT:    ret void
 //
 // AArch64-LABEL: @test_wfe(
 // AArch64-NEXT:  entry:
-// AArch64-NEXT:    call void @llvm.aarch64.hint(i32 2) [[ATTR3]]
+// AArch64-NEXT:    call void @llvm.aarch64.hint(i32 2) #[[ATTR3]]
 // AArch64-NEXT:    ret void
 //
 void test_wfe(void) {
@@ -84,12 +84,12 @@ void test_wfe(void) {
 
 // AArch32-LABEL: @test_wfi(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    call void @llvm.arm.hint(i32 3) [[ATTR1]]
+// AArch32-NEXT:    call void @llvm.arm.hint(i32 3) #[[ATTR1]]
 // AArch32-NEXT:    ret void
 //
 // AArch64-LABEL: @test_wfi(
 // AArch64-NEXT:  entry:
-// AArch64-NEXT:    call void @llvm.aarch64.hint(i32 3) [[ATTR3]]
+// AArch64-NEXT:    call void @llvm.aarch64.hint(i32 3) #[[ATTR3]]
 // AArch64-NEXT:    ret void
 //
 void test_wfi(void) {
@@ -98,12 +98,12 @@ void test_wfi(void) {
 
 // AArch32-LABEL: @test_sev(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    call void @llvm.arm.hint(i32 4) [[ATTR1]]
+// AArch32-NEXT:    call void @llvm.arm.hint(i32 4) #[[ATTR1]]
 // AArch32-NEXT:    ret void
 //
 // AArch64-LABEL: @test_sev(
 // AArch64-NEXT:  entry:
-// AArch64-NEXT:    call void @llvm.aarch64.hint(i32 4) [[ATTR3]]
+// AArch64-NEXT:    call void @llvm.aarch64.hint(i32 4) #[[ATTR3]]
 // AArch64-NEXT:    ret void
 //
 void test_sev(void) {
@@ -112,12 +112,12 @@ void test_sev(void) {
 
 // AArch32-LABEL: @test_sevl(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    call void @llvm.arm.hint(i32 5) [[ATTR1]]
+// AArch32-NEXT:    call void @llvm.arm.hint(i32 5) #[[ATTR1]]
 // AArch32-NEXT:    ret void
 //
 // AArch64-LABEL: @test_sevl(
 // AArch64-NEXT:  entry:
-// AArch64-NEXT:    call void @llvm.aarch64.hint(i32 5) [[ATTR3]]
+// AArch64-NEXT:    call void @llvm.aarch64.hint(i32 5) #[[ATTR3]]
 // AArch64-NEXT:    ret void
 //
 void test_sevl(void) {
@@ -141,10 +141,10 @@ void test_dbg(void) {
 // AArch32-NEXT:    [[TMP0:%.*]] = bitcast i8* [[P:%.*]] to i32*
 // AArch32-NEXT:    br label [[DO_BODY_I:%.*]]
 // AArch32:       do.body.i:
-// AArch32-NEXT:    [[LDREX_I:%.*]] = call i32 @llvm.arm.ldrex.p0i32(i32* elementtype(i32) [[TMP0]]) [[ATTR1]]
-// AArch32-NEXT:    [[STREX_I:%.*]] = call i32 @llvm.arm.strex.p0i32(i32 [[X:%.*]], i32* elementtype(i32) [[TMP0]]) [[ATTR1]]
+// AArch32-NEXT:    [[LDREX_I:%.*]] = call i32 @llvm.arm.ldrex.p0i32(i32* elementtype(i32) [[TMP0]]) #[[ATTR1]]
+// AArch32-NEXT:    [[STREX_I:%.*]] = call i32 @llvm.arm.strex.p0i32(i32 [[X:%.*]], i32* elementtype(i32) [[TMP0]]) #[[ATTR1]]
 // AArch32-NEXT:    [[TOBOOL_I:%.*]] = icmp ne i32 [[STREX_I]], 0
-// AArch32-NEXT:    br i1 [[TOBOOL_I]], label [[DO_BODY_I]], label [[__SWP_EXIT:%.*]], [[LOOP3:!llvm.loop !.*]]
+// AArch32-NEXT:    br i1 [[TOBOOL_I]], label [[DO_BODY_I]], label [[__SWP_EXIT:%.*]], !llvm.loop [[LOOP7:![0-9]+]]
 // AArch32:       __swp.exit:
 // AArch32-NEXT:    ret void
 //
@@ -153,12 +153,12 @@ void test_dbg(void) {
 // AArch64-NEXT:    [[TMP0:%.*]] = bitcast i8* [[P:%.*]] to i32*
 // AArch64-NEXT:    br label [[DO_BODY_I:%.*]]
 // AArch64:       do.body.i:
-// AArch64-NEXT:    [[LDXR_I:%.*]] = call i64 @llvm.aarch64.ldxr.p0i32(i32* elementtype(i32) [[TMP0]]) [[ATTR3]]
+// AArch64-NEXT:    [[LDXR_I:%.*]] = call i64 @llvm.aarch64.ldxr.p0i32(i32* elementtype(i32) [[TMP0]]) #[[ATTR3]]
 // AArch64-NEXT:    [[TMP1:%.*]] = trunc i64 [[LDXR_I]] to i32
 // AArch64-NEXT:    [[TMP2:%.*]] = zext i32 [[X:%.*]] to i64
-// AArch64-NEXT:    [[STXR_I:%.*]] = call i32 @llvm.aarch64.stxr.p0i32(i64 [[TMP2]], i32* elementtype(i32) [[TMP0]]) [[ATTR3]]
+// AArch64-NEXT:    [[STXR_I:%.*]] = call i32 @llvm.aarch64.stxr.p0i32(i64 [[TMP2]], i32* elementtype(i32) [[TMP0]]) #[[ATTR3]]
 // AArch64-NEXT:    [[TOBOOL_I:%.*]] = icmp ne i32 [[STXR_I]], 0
-// AArch64-NEXT:    br i1 [[TOBOOL_I]], label [[DO_BODY_I]], label [[__SWP_EXIT:%.*]], [[LOOP6:!llvm.loop !.*]]
+// AArch64-NEXT:    br i1 [[TOBOOL_I]], label [[DO_BODY_I]], label [[__SWP_EXIT:%.*]], !llvm.loop [[LOOP6:![0-9]+]]
 // AArch64:       __swp.exit:
 // AArch64-NEXT:    ret void
 //
@@ -218,12 +218,12 @@ void test_plix() {
 /* 8.7 NOP */
 // AArch32-LABEL: @test_nop(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    call void @llvm.arm.hint(i32 0) [[ATTR1]]
+// AArch32-NEXT:    call void @llvm.arm.hint(i32 0) #[[ATTR1]]
 // AArch32-NEXT:    ret void
 //
 // AArch64-LABEL: @test_nop(
 // AArch64-NEXT:  entry:
-// AArch64-NEXT:    call void @llvm.aarch64.hint(i32 0) [[ATTR3]]
+// AArch64-NEXT:    call void @llvm.aarch64.hint(i32 0) #[[ATTR3]]
 // AArch64-NEXT:    ret void
 //
 void test_nop(void) {
@@ -319,12 +319,12 @@ uint64_t test_rorll(uint64_t x, uint32_t y) {
 
 // AArch32-LABEL: @test_clz(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[T:%.*]], i1 false) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[T:%.*]], i1 false) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 // AArch64-LABEL: @test_clz(
 // AArch64-NEXT:  entry:
-// AArch64-NEXT:    [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[T:%.*]], i1 false) [[ATTR3]]
+// AArch64-NEXT:    [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[T:%.*]], i1 false) #[[ATTR3]]
 // AArch64-NEXT:    ret i32 [[TMP0]]
 //
 uint32_t test_clz(uint32_t t) {
@@ -333,12 +333,12 @@ uint32_t test_clz(uint32_t t) {
 
 // AArch32-LABEL: @test_clzl(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[T:%.*]], i1 false) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[T:%.*]], i1 false) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 // AArch64-LABEL: @test_clzl(
 // AArch64-NEXT:  entry:
-// AArch64-NEXT:    [[TMP0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[T:%.*]], i1 false) [[ATTR3]]
+// AArch64-NEXT:    [[TMP0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[T:%.*]], i1 false) #[[ATTR3]]
 // AArch64-NEXT:    [[CAST_I:%.*]] = trunc i64 [[TMP0]] to i32
 // AArch64-NEXT:    [[CONV_I:%.*]] = sext i32 [[CAST_I]] to i64
 // AArch64-NEXT:    ret i64 [[CONV_I]]
@@ -349,14 +349,14 @@ long test_clzl(long t) {
 
 // AArch32-LABEL: @test_clzll(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[T:%.*]], i1 false) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[T:%.*]], i1 false) #[[ATTR1]]
 // AArch32-NEXT:    [[CAST_I:%.*]] = trunc i64 [[TMP0]] to i32
 // AArch32-NEXT:    [[CONV_I:%.*]] = sext i32 [[CAST_I]] to i64
 // AArch32-NEXT:    ret i64 [[CONV_I]]
 //
 // AArch64-LABEL: @test_clzll(
 // AArch64-NEXT:  entry:
-// AArch64-NEXT:    [[TMP0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[T:%.*]], i1 false) [[ATTR3]]
+// AArch64-NEXT:    [[TMP0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[T:%.*]], i1 false) #[[ATTR3]]
 // AArch64-NEXT:    [[CAST_I:%.*]] = trunc i64 [[TMP0]] to i32
 // AArch64-NEXT:    [[CONV_I:%.*]] = sext i32 [[CAST_I]] to i64
 // AArch64-NEXT:    ret i64 [[CONV_I]]
@@ -367,12 +367,12 @@ uint64_t test_clzll(uint64_t t) {
 
 // AArch32-LABEL: @test_cls(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[CLS_I:%.*]] = call i32 @llvm.arm.cls(i32 [[T:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[CLS_I:%.*]] = call i32 @llvm.arm.cls(i32 [[T:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[CLS_I]]
 //
 // AArch64-LABEL: @test_cls(
 // AArch64-NEXT:  entry:
-// AArch64-NEXT:    [[CLS_I:%.*]] = call i32 @llvm.aarch64.cls(i32 [[T:%.*]]) [[ATTR3]]
+// AArch64-NEXT:    [[CLS_I:%.*]] = call i32 @llvm.aarch64.cls(i32 [[T:%.*]]) #[[ATTR3]]
 // AArch64-NEXT:    ret i32 [[CLS_I]]
 //
 unsigned test_cls(uint32_t t) {
@@ -381,12 +381,12 @@ unsigned test_cls(uint32_t t) {
 
 // AArch32-LABEL: @test_clsl(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[CLS_I:%.*]] = call i32 @llvm.arm.cls(i32 [[T:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[CLS_I:%.*]] = call i32 @llvm.arm.cls(i32 [[T:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[CLS_I]]
 //
 // AArch64-LABEL: @test_clsl(
 // AArch64-NEXT:  entry:
-// AArch64-NEXT:    [[CLS_I:%.*]] = call i32 @llvm.aarch64.cls64(i64 [[T:%.*]]) [[ATTR3]]
+// AArch64-NEXT:    [[CLS_I:%.*]] = call i32 @llvm.aarch64.cls64(i64 [[T:%.*]]) #[[ATTR3]]
 // AArch64-NEXT:    ret i32 [[CLS_I]]
 //
 unsigned test_clsl(unsigned long t) {
@@ -395,12 +395,12 @@ unsigned test_clsl(unsigned long t) {
 
 // AArch32-LABEL: @test_clsll(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[CLS_I:%.*]] = call i32 @llvm.arm.cls64(i64 [[T:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[CLS_I:%.*]] = call i32 @llvm.arm.cls64(i64 [[T:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[CLS_I]]
 //
 // AArch64-LABEL: @test_clsll(
 // AArch64-NEXT:  entry:
-// AArch64-NEXT:    [[CLS_I:%.*]] = call i32 @llvm.aarch64.cls64(i64 [[T:%.*]]) [[ATTR3]]
+// AArch64-NEXT:    [[CLS_I:%.*]] = call i32 @llvm.aarch64.cls64(i64 [[T:%.*]]) #[[ATTR3]]
 // AArch64-NEXT:    ret i32 [[CLS_I]]
 //
 unsigned test_clsll(uint64_t t) {
@@ -409,12 +409,12 @@ unsigned test_clsll(uint64_t t) {
 
 // AArch32-LABEL: @test_rev(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[T:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[T:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 // AArch64-LABEL: @test_rev(
 // AArch64-NEXT:  entry:
-// AArch64-NEXT:    [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[T:%.*]]) [[ATTR3]]
+// AArch64-NEXT:    [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[T:%.*]]) #[[ATTR3]]
 // AArch64-NEXT:    ret i32 [[TMP0]]
 //
 uint32_t test_rev(uint32_t t) {
@@ -423,12 +423,12 @@ uint32_t test_rev(uint32_t t) {
 
 // AArch32-LABEL: @test_revl(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[T:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[T:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 // AArch64-LABEL: @test_revl(
 // AArch64-NEXT:  entry:
-// AArch64-NEXT:    [[TMP0:%.*]] = call i64 @llvm.bswap.i64(i64 [[T:%.*]]) [[ATTR3]]
+// AArch64-NEXT:    [[TMP0:%.*]] = call i64 @llvm.bswap.i64(i64 [[T:%.*]]) #[[ATTR3]]
 // AArch64-NEXT:    ret i64 [[TMP0]]
 //
 long test_revl(long t) {
@@ -437,12 +437,12 @@ long test_revl(long t) {
 
 // AArch32-LABEL: @test_revll(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i64 @llvm.bswap.i64(i64 [[T:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i64 @llvm.bswap.i64(i64 [[T:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i64 [[TMP0]]
 //
 // AArch64-LABEL: @test_revll(
 // AArch64-NEXT:  entry:
-// AArch64-NEXT:    [[TMP0:%.*]] = call i64 @llvm.bswap.i64(i64 [[T:%.*]]) [[ATTR3]]
+// AArch64-NEXT:    [[TMP0:%.*]] = call i64 @llvm.bswap.i64(i64 [[T:%.*]]) #[[ATTR3]]
 // AArch64-NEXT:    ret i64 [[TMP0]]
 //
 uint64_t test_revll(uint64_t t) {
@@ -451,7 +451,7 @@ uint64_t test_revll(uint64_t t) {
 
 // AArch32-LABEL: @test_rev16(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[T:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[T:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    [[REM_I_I:%.*]] = urem i32 16, 32
 // AArch32-NEXT:    [[CMP_I_I:%.*]] = icmp eq i32 [[REM_I_I]], 0
 // AArch32-NEXT:    br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_END_I_I:%.*]]
@@ -469,7 +469,7 @@ uint64_t test_revll(uint64_t t) {
 //
 // AArch64-LABEL: @test_rev16(
 // AArch64-NEXT:  entry:
-// AArch64-NEXT:    [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[T:%.*]]) [[ATTR3]]
+// AArch64-NEXT:    [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[T:%.*]]) #[[ATTR3]]
 // AArch64-NEXT:    [[REM_I_I:%.*]] = urem i32 16, 32
 // AArch64-NEXT:    [[CMP_I_I:%.*]] = icmp eq i32 [[REM_I_I]], 0
 // AArch64-NEXT:    br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_END_I_I:%.*]]
@@ -491,7 +491,7 @@ uint32_t test_rev16(uint32_t t) {
 
 // AArch32-LABEL: @test_rev16l(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[T:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[T:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    [[REM_I_I_I:%.*]] = urem i32 16, 32
 // AArch32-NEXT:    [[CMP_I_I_I:%.*]] = icmp eq i32 [[REM_I_I_I]], 0
 // AArch32-NEXT:    br i1 [[CMP_I_I_I]], label [[IF_THEN_I_I_I:%.*]], label [[IF_END_I_I_I:%.*]]
@@ -511,7 +511,7 @@ uint32_t test_rev16(uint32_t t) {
 // AArch64-NEXT:  entry:
 // AArch64-NEXT:    [[SHR_I:%.*]] = lshr i64 [[T:%.*]], 32
 // AArch64-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SHR_I]] to i32
-// AArch64-NEXT:    [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[CONV_I]]) [[ATTR3]]
+// AArch64-NEXT:    [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[CONV_I]]) #[[ATTR3]]
 // AArch64-NEXT:    [[REM_I_I10_I:%.*]] = urem i32 16, 32
 // AArch64-NEXT:    [[CMP_I_I11_I:%.*]] = icmp eq i32 [[REM_I_I10_I]], 0
 // AArch64-NEXT:    br i1 [[CMP_I_I11_I]], label [[IF_THEN_I_I12_I:%.*]], label [[IF_END_I_I17_I:%.*]]
@@ -528,7 +528,7 @@ uint32_t test_rev16(uint32_t t) {
 // AArch64-NEXT:    [[CONV1_I:%.*]] = zext i32 [[RETVAL_I_I6_I_0]] to i64
 // AArch64-NEXT:    [[SHL_I:%.*]] = shl i64 [[CONV1_I]], 32
 // AArch64-NEXT:    [[CONV2_I:%.*]] = trunc i64 [[T]] to i32
-// AArch64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.bswap.i32(i32 [[CONV2_I]]) [[ATTR3]]
+// AArch64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.bswap.i32(i32 [[CONV2_I]]) #[[ATTR3]]
 // AArch64-NEXT:    [[REM_I_I_I:%.*]] = urem i32 16, 32
 // AArch64-NEXT:    [[CMP_I_I_I:%.*]] = icmp eq i32 [[REM_I_I_I]], 0
 // AArch64-NEXT:    br i1 [[CMP_I_I_I]], label [[IF_THEN_I_I_I:%.*]], label [[IF_END_I_I_I:%.*]]
@@ -554,7 +554,7 @@ long test_rev16l(long t) {
 // AArch32-NEXT:  entry:
 // AArch32-NEXT:    [[SHR_I:%.*]] = lshr i64 [[T:%.*]], 32
 // AArch32-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SHR_I]] to i32
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[CONV_I]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[CONV_I]]) #[[ATTR1]]
 // AArch32-NEXT:    [[REM_I_I10_I:%.*]] = urem i32 16, 32
 // AArch32-NEXT:    [[CMP_I_I11_I:%.*]] = icmp eq i32 [[REM_I_I10_I]], 0
 // AArch32-NEXT:    br i1 [[CMP_I_I11_I]], label [[IF_THEN_I_I12_I:%.*]], label [[IF_END_I_I17_I:%.*]]
@@ -571,7 +571,7 @@ long test_rev16l(long t) {
 // AArch32-NEXT:    [[CONV1_I:%.*]] = zext i32 [[RETVAL_I_I6_I_0]] to i64
 // AArch32-NEXT:    [[SHL_I:%.*]] = shl i64 [[CONV1_I]], 32
 // AArch32-NEXT:    [[CONV2_I:%.*]] = trunc i64 [[T]] to i32
-// AArch32-NEXT:    [[TMP1:%.*]] = call i32 @llvm.bswap.i32(i32 [[CONV2_I]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP1:%.*]] = call i32 @llvm.bswap.i32(i32 [[CONV2_I]]) #[[ATTR1]]
 // AArch32-NEXT:    [[REM_I_I_I:%.*]] = urem i32 16, 32
 // AArch32-NEXT:    [[CMP_I_I_I:%.*]] = icmp eq i32 [[REM_I_I_I]], 0
 // AArch32-NEXT:    br i1 [[CMP_I_I_I]], label [[IF_THEN_I_I_I:%.*]], label [[IF_END_I_I_I:%.*]]
@@ -593,7 +593,7 @@ long test_rev16l(long t) {
 // AArch64-NEXT:  entry:
 // AArch64-NEXT:    [[SHR_I:%.*]] = lshr i64 [[T:%.*]], 32
 // AArch64-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SHR_I]] to i32
-// AArch64-NEXT:    [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[CONV_I]]) [[ATTR3]]
+// AArch64-NEXT:    [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[CONV_I]]) #[[ATTR3]]
 // AArch64-NEXT:    [[REM_I_I10_I:%.*]] = urem i32 16, 32
 // AArch64-NEXT:    [[CMP_I_I11_I:%.*]] = icmp eq i32 [[REM_I_I10_I]], 0
 // AArch64-NEXT:    br i1 [[CMP_I_I11_I]], label [[IF_THEN_I_I12_I:%.*]], label [[IF_END_I_I17_I:%.*]]
@@ -610,7 +610,7 @@ long test_rev16l(long t) {
 // AArch64-NEXT:    [[CONV1_I:%.*]] = zext i32 [[RETVAL_I_I6_I_0]] to i64
 // AArch64-NEXT:    [[SHL_I:%.*]] = shl i64 [[CONV1_I]], 32
 // AArch64-NEXT:    [[CONV2_I:%.*]] = trunc i64 [[T]] to i32
-// AArch64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.bswap.i32(i32 [[CONV2_I]]) [[ATTR3]]
+// AArch64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.bswap.i32(i32 [[CONV2_I]]) #[[ATTR3]]
 // AArch64-NEXT:    [[REM_I_I_I:%.*]] = urem i32 16, 32
 // AArch64-NEXT:    [[CMP_I_I_I:%.*]] = icmp eq i32 [[REM_I_I_I]], 0
 // AArch64-NEXT:    br i1 [[CMP_I_I_I]], label [[IF_THEN_I_I_I:%.*]], label [[IF_END_I_I_I:%.*]]
@@ -634,12 +634,12 @@ uint64_t test_rev16ll(uint64_t t) {
 
 // AArch32-LABEL: @test_revsh(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i16 @llvm.bswap.i16(i16 [[T:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i16 @llvm.bswap.i16(i16 [[T:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i16 [[TMP0]]
 //
 // AArch64-LABEL: @test_revsh(
 // AArch64-NEXT:  entry:
-// AArch64-NEXT:    [[TMP0:%.*]] = call i16 @llvm.bswap.i16(i16 [[T:%.*]]) [[ATTR3]]
+// AArch64-NEXT:    [[TMP0:%.*]] = call i16 @llvm.bswap.i16(i16 [[T:%.*]]) #[[ATTR3]]
 // AArch64-NEXT:    ret i16 [[TMP0]]
 //
 int16_t test_revsh(int16_t t) {
@@ -648,12 +648,12 @@ int16_t test_revsh(int16_t t) {
 
 // AArch32-LABEL: @test_rbit(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[RBIT_I:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[T:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[RBIT_I:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[T:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[RBIT_I]]
 //
 // AArch64-LABEL: @test_rbit(
 // AArch64-NEXT:  entry:
-// AArch64-NEXT:    [[RBIT_I:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[T:%.*]]) [[ATTR3]]
+// AArch64-NEXT:    [[RBIT_I:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[T:%.*]]) #[[ATTR3]]
 // AArch64-NEXT:    ret i32 [[RBIT_I]]
 //
 uint32_t test_rbit(uint32_t t) {
@@ -662,12 +662,12 @@ uint32_t test_rbit(uint32_t t) {
 
 // AArch32-LABEL: @test_rbitl(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[RBIT_I_I:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[T:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[RBIT_I_I:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[T:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[RBIT_I_I]]
 //
 // AArch64-LABEL: @test_rbitl(
 // AArch64-NEXT:  entry:
-// AArch64-NEXT:    [[RBIT_I:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[T:%.*]]) [[ATTR3]]
+// AArch64-NEXT:    [[RBIT_I:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[T:%.*]]) #[[ATTR3]]
 // AArch64-NEXT:    ret i64 [[RBIT_I]]
 //
 long test_rbitl(long t) {
@@ -677,19 +677,19 @@ long test_rbitl(long t) {
 // AArch32-LABEL: @test_rbitll(
 // AArch32-NEXT:  entry:
 // AArch32-NEXT:    [[CONV_I:%.*]] = trunc i64 [[T:%.*]] to i32
-// AArch32-NEXT:    [[RBIT_I:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[CONV_I]]) [[ATTR1]]
+// AArch32-NEXT:    [[RBIT_I:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[CONV_I]]) #[[ATTR1]]
 // AArch32-NEXT:    [[CONV1_I:%.*]] = zext i32 [[RBIT_I]] to i64
 // AArch32-NEXT:    [[SHL_I:%.*]] = shl i64 [[CONV1_I]], 32
 // AArch32-NEXT:    [[SHR_I:%.*]] = lshr i64 [[T]], 32
 // AArch32-NEXT:    [[CONV2_I:%.*]] = trunc i64 [[SHR_I]] to i32
-// AArch32-NEXT:    [[RBIT3_I:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[CONV2_I]]) [[ATTR1]]
+// AArch32-NEXT:    [[RBIT3_I:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[CONV2_I]]) #[[ATTR1]]
 // AArch32-NEXT:    [[CONV4_I:%.*]] = zext i32 [[RBIT3_I]] to i64
 // AArch32-NEXT:    [[OR_I:%.*]] = or i64 [[SHL_I]], [[CONV4_I]]
 // AArch32-NEXT:    ret i64 [[OR_I]]
 //
 // AArch64-LABEL: @test_rbitll(
 // AArch64-NEXT:  entry:
-// AArch64-NEXT:    [[RBIT_I:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[T:%.*]]) [[ATTR3]]
+// AArch64-NEXT:    [[RBIT_I:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[T:%.*]]) #[[ATTR3]]
 // AArch64-NEXT:    ret i64 [[RBIT_I]]
 //
 uint64_t test_rbitll(uint64_t t) {
@@ -722,7 +722,7 @@ uint32_t test_usat(int32_t t) {
 #ifdef __ARM_FEATURE_DSP
 // AArch32-LABEL: @test_qadd(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.qadd(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.qadd(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int32_t test_qadd(int32_t a, int32_t b) {
@@ -731,7 +731,7 @@ int32_t test_qadd(int32_t a, int32_t b) {
 
 // AArch32-LABEL: @test_qsub(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.qsub(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.qsub(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int32_t test_qsub(int32_t a, int32_t b) {
@@ -741,8 +741,8 @@ int32_t test_qsub(int32_t a, int32_t b) {
 extern int32_t f();
 // AArch32-LABEL: @test_qdbl(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[CALL:%.*]] = call i32 bitcast (i32 (...)* @f to i32 ()*)() [[ATTR7:#.*]]
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.qadd(i32 [[CALL]], i32 [[CALL]]) [[ATTR1]]
+// AArch32-NEXT:    [[CALL:%.*]] = call i32 bitcast (i32 (...)* @f to i32 ()*)() #[[ATTR7:[0-9]+]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.qadd(i32 [[CALL]], i32 [[CALL]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int32_t test_qdbl() {
@@ -756,7 +756,7 @@ int32_t test_qdbl() {
 #if __ARM_FEATURE_DSP
 // AArch32-LABEL: @test_smulbb(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smulbb(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smulbb(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int32_t test_smulbb(int32_t a, int32_t b) {
@@ -765,7 +765,7 @@ int32_t test_smulbb(int32_t a, int32_t b) {
 
 // AArch32-LABEL: @test_smulbt(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smulbt(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smulbt(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int32_t test_smulbt(int32_t a, int32_t b) {
@@ -774,7 +774,7 @@ int32_t test_smulbt(int32_t a, int32_t b) {
 
 // AArch32-LABEL: @test_smultb(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smultb(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smultb(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int32_t test_smultb(int32_t a, int32_t b) {
@@ -783,7 +783,7 @@ int32_t test_smultb(int32_t a, int32_t b) {
 
 // AArch32-LABEL: @test_smultt(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smultt(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smultt(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int32_t test_smultt(int32_t a, int32_t b) {
@@ -792,7 +792,7 @@ int32_t test_smultt(int32_t a, int32_t b) {
 
 // AArch32-LABEL: @test_smulwb(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smulwb(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smulwb(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int32_t test_smulwb(int32_t a, int32_t b) {
@@ -801,7 +801,7 @@ int32_t test_smulwb(int32_t a, int32_t b) {
 
 // AArch32-LABEL: @test_smulwt(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smulwt(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smulwt(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int32_t test_smulwt(int32_t a, int32_t b) {
@@ -813,7 +813,7 @@ int32_t test_smulwt(int32_t a, int32_t b) {
 #if __ARM_FEATURE_DSP
 // AArch32-LABEL: @test_smlabb(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smlabb(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smlabb(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int32_t test_smlabb(int32_t a, int32_t b, int32_t c) {
@@ -822,7 +822,7 @@ int32_t test_smlabb(int32_t a, int32_t b, int32_t c) {
 
 // AArch32-LABEL: @test_smlabt(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smlabt(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smlabt(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int32_t test_smlabt(int32_t a, int32_t b, int32_t c) {
@@ -831,7 +831,7 @@ int32_t test_smlabt(int32_t a, int32_t b, int32_t c) {
 
 // AArch32-LABEL: @test_smlatb(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smlatb(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smlatb(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int32_t test_smlatb(int32_t a, int32_t b, int32_t c) {
@@ -840,7 +840,7 @@ int32_t test_smlatb(int32_t a, int32_t b, int32_t c) {
 
 // AArch32-LABEL: @test_smlatt(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smlatt(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smlatt(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int32_t test_smlatt(int32_t a, int32_t b, int32_t c) {
@@ -849,7 +849,7 @@ int32_t test_smlatt(int32_t a, int32_t b, int32_t c) {
 
 // AArch32-LABEL: @test_smlawb(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smlawb(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smlawb(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int32_t test_smlawb(int32_t a, int32_t b, int32_t c) {
@@ -858,7 +858,7 @@ int32_t test_smlawb(int32_t a, int32_t b, int32_t c) {
 
 // AArch32-LABEL: @test_smlawt(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smlawt(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smlawt(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int32_t test_smlawt(int32_t a, int32_t b, int32_t c) {
@@ -891,7 +891,7 @@ uint16x2_t test_usat16(int16x2_t a) {
 #if __ARM_FEATURE_SIMD32
 // AArch32-LABEL: @test_sxtab16(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.sxtab16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.sxtab16(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int16x2_t test_sxtab16(int16x2_t a, int8x4_t b) {
@@ -900,7 +900,7 @@ int16x2_t test_sxtab16(int16x2_t a, int8x4_t b) {
 
 // AArch32-LABEL: @test_sxtb16(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.sxtb16(i32 [[A:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.sxtb16(i32 [[A:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int16x2_t test_sxtb16(int8x4_t a) {
@@ -909,7 +909,7 @@ int16x2_t test_sxtb16(int8x4_t a) {
 
 // AArch32-LABEL: @test_uxtab16(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.uxtab16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.uxtab16(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int16x2_t test_uxtab16(int16x2_t a, int8x4_t b) {
@@ -918,7 +918,7 @@ int16x2_t test_uxtab16(int16x2_t a, int8x4_t b) {
 
 // AArch32-LABEL: @test_uxtb16(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.uxtb16(i32 [[A:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.uxtb16(i32 [[A:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int16x2_t test_uxtb16(int8x4_t a) {
@@ -930,7 +930,7 @@ int16x2_t test_uxtb16(int8x4_t a) {
 #if __ARM_FEATURE_SIMD32
 // AArch32-LABEL: @test_sel(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.sel(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.sel(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 uint8x4_t test_sel(uint8x4_t a, uint8x4_t b) {
@@ -942,7 +942,7 @@ uint8x4_t test_sel(uint8x4_t a, uint8x4_t b) {
 #if __ARM_FEATURE_SIMD32
 // AArch32-LABEL: @test_qadd8(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.qadd8(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.qadd8(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int16x2_t test_qadd8(int8x4_t a, int8x4_t b) {
@@ -951,7 +951,7 @@ int16x2_t test_qadd8(int8x4_t a, int8x4_t b) {
 
 // AArch32-LABEL: @test_qsub8(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.qsub8(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.qsub8(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int8x4_t test_qsub8(int8x4_t a, int8x4_t b) {
@@ -960,7 +960,7 @@ int8x4_t test_qsub8(int8x4_t a, int8x4_t b) {
 
 // AArch32-LABEL: @test_sadd8(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.sadd8(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.sadd8(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int8x4_t test_sadd8(int8x4_t a, int8x4_t b) {
@@ -969,7 +969,7 @@ int8x4_t test_sadd8(int8x4_t a, int8x4_t b) {
 
 // AArch32-LABEL: @test_shadd8(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.shadd8(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.shadd8(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int8x4_t test_shadd8(int8x4_t a, int8x4_t b) {
@@ -978,7 +978,7 @@ int8x4_t test_shadd8(int8x4_t a, int8x4_t b) {
 
 // AArch32-LABEL: @test_shsub8(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.shsub8(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.shsub8(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int8x4_t test_shsub8(int8x4_t a, int8x4_t b) {
@@ -987,7 +987,7 @@ int8x4_t test_shsub8(int8x4_t a, int8x4_t b) {
 
 // AArch32-LABEL: @test_ssub8(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.ssub8(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.ssub8(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int8x4_t test_ssub8(int8x4_t a, int8x4_t b) {
@@ -996,7 +996,7 @@ int8x4_t test_ssub8(int8x4_t a, int8x4_t b) {
 
 // AArch32-LABEL: @test_uadd8(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.uadd8(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.uadd8(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 uint8x4_t test_uadd8(uint8x4_t a, uint8x4_t b) {
@@ -1005,7 +1005,7 @@ uint8x4_t test_uadd8(uint8x4_t a, uint8x4_t b) {
 
 // AArch32-LABEL: @test_uhadd8(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.uhadd8(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.uhadd8(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 uint8x4_t test_uhadd8(uint8x4_t a, uint8x4_t b) {
@@ -1014,7 +1014,7 @@ uint8x4_t test_uhadd8(uint8x4_t a, uint8x4_t b) {
 
 // AArch32-LABEL: @test_uhsub8(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.uhsub8(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.uhsub8(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 uint8x4_t test_uhsub8(uint8x4_t a, uint8x4_t b) {
@@ -1023,7 +1023,7 @@ uint8x4_t test_uhsub8(uint8x4_t a, uint8x4_t b) {
 
 // AArch32-LABEL: @test_uqadd8(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.uqadd8(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.uqadd8(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 uint8x4_t test_uqadd8(uint8x4_t a, uint8x4_t b) {
@@ -1032,7 +1032,7 @@ uint8x4_t test_uqadd8(uint8x4_t a, uint8x4_t b) {
 
 // AArch32-LABEL: @test_uqsub8(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.uqsub8(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.uqsub8(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 uint8x4_t test_uqsub8(uint8x4_t a, uint8x4_t b) {
@@ -1041,7 +1041,7 @@ uint8x4_t test_uqsub8(uint8x4_t a, uint8x4_t b) {
 
 // AArch32-LABEL: @test_usub8(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.usub8(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.usub8(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 uint8x4_t test_usub8(uint8x4_t a, uint8x4_t b) {
@@ -1053,7 +1053,7 @@ uint8x4_t test_usub8(uint8x4_t a, uint8x4_t b) {
 #if __ARM_FEATURE_SIMD32
 // AArch32-LABEL: @test_usad8(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.usad8(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.usad8(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 uint32_t test_usad8(uint8x4_t a, uint8x4_t b) {
@@ -1065,7 +1065,7 @@ uint32_t test_usad8(uint8x4_t a, uint8x4_t b) {
 // AArch32-NEXT:    [[CONV:%.*]] = zext i8 [[A:%.*]] to i32
 // AArch32-NEXT:    [[CONV1:%.*]] = zext i8 [[B:%.*]] to i32
 // AArch32-NEXT:    [[CONV2:%.*]] = zext i8 [[C:%.*]] to i32
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.usada8(i32 [[CONV]], i32 [[CONV1]], i32 [[CONV2]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.usada8(i32 [[CONV]], i32 [[CONV1]], i32 [[CONV2]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 uint32_t test_usada8(uint8_t a, uint8_t b, uint8_t c) {
@@ -1077,7 +1077,7 @@ uint32_t test_usada8(uint8_t a, uint8_t b, uint8_t c) {
 #if __ARM_FEATURE_SIMD32
 // AArch32-LABEL: @test_qadd16(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.qadd16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.qadd16(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int16x2_t test_qadd16(int16x2_t a, int16x2_t b) {
@@ -1086,7 +1086,7 @@ int16x2_t test_qadd16(int16x2_t a, int16x2_t b) {
 
 // AArch32-LABEL: @test_qasx(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.qasx(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.qasx(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int16x2_t test_qasx(int16x2_t a, int16x2_t b) {
@@ -1095,7 +1095,7 @@ int16x2_t test_qasx(int16x2_t a, int16x2_t b) {
 
 // AArch32-LABEL: @test_qsax(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.qsax(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.qsax(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int16x2_t test_qsax(int16x2_t a, int16x2_t b) {
@@ -1104,7 +1104,7 @@ int16x2_t test_qsax(int16x2_t a, int16x2_t b) {
 
 // AArch32-LABEL: @test_qsub16(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.qsub16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.qsub16(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int16x2_t test_qsub16(int16x2_t a, int16x2_t b) {
@@ -1113,7 +1113,7 @@ int16x2_t test_qsub16(int16x2_t a, int16x2_t b) {
 
 // AArch32-LABEL: @test_sadd16(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.sadd16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.sadd16(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int16x2_t test_sadd16(int16x2_t a, int16x2_t b) {
@@ -1122,7 +1122,7 @@ int16x2_t test_sadd16(int16x2_t a, int16x2_t b) {
 
 // AArch32-LABEL: @test_sasx(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.sasx(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.sasx(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int16x2_t test_sasx(int16x2_t a, int16x2_t b) {
@@ -1131,7 +1131,7 @@ int16x2_t test_sasx(int16x2_t a, int16x2_t b) {
 
 // AArch32-LABEL: @test_shadd16(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.shadd16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.shadd16(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int16x2_t test_shadd16(int16x2_t a, int16x2_t b) {
@@ -1140,7 +1140,7 @@ int16x2_t test_shadd16(int16x2_t a, int16x2_t b) {
 
 // AArch32-LABEL: @test_shasx(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.shasx(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.shasx(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int16x2_t test_shasx(int16x2_t a, int16x2_t b) {
@@ -1149,7 +1149,7 @@ int16x2_t test_shasx(int16x2_t a, int16x2_t b) {
 
 // AArch32-LABEL: @test_shsax(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.shsax(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.shsax(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int16x2_t test_shsax(int16x2_t a, int16x2_t b) {
@@ -1158,7 +1158,7 @@ int16x2_t test_shsax(int16x2_t a, int16x2_t b) {
 
 // AArch32-LABEL: @test_shsub16(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.shsub16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.shsub16(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int16x2_t test_shsub16(int16x2_t a, int16x2_t b) {
@@ -1167,7 +1167,7 @@ int16x2_t test_shsub16(int16x2_t a, int16x2_t b) {
 
 // AArch32-LABEL: @test_ssax(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.ssax(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.ssax(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int16x2_t test_ssax(int16x2_t a, int16x2_t b) {
@@ -1176,7 +1176,7 @@ int16x2_t test_ssax(int16x2_t a, int16x2_t b) {
 
 // AArch32-LABEL: @test_ssub16(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.ssub16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.ssub16(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int16x2_t test_ssub16(int16x2_t a, int16x2_t b) {
@@ -1185,7 +1185,7 @@ int16x2_t test_ssub16(int16x2_t a, int16x2_t b) {
 
 // AArch32-LABEL: @test_uadd16(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.uadd16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.uadd16(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 uint16x2_t test_uadd16(uint16x2_t a, uint16x2_t b) {
@@ -1194,7 +1194,7 @@ uint16x2_t test_uadd16(uint16x2_t a, uint16x2_t b) {
 
 // AArch32-LABEL: @test_uasx(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.uasx(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.uasx(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 uint16x2_t test_uasx(uint16x2_t a, uint16x2_t b) {
@@ -1203,7 +1203,7 @@ uint16x2_t test_uasx(uint16x2_t a, uint16x2_t b) {
 
 // AArch32-LABEL: @test_uhadd16(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.uhadd16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.uhadd16(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 uint16x2_t test_uhadd16(uint16x2_t a, uint16x2_t b) {
@@ -1212,7 +1212,7 @@ uint16x2_t test_uhadd16(uint16x2_t a, uint16x2_t b) {
 
 // AArch32-LABEL: @test_uhasx(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.uhasx(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.uhasx(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 uint16x2_t test_uhasx(uint16x2_t a, uint16x2_t b) {
@@ -1221,7 +1221,7 @@ uint16x2_t test_uhasx(uint16x2_t a, uint16x2_t b) {
 
 // AArch32-LABEL: @test_uhsax(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.uhsax(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.uhsax(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 uint16x2_t test_uhsax(uint16x2_t a, uint16x2_t b) {
@@ -1230,7 +1230,7 @@ uint16x2_t test_uhsax(uint16x2_t a, uint16x2_t b) {
 
 // AArch32-LABEL: @test_uhsub16(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.uhsub16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.uhsub16(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 uint16x2_t test_uhsub16(uint16x2_t a, uint16x2_t b) {
@@ -1239,7 +1239,7 @@ uint16x2_t test_uhsub16(uint16x2_t a, uint16x2_t b) {
 
 // AArch32-LABEL: @test_uqadd16(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.uqadd16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.uqadd16(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 uint16x2_t test_uqadd16(uint16x2_t a, uint16x2_t b) {
@@ -1248,7 +1248,7 @@ uint16x2_t test_uqadd16(uint16x2_t a, uint16x2_t b) {
 
 // AArch32-LABEL: @test_uqasx(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.uqasx(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.uqasx(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 uint16x2_t test_uqasx(uint16x2_t a, uint16x2_t b) {
@@ -1257,7 +1257,7 @@ uint16x2_t test_uqasx(uint16x2_t a, uint16x2_t b) {
 
 // AArch32-LABEL: @test_uqsax(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.uqsax(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.uqsax(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 uint16x2_t test_uqsax(uint16x2_t a, uint16x2_t b) {
@@ -1266,7 +1266,7 @@ uint16x2_t test_uqsax(uint16x2_t a, uint16x2_t b) {
 
 // AArch32-LABEL: @test_uqsub16(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.uqsub16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.uqsub16(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 uint16x2_t test_uqsub16(uint16x2_t a, uint16x2_t b) {
@@ -1275,7 +1275,7 @@ uint16x2_t test_uqsub16(uint16x2_t a, uint16x2_t b) {
 
 // AArch32-LABEL: @test_usax(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.usax(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.usax(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 uint16x2_t test_usax(uint16x2_t a, uint16x2_t b) {
@@ -1284,7 +1284,7 @@ uint16x2_t test_usax(uint16x2_t a, uint16x2_t b) {
 
 // AArch32-LABEL: @test_usub16(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.usub16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.usub16(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 uint16x2_t test_usub16(uint16x2_t a, uint16x2_t b) {
@@ -1296,7 +1296,7 @@ uint16x2_t test_usub16(uint16x2_t a, uint16x2_t b) {
 #if __ARM_FEATURE_SIMD32
 // AArch32-LABEL: @test_smlad(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smlad(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smlad(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int32_t test_smlad(int16x2_t a, int16x2_t b, int32_t c) {
@@ -1305,7 +1305,7 @@ int32_t test_smlad(int16x2_t a, int16x2_t b, int32_t c) {
 
 // AArch32-LABEL: @test_smladx(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smladx(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smladx(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int32_t test_smladx(int16x2_t a, int16x2_t b, int32_t c) {
@@ -1314,7 +1314,7 @@ int32_t test_smladx(int16x2_t a, int16x2_t b, int32_t c) {
 
 // AArch32-LABEL: @test_smlald(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i64 @llvm.arm.smlald(i32 [[A:%.*]], i32 [[B:%.*]], i64 [[C:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i64 @llvm.arm.smlald(i32 [[A:%.*]], i32 [[B:%.*]], i64 [[C:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i64 [[TMP0]]
 //
 int64_t test_smlald(int16x2_t a, int16x2_t b, int64_t c) {
@@ -1323,7 +1323,7 @@ int64_t test_smlald(int16x2_t a, int16x2_t b, int64_t c) {
 
 // AArch32-LABEL: @test_smlaldx(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i64 @llvm.arm.smlaldx(i32 [[A:%.*]], i32 [[B:%.*]], i64 [[C:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i64 @llvm.arm.smlaldx(i32 [[A:%.*]], i32 [[B:%.*]], i64 [[C:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i64 [[TMP0]]
 //
 int64_t test_smlaldx(int16x2_t a, int16x2_t b, int64_t c) {
@@ -1332,7 +1332,7 @@ int64_t test_smlaldx(int16x2_t a, int16x2_t b, int64_t c) {
 
 // AArch32-LABEL: @test_smlsd(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smlsd(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smlsd(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int32_t test_smlsd(int16x2_t a, int16x2_t b, int32_t c) {
@@ -1341,7 +1341,7 @@ int32_t test_smlsd(int16x2_t a, int16x2_t b, int32_t c) {
 
 // AArch32-LABEL: @test_smlsdx(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smlsdx(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smlsdx(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int32_t test_smlsdx(int16x2_t a, int16x2_t b, int32_t c) {
@@ -1350,7 +1350,7 @@ int32_t test_smlsdx(int16x2_t a, int16x2_t b, int32_t c) {
 
 // AArch32-LABEL: @test_smlsld(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i64 @llvm.arm.smlsld(i32 [[A:%.*]], i32 [[B:%.*]], i64 [[C:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i64 @llvm.arm.smlsld(i32 [[A:%.*]], i32 [[B:%.*]], i64 [[C:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i64 [[TMP0]]
 //
 int64_t test_smlsld(int16x2_t a, int16x2_t b, int64_t c) {
@@ -1359,7 +1359,7 @@ int64_t test_smlsld(int16x2_t a, int16x2_t b, int64_t c) {
 
 // AArch32-LABEL: @test_smlsldx(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i64 @llvm.arm.smlsldx(i32 [[A:%.*]], i32 [[B:%.*]], i64 [[C:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i64 @llvm.arm.smlsldx(i32 [[A:%.*]], i32 [[B:%.*]], i64 [[C:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i64 [[TMP0]]
 //
 int64_t test_smlsldx(int16x2_t a, int16x2_t b, int64_t c) {
@@ -1368,7 +1368,7 @@ int64_t test_smlsldx(int16x2_t a, int16x2_t b, int64_t c) {
 
 // AArch32-LABEL: @test_smuad(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smuad(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smuad(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int32_t test_smuad(int16x2_t a, int16x2_t b) {
@@ -1377,7 +1377,7 @@ int32_t test_smuad(int16x2_t a, int16x2_t b) {
 
 // AArch32-LABEL: @test_smuadx(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smuadx(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smuadx(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int32_t test_smuadx(int16x2_t a, int16x2_t b) {
@@ -1386,7 +1386,7 @@ int32_t test_smuadx(int16x2_t a, int16x2_t b) {
 
 // AArch32-LABEL: @test_smusd(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smusd(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smusd(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int32_t test_smusd(int16x2_t a, int16x2_t b) {
@@ -1395,7 +1395,7 @@ int32_t test_smusd(int16x2_t a, int16x2_t b) {
 
 // AArch32-LABEL: @test_smusdx(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smusdx(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smusdx(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 int32_t test_smusdx(int16x2_t a, int16x2_t b) {
@@ -1407,13 +1407,13 @@ int32_t test_smusdx(int16x2_t a, int16x2_t b) {
 // AArch32-LABEL: @test_crc32b(
 // AArch32-NEXT:  entry:
 // AArch32-NEXT:    [[TMP0:%.*]] = zext i8 [[B:%.*]] to i32
-// AArch32-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.crc32b(i32 [[A:%.*]], i32 [[TMP0]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.crc32b(i32 [[A:%.*]], i32 [[TMP0]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP1]]
 //
 // AArch64-LABEL: @test_crc32b(
 // AArch64-NEXT:  entry:
 // AArch64-NEXT:    [[TMP0:%.*]] = zext i8 [[B:%.*]] to i32
-// AArch64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.aarch64.crc32b(i32 [[A:%.*]], i32 [[TMP0]]) [[ATTR3]]
+// AArch64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.aarch64.crc32b(i32 [[A:%.*]], i32 [[TMP0]]) #[[ATTR3]]
 // AArch64-NEXT:    ret i32 [[TMP1]]
 //
 uint32_t test_crc32b(uint32_t a, uint8_t b) {
@@ -1423,13 +1423,13 @@ uint32_t test_crc32b(uint32_t a, uint8_t b) {
 // AArch32-LABEL: @test_crc32h(
 // AArch32-NEXT:  entry:
 // AArch32-NEXT:    [[TMP0:%.*]] = zext i16 [[B:%.*]] to i32
-// AArch32-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.crc32h(i32 [[A:%.*]], i32 [[TMP0]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.crc32h(i32 [[A:%.*]], i32 [[TMP0]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP1]]
 //
 // AArch64-LABEL: @test_crc32h(
 // AArch64-NEXT:  entry:
 // AArch64-NEXT:    [[TMP0:%.*]] = zext i16 [[B:%.*]] to i32
-// AArch64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.aarch64.crc32h(i32 [[A:%.*]], i32 [[TMP0]]) [[ATTR3]]
+// AArch64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.aarch64.crc32h(i32 [[A:%.*]], i32 [[TMP0]]) #[[ATTR3]]
 // AArch64-NEXT:    ret i32 [[TMP1]]
 //
 uint32_t test_crc32h(uint32_t a, uint16_t b) {
@@ -1438,12 +1438,12 @@ uint32_t test_crc32h(uint32_t a, uint16_t b) {
 
 // AArch32-LABEL: @test_crc32w(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.crc32w(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.crc32w(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 // AArch64-LABEL: @test_crc32w(
 // AArch64-NEXT:  entry:
-// AArch64-NEXT:    [[TMP0:%.*]] = call i32 @llvm.aarch64.crc32w(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR3]]
+// AArch64-NEXT:    [[TMP0:%.*]] = call i32 @llvm.aarch64.crc32w(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR3]]
 // AArch64-NEXT:    ret i32 [[TMP0]]
 //
 uint32_t test_crc32w(uint32_t a, uint32_t b) {
@@ -1455,13 +1455,13 @@ uint32_t test_crc32w(uint32_t a, uint32_t b) {
 // AArch32-NEXT:    [[TMP0:%.*]] = trunc i64 [[B:%.*]] to i32
 // AArch32-NEXT:    [[TMP1:%.*]] = lshr i64 [[B]], 32
 // AArch32-NEXT:    [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32
-// AArch32-NEXT:    [[TMP3:%.*]] = call i32 @llvm.arm.crc32w(i32 [[A:%.*]], i32 [[TMP0]]) [[ATTR1]]
-// AArch32-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.crc32w(i32 [[TMP3]], i32 [[TMP2]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP3:%.*]] = call i32 @llvm.arm.crc32w(i32 [[A:%.*]], i32 [[TMP0]]) #[[ATTR1]]
+// AArch32-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.crc32w(i32 [[TMP3]], i32 [[TMP2]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP4]]
 //
 // AArch64-LABEL: @test_crc32d(
 // AArch64-NEXT:  entry:
-// AArch64-NEXT:    [[TMP0:%.*]] = call i32 @llvm.aarch64.crc32x(i32 [[A:%.*]], i64 [[B:%.*]]) [[ATTR3]]
+// AArch64-NEXT:    [[TMP0:%.*]] = call i32 @llvm.aarch64.crc32x(i32 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR3]]
 // AArch64-NEXT:    ret i32 [[TMP0]]
 //
 uint32_t test_crc32d(uint32_t a, uint64_t b) {
@@ -1471,13 +1471,13 @@ uint32_t test_crc32d(uint32_t a, uint64_t b) {
 // AArch32-LABEL: @test_crc32cb(
 // AArch32-NEXT:  entry:
 // AArch32-NEXT:    [[TMP0:%.*]] = zext i8 [[B:%.*]] to i32
-// AArch32-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.crc32cb(i32 [[A:%.*]], i32 [[TMP0]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.crc32cb(i32 [[A:%.*]], i32 [[TMP0]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP1]]
 //
 // AArch64-LABEL: @test_crc32cb(
 // AArch64-NEXT:  entry:
 // AArch64-NEXT:    [[TMP0:%.*]] = zext i8 [[B:%.*]] to i32
-// AArch64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.aarch64.crc32cb(i32 [[A:%.*]], i32 [[TMP0]]) [[ATTR3]]
+// AArch64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.aarch64.crc32cb(i32 [[A:%.*]], i32 [[TMP0]]) #[[ATTR3]]
 // AArch64-NEXT:    ret i32 [[TMP1]]
 //
 uint32_t test_crc32cb(uint32_t a, uint8_t b) {
@@ -1487,13 +1487,13 @@ uint32_t test_crc32cb(uint32_t a, uint8_t b) {
 // AArch32-LABEL: @test_crc32ch(
 // AArch32-NEXT:  entry:
 // AArch32-NEXT:    [[TMP0:%.*]] = zext i16 [[B:%.*]] to i32
-// AArch32-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.crc32ch(i32 [[A:%.*]], i32 [[TMP0]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.crc32ch(i32 [[A:%.*]], i32 [[TMP0]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP1]]
 //
 // AArch64-LABEL: @test_crc32ch(
 // AArch64-NEXT:  entry:
 // AArch64-NEXT:    [[TMP0:%.*]] = zext i16 [[B:%.*]] to i32
-// AArch64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.aarch64.crc32ch(i32 [[A:%.*]], i32 [[TMP0]]) [[ATTR3]]
+// AArch64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.aarch64.crc32ch(i32 [[A:%.*]], i32 [[TMP0]]) #[[ATTR3]]
 // AArch64-NEXT:    ret i32 [[TMP1]]
 //
 uint32_t test_crc32ch(uint32_t a, uint16_t b) {
@@ -1502,12 +1502,12 @@ uint32_t test_crc32ch(uint32_t a, uint16_t b) {
 
 // AArch32-LABEL: @test_crc32cw(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.crc32cw(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.crc32cw(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 // AArch64-LABEL: @test_crc32cw(
 // AArch64-NEXT:  entry:
-// AArch64-NEXT:    [[TMP0:%.*]] = call i32 @llvm.aarch64.crc32cw(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR3]]
+// AArch64-NEXT:    [[TMP0:%.*]] = call i32 @llvm.aarch64.crc32cw(i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR3]]
 // AArch64-NEXT:    ret i32 [[TMP0]]
 //
 uint32_t test_crc32cw(uint32_t a, uint32_t b) {
@@ -1519,13 +1519,13 @@ uint32_t test_crc32cw(uint32_t a, uint32_t b) {
 // AArch32-NEXT:    [[TMP0:%.*]] = trunc i64 [[B:%.*]] to i32
 // AArch32-NEXT:    [[TMP1:%.*]] = lshr i64 [[B]], 32
 // AArch32-NEXT:    [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32
-// AArch32-NEXT:    [[TMP3:%.*]] = call i32 @llvm.arm.crc32cw(i32 [[A:%.*]], i32 [[TMP0]]) [[ATTR1]]
-// AArch32-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.crc32cw(i32 [[TMP3]], i32 [[TMP2]]) [[ATTR1]]
+// AArch32-NEXT:    [[TMP3:%.*]] = call i32 @llvm.arm.crc32cw(i32 [[A:%.*]], i32 [[TMP0]]) #[[ATTR1]]
+// AArch32-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.crc32cw(i32 [[TMP3]], i32 [[TMP2]]) #[[ATTR1]]
 // AArch32-NEXT:    ret i32 [[TMP4]]
 //
 // AArch64-LABEL: @test_crc32cd(
 // AArch64-NEXT:  entry:
-// AArch64-NEXT:    [[TMP0:%.*]] = call i32 @llvm.aarch64.crc32cx(i32 [[A:%.*]], i64 [[B:%.*]]) [[ATTR3]]
+// AArch64-NEXT:    [[TMP0:%.*]] = call i32 @llvm.aarch64.crc32cx(i32 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR3]]
 // AArch64-NEXT:    ret i32 [[TMP0]]
 //
 uint32_t test_crc32cd(uint32_t a, uint64_t b) {
@@ -1535,12 +1535,12 @@ uint32_t test_crc32cd(uint32_t a, uint64_t b) {
 /* 10.1 Special register intrinsics */
 // AArch32-LABEL: @test_rsr(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.read_volatile_register.i32(metadata [[A32RSR32:!.*]])
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.read_volatile_register.i32(metadata [[META9:![0-9]+]])
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
 // AArch64-LABEL: @test_rsr(
 // AArch64-NEXT:  entry:
-// AArch64-NEXT:    [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[A64RSR:!.*]])
+// AArch64-NEXT:    [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[META8:![0-9]+]])
 // AArch64-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
 // AArch64-NEXT:    ret i32 [[TMP1]]
 //
@@ -1554,12 +1554,12 @@ uint32_t test_rsr() {
 
 // AArch32-LABEL: @test_rsr64(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[A32RSR64:!.*]])
+// AArch32-NEXT:    [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[META10:![0-9]+]])
 // AArch32-NEXT:    ret i64 [[TMP0]]
 //
 // AArch64-LABEL: @test_rsr64(
 // AArch64-NEXT:  entry:
-// AArch64-NEXT:    [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[A64RSR]])
+// AArch64-NEXT:    [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[META8]])
 // AArch64-NEXT:    ret i64 [[TMP0]]
 //
 uint64_t test_rsr64() {
@@ -1572,13 +1572,13 @@ uint64_t test_rsr64() {
 
 // AArch32-LABEL: @test_rsrp(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.read_volatile_register.i32(metadata [[A32SYSREG:!.*]])
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.read_volatile_register.i32(metadata [[META11:![0-9]+]])
 // AArch32-NEXT:    [[TMP1:%.*]] = inttoptr i32 [[TMP0]] to i8*
 // AArch32-NEXT:    ret i8* [[TMP1]]
 //
 // AArch64-LABEL: @test_rsrp(
 // AArch64-NEXT:  entry:
-// AArch64-NEXT:    [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[A64SYSREG:!.*]])
+// AArch64-NEXT:    [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[META9:![0-9]+]])
 // AArch64-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to i8*
 // AArch64-NEXT:    ret i8* [[TMP1]]
 //
@@ -1588,13 +1588,13 @@ void *test_rsrp() {
 
 // AArch32-LABEL: @test_wsr(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    call void @llvm.write_register.i32(metadata [[A32RSR32]], i32 [[V:%.*]])
+// AArch32-NEXT:    call void @llvm.write_register.i32(metadata [[META9]], i32 [[V:%.*]])
 // AArch32-NEXT:    ret void
 //
 // AArch64-LABEL: @test_wsr(
 // AArch64-NEXT:  entry:
 // AArch64-NEXT:    [[TMP0:%.*]] = zext i32 [[V:%.*]] to i64
-// AArch64-NEXT:    call void @llvm.write_register.i64(metadata [[A64RSR]], i64 [[TMP0]])
+// AArch64-NEXT:    call void @llvm.write_register.i64(metadata [[META8]], i64 [[TMP0]])
 // AArch64-NEXT:    ret void
 //
 void test_wsr(uint32_t v) {
@@ -1607,12 +1607,12 @@ void test_wsr(uint32_t v) {
 
 // AArch32-LABEL: @test_wsr64(
 // AArch32-NEXT:  entry:
-// AArch32-NEXT:    call void @llvm.write_register.i64(metadata [[A32RSR64]], i64 [[V:%.*]])
+// AArch32-NEXT:    call void @llvm.write_register.i64(metadata [[META10]], i64 [[V:%.*]])
 // AArch32-NEXT:    ret void
 //
 // AArch64-LABEL: @test_wsr64(
 // AArch64-NEXT:  entry:
-// AArch64-NEXT:    call void @llvm.write_register.i64(metadata [[A64RSR]], i64 [[V:%.*]])
+// AArch64-NEXT:    call void @llvm.write_register.i64(metadata [[META8]], i64 [[V:%.*]])
 // AArch64-NEXT:    ret void
 //
 void test_wsr64(uint64_t v) {
@@ -1626,13 +1626,13 @@ void test_wsr64(uint64_t v) {
 // AArch32-LABEL: @test_wsrp(
 // AArch32-NEXT:  entry:
 // AArch32-NEXT:    [[TMP0:%.*]] = ptrtoint i8* [[V:%.*]] to i32
-// AArch32-NEXT:    call void @llvm.write_register.i32(metadata [[A32SYSREG]], i32 [[TMP0]])
+// AArch32-NEXT:    call void @llvm.write_register.i32(metadata [[META11]], i32 [[TMP0]])
 // AArch32-NEXT:    ret void
 //
 // AArch64-LABEL: @test_wsrp(
 // AArch64-NEXT:  entry:
 // AArch64-NEXT:    [[TMP0:%.*]] = ptrtoint i8* [[V:%.*]] to i64
-// AArch64-NEXT:    call void @llvm.write_register.i64(metadata [[A64SYSREG]], i64 [[TMP0]])
+// AArch64-NEXT:    call void @llvm.write_register.i64(metadata [[META9]], i64 [[TMP0]])
 // AArch64-NEXT:    ret void
 //
 void test_wsrp(void *v) {
@@ -1642,7 +1642,7 @@ void test_wsrp(void *v) {
 // AArch32-LABEL: @test_rsrf(
 // AArch32-NEXT:  entry:
 // AArch32-NEXT:    [[REF_TMP:%.*]] = alloca i32, align 4
-// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.read_volatile_register.i32(metadata [[A32RSR32]])
+// AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.read_volatile_register.i32(metadata [[META9]])
 // AArch32-NEXT:    store i32 [[TMP0]], i32* [[REF_TMP]], align 4
 // AArch32-NEXT:    [[TMP1:%.*]] = bitcast i32* [[REF_TMP]] to float*
 // AArch32-NEXT:    [[TMP2:%.*]] = load float, float* [[TMP1]], align 4
@@ -1651,7 +1651,7 @@ void test_wsrp(void *v) {
 // AArch64-LABEL: @test_rsrf(
 // AArch64-NEXT:  entry:
 // AArch64-NEXT:    [[REF_TMP:%.*]] = alloca i32, align 4
-// AArch64-NEXT:    [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[A64RSR]])
+// AArch64-NEXT:    [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[META8]])
 // AArch64-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
 // AArch64-NEXT:    store i32 [[TMP1]], i32* [[REF_TMP]], align 4
 // AArch64-NEXT:    [[TMP2:%.*]] = bitcast i32* [[REF_TMP]] to float*
@@ -1669,7 +1669,7 @@ float test_rsrf() {
 // AArch32-LABEL: @test_rsrf64(
 // AArch32-NEXT:  entry:
 // AArch32-NEXT:    [[REF_TMP:%.*]] = alloca i64, align 8
-// AArch32-NEXT:    [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[A32RSR64]])
+// AArch32-NEXT:    [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[META10]])
 // AArch32-NEXT:    store i64 [[TMP0]], i64* [[REF_TMP]], align 8
 // AArch32-NEXT:    [[TMP1:%.*]] = bitcast i64* [[REF_TMP]] to double*
 // AArch32-NEXT:    [[TMP2:%.*]] = load double, double* [[TMP1]], align 8
@@ -1678,7 +1678,7 @@ float test_rsrf() {
 // AArch64-LABEL: @test_rsrf64(
 // AArch64-NEXT:  entry:
 // AArch64-NEXT:    [[REF_TMP:%.*]] = alloca i64, align 8
-// AArch64-NEXT:    [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[A64RSR]])
+// AArch64-NEXT:    [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[META8]])
 // AArch64-NEXT:    store i64 [[TMP0]], i64* [[REF_TMP]], align 8
 // AArch64-NEXT:    [[TMP1:%.*]] = bitcast i64* [[REF_TMP]] to double*
 // AArch64-NEXT:    [[TMP2:%.*]] = load double, double* [[TMP1]], align 8
@@ -1698,7 +1698,7 @@ double test_rsrf64() {
 // AArch32-NEXT:    store float [[V:%.*]], float* [[V_ADDR]], align 4
 // AArch32-NEXT:    [[TMP0:%.*]] = bitcast float* [[V_ADDR]] to i32*
 // AArch32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
-// AArch32-NEXT:    call void @llvm.write_register.i32(metadata [[A32RSR32]], i32 [[TMP1]])
+// AArch32-NEXT:    call void @llvm.write_register.i32(metadata [[META9]], i32 [[TMP1]])
 // AArch32-NEXT:    ret void
 //
 // AArch64-LABEL: @test_wsrf(
@@ -1708,7 +1708,7 @@ double test_rsrf64() {
 // AArch64-NEXT:    [[TMP0:%.*]] = bitcast float* [[V_ADDR]] to i32*
 // AArch64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
 // AArch64-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
-// AArch64-NEXT:    call void @llvm.write_register.i64(metadata [[A64RSR]], i64 [[TMP2]])
+// AArch64-NEXT:    call void @llvm.write_register.i64(metadata [[META8]], i64 [[TMP2]])
 // AArch64-NEXT:    ret void
 //
 void test_wsrf(float v) {
@@ -1725,7 +1725,7 @@ void test_wsrf(float v) {
 // AArch32-NEXT:    store double [[V:%.*]], double* [[V_ADDR]], align 8
 // AArch32-NEXT:    [[TMP0:%.*]] = bitcast double* [[V_ADDR]] to i64*
 // AArch32-NEXT:    [[TMP1:%.*]] = load i64, i64* [[TMP0]], align 8
-// AArch32-NEXT:    call void @llvm.write_register.i64(metadata [[A32RSR64]], i64 [[TMP1]])
+// AArch32-NEXT:    call void @llvm.write_register.i64(metadata [[META10]], i64 [[TMP1]])
 // AArch32-NEXT:    ret void
 //
 // AArch64-LABEL: @test_wsrf64(
@@ -1734,7 +1734,7 @@ void test_wsrf(float v) {
 // AArch64-NEXT:    store double [[V:%.*]], double* [[V_ADDR]], align 8
 // AArch64-NEXT:    [[TMP0:%.*]] = bitcast double* [[V_ADDR]] to i64*
 // AArch64-NEXT:    [[TMP1:%.*]] = load i64, i64* [[TMP0]], align 8
-// AArch64-NEXT:    call void @llvm.write_register.i64(metadata [[A64RSR]], i64 [[TMP1]])
+// AArch64-NEXT:    call void @llvm.write_register.i64(metadata [[META8]], i64 [[TMP1]])
 // AArch64-NEXT:    ret void
 //
 void test_wsrf64(double v) {
@@ -1748,7 +1748,7 @@ void test_wsrf64(double v) {
 #ifdef __ARM_64BIT_STATE
 // AArch6483-LABEL: @test_jcvt(
 // AArch6483-NEXT:  entry:
-// AArch6483-NEXT:    [[TMP0:%.*]] = call i32 @llvm.aarch64.fjcvtzs(double [[V:%.*]]) [[ATTR3:#.*]]
+// AArch6483-NEXT:    [[TMP0:%.*]] = call i32 @llvm.aarch64.fjcvtzs(double [[V:%.*]]) #[[ATTR3]]
 // AArch6483-NEXT:    ret i32 [[TMP0]]
 //
 int32_t test_jcvt(double v) {
@@ -1759,36 +1759,31 @@ int32_t test_jcvt(double v) {
 
 #if __ARM_64BIT_STATE && defined(__ARM_FEATURE_RNG)
 
-// AArch6485-LABEL: @test_rndr(
-// AArch6485-NEXT:  entry:
-// AArch6485-NEXT:    [[TMP0:%.*]] = call { i64, i1 } @llvm.aarch64.rndr() [[ATTR3:#.*]]
-// AArch6485-NEXT:    [[TMP1:%.*]] = extractvalue { i64, i1 } [[TMP0]], 0
-// AArch6485-NEXT:    [[TMP2:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
-// AArch6485-NEXT:    store i64 [[TMP1]], i64* [[__ADDR:%.*]], align 8
-// AArch6485-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
-// AArch6485-NEXT:    ret i32 [[TMP3]]
+// AArch64-LABEL: @test_rndr(
+// AArch64-NEXT:  entry:
+// AArch64-NEXT:    [[TMP0:%.*]] = call { i64, i1 } @llvm.aarch64.rndr() #[[ATTR3]]
+// AArch64-NEXT:    [[TMP1:%.*]] = extractvalue { i64, i1 } [[TMP0]], 0
+// AArch64-NEXT:    [[TMP2:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
+// AArch64-NEXT:    store i64 [[TMP1]], i64* [[__ADDR:%.*]], align 8
+// AArch64-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+// AArch64-NEXT:    ret i32 [[TMP3]]
 //
 int test_rndr(uint64_t *__addr) {
   return __rndr(__addr);
 }
 
-// AArch6485-LABEL: @test_rndrrs(
-// AArch6485-NEXT:  entry:
-// AArch6485-NEXT:    [[TMP0:%.*]] = call { i64, i1 } @llvm.aarch64.rndrrs() [[ATTR3:#.*]]
-// AArch6485-NEXT:    [[TMP1:%.*]] = extractvalue { i64, i1 } [[TMP0]], 0
-// AArch6485-NEXT:    [[TMP2:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
-// AArch6485-NEXT:    store i64 [[TMP1]], i64* [[__ADDR:%.*]], align 8
-// AArch6485-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
-// AArch6485-NEXT:    ret i32 [[TMP3]]
+// AArch64-LABEL: @test_rndrrs(
+// AArch64-NEXT:  entry:
+// AArch64-NEXT:    [[TMP0:%.*]] = call { i64, i1 } @llvm.aarch64.rndrrs() #[[ATTR3]]
+// AArch64-NEXT:    [[TMP1:%.*]] = extractvalue { i64, i1 } [[TMP0]], 0
+// AArch64-NEXT:    [[TMP2:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
+// AArch64-NEXT:    store i64 [[TMP1]], i64* [[__ADDR:%.*]], align 8
+// AArch64-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+// AArch64-NEXT:    ret i32 [[TMP3]]
 //
 int test_rndrrs(uint64_t *__addr) {
   return __rndrrs(__addr);
 }
 #endif
 
-// AArch32: [[A32RSR32]] = !{!"cp1:2:c3:c4:5"}
-// AArch32: [[A32RSR64]] = !{!"cp1:2:c3"}
-// AArch32: [[A32SYSREG]] = !{!"sysreg"}
 
-// AArch64: [[A64RSR]] = !{!"1:2:3:4:5"}
-// AArch64: [[A64SYSREG]] = !{!"sysreg"}

diff --git a/clang/test/CodeGen/memcpy-inline-builtin.c b/clang/test/CodeGen/memcpy-inline-builtin.c
index 4873bdb5a566..050f9a8ba871 100644
--- a/clang/test/CodeGen/memcpy-inline-builtin.c
+++ b/clang/test/CodeGen/memcpy-inline-builtin.c
@@ -32,11 +32,11 @@ AVAILABLE_EXTERNALLY void *memcpy(void *a, const void *b, size_t c) {
 // CHECK-NEXT:    store i8* [[TMP0]], i8** [[A_ADDR_I]], align 8
 // CHECK-NEXT:    store i8* [[TMP1]], i8** [[B_ADDR_I]], align 8
 // CHECK-NEXT:    store i64 [[TMP2]], i64* [[C_ADDR_I]], align 8
-// CHECK-NEXT:    call void asm sideeffect "# memcpy.inline marker", "~{dirflag},~{fpsr},~{flags}"() #[[ATTR4:[0-9]+]], !srcloc !2
+// CHECK-NEXT:    call void asm sideeffect "# memcpy.inline marker", "~{dirflag},~{fpsr},~{flags}"() #[[ATTR3:[0-9]+]], !srcloc !2
 // CHECK-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[A_ADDR_I]], align 8
 // CHECK-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[B_ADDR_I]], align 8
 // CHECK-NEXT:    [[TMP5:%.*]] = load i64, i64* [[C_ADDR_I]], align 8
-// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[TMP3]], i8* align 1 [[TMP4]], i64 [[TMP5]], i1 false) #[[ATTR4]]
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[TMP3]], i8* align 1 [[TMP4]], i64 [[TMP5]], i1 false) #[[ATTR3]]
 // CHECK-NEXT:    ret i8* [[TMP3]]
 //
 void *foo(void *a, const void *b, size_t c) {

diff --git a/clang/test/OpenMP/master_taskloop_simd_in_reduction_codegen.cpp b/clang/test/OpenMP/master_taskloop_simd_in_reduction_codegen.cpp
index a07036bf833c..b73bdf48b4ee 100644
--- a/clang/test/OpenMP/master_taskloop_simd_in_reduction_codegen.cpp
+++ b/clang/test/OpenMP/master_taskloop_simd_in_reduction_codegen.cpp
@@ -649,25 +649,25 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    store i32 [[CONV3_I]], i32* [[DOTOMP_IV_I]], align 4, !noalias !14
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND_I:%.*]]
 // CHECK1:       omp.inner.for.cond.i:
-// CHECK1-NEXT:    [[TMP46:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group !15
+// CHECK1-NEXT:    [[TMP46:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15:![0-9]+]]
 // CHECK1-NEXT:    [[CONV4_I:%.*]] = sext i32 [[TMP46]] to i64
-// CHECK1-NEXT:    [[TMP47:%.*]] = load i64, i64* [[DOTUB__ADDR_I]], align 8, !noalias !14, !llvm.access.group !15
+// CHECK1-NEXT:    [[TMP47:%.*]] = load i64, i64* [[DOTUB__ADDR_I]], align 8, !noalias !14, !llvm.access.group [[ACC_GRP15]]
 // CHECK1-NEXT:    [[CMP_I:%.*]] = icmp ule i64 [[CONV4_I]], [[TMP47]]
 // CHECK1-NEXT:    br i1 [[CMP_I]], label [[OMP_INNER_FOR_BODY_I:%.*]], label [[DOTOMP_OUTLINED__9_EXIT:%.*]]
 // CHECK1:       omp.inner.for.body.i:
-// CHECK1-NEXT:    [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group !15
-// CHECK1-NEXT:    store i32 [[TMP48]], i32* [[I_I]], align 4, !noalias !14, !llvm.access.group !15
-// CHECK1-NEXT:    [[TMP49:%.*]] = load i32, i32* [[CONV_I]], align 4, !llvm.access.group !15
+// CHECK1-NEXT:    [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15]]
+// CHECK1-NEXT:    store i32 [[TMP48]], i32* [[I_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15]]
+// CHECK1-NEXT:    [[TMP49:%.*]] = load i32, i32* [[CONV_I]], align 4, !llvm.access.group [[ACC_GRP15]]
 // CHECK1-NEXT:    [[IDXPROM_I:%.*]] = sext i32 [[TMP49]] to i64
 // CHECK1-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds i16, i16* [[CONV2_I]], i64 [[IDXPROM_I]]
-// CHECK1-NEXT:    [[TMP50:%.*]] = load i16, i16* [[ARRAYIDX_I]], align 2, !llvm.access.group !15
+// CHECK1-NEXT:    [[TMP50:%.*]] = load i16, i16* [[ARRAYIDX_I]], align 2, !llvm.access.group [[ACC_GRP15]]
 // CHECK1-NEXT:    [[CONV5_I:%.*]] = sext i16 [[TMP50]] to i32
-// CHECK1-NEXT:    [[TMP51:%.*]] = load i32, i32* [[CONV_I]], align 4, !llvm.access.group !15
+// CHECK1-NEXT:    [[TMP51:%.*]] = load i32, i32* [[CONV_I]], align 4, !llvm.access.group [[ACC_GRP15]]
 // CHECK1-NEXT:    [[ADD6_I:%.*]] = add nsw i32 [[TMP51]], [[CONV5_I]]
-// CHECK1-NEXT:    store i32 [[ADD6_I]], i32* [[CONV_I]], align 4, !llvm.access.group !15
-// CHECK1-NEXT:    [[TMP52:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group !15
+// CHECK1-NEXT:    store i32 [[ADD6_I]], i32* [[CONV_I]], align 4, !llvm.access.group [[ACC_GRP15]]
+// CHECK1-NEXT:    [[TMP52:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15]]
 // CHECK1-NEXT:    [[ADD7_I:%.*]] = add nsw i32 [[TMP52]], 1
-// CHECK1-NEXT:    store i32 [[ADD7_I]], i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group !15
+// CHECK1-NEXT:    store i32 [[ADD7_I]], i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15]]
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND_I]], !llvm.loop [[LOOP16:![0-9]+]]
 // CHECK1:       .omp_outlined..9.exit:
 // CHECK1-NEXT:    ret i32 0
@@ -761,31 +761,31 @@ int main(int argc, char **argv) {
 // CHECK3-NEXT:    store i32 [[CONV]], i32* [[DOTOMP_IV]], align 4
 // CHECK3-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
 // CHECK3:       omp.inner.for.cond:
-// CHECK3-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group !2
+// CHECK3-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP2:![0-9]+]]
 // CHECK3-NEXT:    [[CONV2:%.*]] = sext i32 [[TMP4]] to i64
-// CHECK3-NEXT:    [[TMP5:%.*]] = load i64, i64* [[DOTOMP_UB]], align 8, !llvm.access.group !2
+// CHECK3-NEXT:    [[TMP5:%.*]] = load i64, i64* [[DOTOMP_UB]], align 8, !llvm.access.group [[ACC_GRP2]]
 // CHECK3-NEXT:    [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP5]]
 // CHECK3-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
 // CHECK3:       omp.inner.for.body:
-// CHECK3-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group !2
+// CHECK3-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP2]]
 // CHECK3-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
 // CHECK3-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
-// CHECK3-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group !2
-// CHECK3-NEXT:    [[TMP7:%.*]] = load i32, i32* [[A]], align 4, !llvm.access.group !2
+// CHECK3-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP2]]
+// CHECK3-NEXT:    [[TMP7:%.*]] = load i32, i32* [[A]], align 4, !llvm.access.group [[ACC_GRP2]]
 // CHECK3-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP7]] to i64
 // CHECK3-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[VLA]], i64 [[IDXPROM]]
-// CHECK3-NEXT:    [[TMP8:%.*]] = load i16, i16* [[ARRAYIDX]], align 2, !llvm.access.group !2
+// CHECK3-NEXT:    [[TMP8:%.*]] = load i16, i16* [[ARRAYIDX]], align 2, !llvm.access.group [[ACC_GRP2]]
 // CHECK3-NEXT:    [[CONV3:%.*]] = sext i16 [[TMP8]] to i32
-// CHECK3-NEXT:    [[TMP9:%.*]] = load i32, i32* [[A]], align 4, !llvm.access.group !2
+// CHECK3-NEXT:    [[TMP9:%.*]] = load i32, i32* [[A]], align 4, !llvm.access.group [[ACC_GRP2]]
 // CHECK3-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP9]], [[CONV3]]
-// CHECK3-NEXT:    store i32 [[ADD4]], i32* [[A]], align 4, !llvm.access.group !2
+// CHECK3-NEXT:    store i32 [[ADD4]], i32* [[A]], align 4, !llvm.access.group [[ACC_GRP2]]
 // CHECK3-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
 // CHECK3:       omp.body.continue:
 // CHECK3-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
 // CHECK3:       omp.inner.for.inc:
-// CHECK3-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group !2
+// CHECK3-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP2]]
 // CHECK3-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP10]], 1
-// CHECK3-NEXT:    store i32 [[ADD5]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group !2
+// CHECK3-NEXT:    store i32 [[ADD5]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP2]]
 // CHECK3-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]]
 // CHECK3:       omp.inner.for.end:
 // CHECK3-NEXT:    store i32 5, i32* [[I]], align 4

diff --git a/clang/test/OpenMP/parallel_master_reduction_task_codegen.cpp b/clang/test/OpenMP/parallel_master_reduction_task_codegen.cpp
index cecdf44337fb..4b0d8d756c93 100644
--- a/clang/test/OpenMP/parallel_master_reduction_task_codegen.cpp
+++ b/clang/test/OpenMP/parallel_master_reduction_task_codegen.cpp
@@ -449,7 +449,7 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP36:%.*]] = sdiv exact i64 [[TMP35]], ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64)
 // CHECK1-NEXT:    [[TMP37:%.*]] = add nuw i64 [[TMP36]], 1
 // CHECK1-NEXT:    [[TMP38:%.*]] = mul nuw i64 [[TMP37]], ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64)
-// CHECK1-NEXT:    store i64 [[TMP37]], i64* @{{reduction_size[.].+[.]}}, align 8
+// CHECK1-NEXT:    store i64 [[TMP37]], i64* @{{reduction_size[.].+[.]}}, align 8, !noalias !12
 // CHECK1-NEXT:    [[TMP39:%.*]] = load i8*, i8** [[TMP16]], align 8
 // CHECK1-NEXT:    [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]]) #[[ATTR5]]
 // CHECK1-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2

diff --git a/clang/test/OpenMP/sections_reduction_task_codegen.cpp b/clang/test/OpenMP/sections_reduction_task_codegen.cpp
index 93b0d42c4cd1..658456951d22 100644
--- a/clang/test/OpenMP/sections_reduction_task_codegen.cpp
+++ b/clang/test/OpenMP/sections_reduction_task_codegen.cpp
@@ -487,7 +487,7 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP36:%.*]] = sdiv exact i64 [[TMP35]], ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64)
 // CHECK1-NEXT:    [[TMP37:%.*]] = add nuw i64 [[TMP36]], 1
 // CHECK1-NEXT:    [[TMP38:%.*]] = mul nuw i64 [[TMP37]], ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64)
-// CHECK1-NEXT:    store i64 [[TMP37]], i64* @{{reduction_size[.].+[.]}}, align 8
+// CHECK1-NEXT:    store i64 [[TMP37]], i64* @{{reduction_size[.].+[.]}}, align 8, !noalias !12
 // CHECK1-NEXT:    [[TMP39:%.*]] = load i8*, i8** [[TMP16]], align 8
 // CHECK1-NEXT:    [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]]) #[[ATTR5]]
 // CHECK1-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2

diff --git a/clang/test/OpenMP/target_parallel_reduction_task_codegen.cpp b/clang/test/OpenMP/target_parallel_reduction_task_codegen.cpp
index a6aef3533b99..b4773114c3bd 100644
--- a/clang/test/OpenMP/target_parallel_reduction_task_codegen.cpp
+++ b/clang/test/OpenMP/target_parallel_reduction_task_codegen.cpp
@@ -453,7 +453,7 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP36:%.*]] = sdiv exact i64 [[TMP35]], ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64)
 // CHECK1-NEXT:    [[TMP37:%.*]] = add nuw i64 [[TMP36]], 1
 // CHECK1-NEXT:    [[TMP38:%.*]] = mul nuw i64 [[TMP37]], ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64)
-// CHECK1-NEXT:    store i64 [[TMP37]], i64* @{{reduction_size[.].+[.]}}, align 8
+// CHECK1-NEXT:    store i64 [[TMP37]], i64* @{{reduction_size[.].+[.]}}, align 8, !noalias !12
 // CHECK1-NEXT:    [[TMP39:%.*]] = load i8*, i8** [[TMP16]], align 8
 // CHECK1-NEXT:    [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]]) #[[ATTR5]]
 // CHECK1-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2

diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_task_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_task_codegen.cpp
index 61a1cf505942..2bcebc0f24e7 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_task_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_task_codegen.cpp
@@ -853,7 +853,7 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP36:%.*]] = sdiv exact i64 [[TMP35]], ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64)
 // CHECK1-NEXT:    [[TMP37:%.*]] = add nuw i64 [[TMP36]], 1
 // CHECK1-NEXT:    [[TMP38:%.*]] = mul nuw i64 [[TMP37]], ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64)
-// CHECK1-NEXT:    store i64 [[TMP37]], i64* @{{reduction_size[.].+[.]}}, align 8
+// CHECK1-NEXT:    store i64 [[TMP37]], i64* @{{reduction_size[.].+[.]}}, align 8, !noalias !12
 // CHECK1-NEXT:    [[TMP39:%.*]] = load i8*, i8** [[TMP16]], align 8
 // CHECK1-NEXT:    [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]]) #[[ATTR5]]
 // CHECK1-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2

diff --git a/clang/test/OpenMP/taskloop_simd_in_reduction_codegen.cpp b/clang/test/OpenMP/taskloop_simd_in_reduction_codegen.cpp
index db8278eb3721..7dc0fc9774ae 100644
--- a/clang/test/OpenMP/taskloop_simd_in_reduction_codegen.cpp
+++ b/clang/test/OpenMP/taskloop_simd_in_reduction_codegen.cpp
@@ -642,25 +642,25 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    store i32 [[CONV3_I]], i32* [[DOTOMP_IV_I]], align 4, !noalias !14
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND_I:%.*]]
 // CHECK1:       omp.inner.for.cond.i:
-// CHECK1-NEXT:    [[TMP46:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group !15
+// CHECK1-NEXT:    [[TMP46:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15:![0-9]+]]
 // CHECK1-NEXT:    [[CONV4_I:%.*]] = sext i32 [[TMP46]] to i64
-// CHECK1-NEXT:    [[TMP47:%.*]] = load i64, i64* [[DOTUB__ADDR_I]], align 8, !noalias !14, !llvm.access.group !15
+// CHECK1-NEXT:    [[TMP47:%.*]] = load i64, i64* [[DOTUB__ADDR_I]], align 8, !noalias !14, !llvm.access.group [[ACC_GRP15]]
 // CHECK1-NEXT:    [[CMP_I:%.*]] = icmp ule i64 [[CONV4_I]], [[TMP47]]
 // CHECK1-NEXT:    br i1 [[CMP_I]], label [[OMP_INNER_FOR_BODY_I:%.*]], label [[DOTOMP_OUTLINED__9_EXIT:%.*]]
 // CHECK1:       omp.inner.for.body.i:
-// CHECK1-NEXT:    [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group !15
-// CHECK1-NEXT:    store i32 [[TMP48]], i32* [[I_I]], align 4, !noalias !14, !llvm.access.group !15
-// CHECK1-NEXT:    [[TMP49:%.*]] = load i32, i32* [[CONV_I]], align 4, !llvm.access.group !15
+// CHECK1-NEXT:    [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15]]
+// CHECK1-NEXT:    store i32 [[TMP48]], i32* [[I_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15]]
+// CHECK1-NEXT:    [[TMP49:%.*]] = load i32, i32* [[CONV_I]], align 4, !llvm.access.group [[ACC_GRP15]]
 // CHECK1-NEXT:    [[IDXPROM_I:%.*]] = sext i32 [[TMP49]] to i64
 // CHECK1-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds i16, i16* [[CONV2_I]], i64 [[IDXPROM_I]]
-// CHECK1-NEXT:    [[TMP50:%.*]] = load i16, i16* [[ARRAYIDX_I]], align 2, !llvm.access.group !15
+// CHECK1-NEXT:    [[TMP50:%.*]] = load i16, i16* [[ARRAYIDX_I]], align 2, !llvm.access.group [[ACC_GRP15]]
 // CHECK1-NEXT:    [[CONV5_I:%.*]] = sext i16 [[TMP50]] to i32
-// CHECK1-NEXT:    [[TMP51:%.*]] = load i32, i32* [[CONV_I]], align 4, !llvm.access.group !15
+// CHECK1-NEXT:    [[TMP51:%.*]] = load i32, i32* [[CONV_I]], align 4, !llvm.access.group [[ACC_GRP15]]
 // CHECK1-NEXT:    [[ADD6_I:%.*]] = add nsw i32 [[TMP51]], [[CONV5_I]]
-// CHECK1-NEXT:    store i32 [[ADD6_I]], i32* [[CONV_I]], align 4, !llvm.access.group !15
-// CHECK1-NEXT:    [[TMP52:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group !15
+// CHECK1-NEXT:    store i32 [[ADD6_I]], i32* [[CONV_I]], align 4, !llvm.access.group [[ACC_GRP15]]
+// CHECK1-NEXT:    [[TMP52:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15]]
 // CHECK1-NEXT:    [[ADD7_I:%.*]] = add nsw i32 [[TMP52]], 1
-// CHECK1-NEXT:    store i32 [[ADD7_I]], i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group !15
+// CHECK1-NEXT:    store i32 [[ADD7_I]], i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15]]
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND_I]], !llvm.loop [[LOOP16:![0-9]+]]
 // CHECK1:       .omp_outlined..9.exit:
 // CHECK1-NEXT:    ret i32 0
@@ -754,31 +754,31 @@ int main(int argc, char **argv) {
 // CHECK3-NEXT:    store i32 [[CONV]], i32* [[DOTOMP_IV]], align 4
 // CHECK3-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
 // CHECK3:       omp.inner.for.cond:
-// CHECK3-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group !2
+// CHECK3-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP2:![0-9]+]]
 // CHECK3-NEXT:    [[CONV2:%.*]] = sext i32 [[TMP4]] to i64
-// CHECK3-NEXT:    [[TMP5:%.*]] = load i64, i64* [[DOTOMP_UB]], align 8, !llvm.access.group !2
+// CHECK3-NEXT:    [[TMP5:%.*]] = load i64, i64* [[DOTOMP_UB]], align 8, !llvm.access.group [[ACC_GRP2]]
 // CHECK3-NEXT:    [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP5]]
 // CHECK3-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
 // CHECK3:       omp.inner.for.body:
-// CHECK3-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group !2
+// CHECK3-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP2]]
 // CHECK3-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
 // CHECK3-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
-// CHECK3-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group !2
-// CHECK3-NEXT:    [[TMP7:%.*]] = load i32, i32* [[A]], align 4, !llvm.access.group !2
+// CHECK3-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP2]]
+// CHECK3-NEXT:    [[TMP7:%.*]] = load i32, i32* [[A]], align 4, !llvm.access.group [[ACC_GRP2]]
 // CHECK3-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP7]] to i64
 // CHECK3-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[VLA]], i64 [[IDXPROM]]
-// CHECK3-NEXT:    [[TMP8:%.*]] = load i16, i16* [[ARRAYIDX]], align 2, !llvm.access.group !2
+// CHECK3-NEXT:    [[TMP8:%.*]] = load i16, i16* [[ARRAYIDX]], align 2, !llvm.access.group [[ACC_GRP2]]
 // CHECK3-NEXT:    [[CONV3:%.*]] = sext i16 [[TMP8]] to i32
-// CHECK3-NEXT:    [[TMP9:%.*]] = load i32, i32* [[A]], align 4, !llvm.access.group !2
+// CHECK3-NEXT:    [[TMP9:%.*]] = load i32, i32* [[A]], align 4, !llvm.access.group [[ACC_GRP2]]
 // CHECK3-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP9]], [[CONV3]]
-// CHECK3-NEXT:    store i32 [[ADD4]], i32* [[A]], align 4, !llvm.access.group !2
+// CHECK3-NEXT:    store i32 [[ADD4]], i32* [[A]], align 4, !llvm.access.group [[ACC_GRP2]]
 // CHECK3-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
 // CHECK3:       omp.body.continue:
 // CHECK3-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
 // CHECK3:       omp.inner.for.inc:
-// CHECK3-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group !2
+// CHECK3-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP2]]
 // CHECK3-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP10]], 1
-// CHECK3-NEXT:    store i32 [[ADD5]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group !2
+// CHECK3-NEXT:    store i32 [[ADD5]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP2]]
 // CHECK3-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]]
 // CHECK3:       omp.inner.for.end:
 // CHECK3-NEXT:    store i32 5, i32* [[I]], align 4
